Skip to content

Conversation

@tschuett
Copy link

into zext x

LegalizerHelper has two padding strategies: undef or zero.

see LegalizerHelper:273
see LegalizerHelper:315

This PR is about zero sugar and Coke Zero.

; CHECK-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES %a(s32), [[C]](s32)

Please continue padding merge values.

// %bits_8_15:(s8) = G_CONSTANT i8 0
// %0:(s16) = G_MERGE_VALUES %bits_0_7:(s8), %bits_8_15:(s8)

%bits_8_15 is defined as zero. As an optimization, we pick zext.

// %0:_(s16) = G_ZEXT %bits_0_7:(s8)

The upper bits of %0 are zero and the lower bits come from %bits_0_7.

into zext x

LegalizerHelper has two padding strategies: undef or zero.

see LegalizerHelper:273
see LegalizerHelper:315

This PR is about zero sugar and Coke Zero.

 ; CHECK-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES %a(s32), [[C]](s32)

Please continue padding merge values.

// %bits_8_15:(s8) = G_CONSTANT i8 0
// %0:(s16) = G_MERGE_VALUES %bits_0_7:(s8), %bits_8_15:(s8)

%bits_8_15 is defined as zero. As an optimization, we pick zext.

// %0:_(s16) = G_ZEXT %bits_0_7:(s8)

The upper bits of %0 are zero and the lower bits come from %bits_0_7.
@llvmbot
Copy link
Member

llvmbot commented Nov 14, 2024

@llvm/pr-subscribers-backend-amdgpu

@llvm/pr-subscribers-backend-aarch64

Author: Thorsten Schütt (tschuett)

Changes

into zext x

LegalizerHelper has two padding strategies: undef or zero.

see LegalizerHelper:273
see LegalizerHelper:315

This PR is about zero sugar and Coke Zero.

; CHECK-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES %a(s32), [[C]](s32)

Please continue padding merge values.

// %bits_8_15:(s8) = G_CONSTANT i8 0
// %0:(s16) = G_MERGE_VALUES %bits_0_7:(s8), %bits_8_15:(s8)

%bits_8_15 is defined as zero. As an optimization, we pick zext.

// %0:_(s16) = G_ZEXT %bits_0_7:(s8)

The upper bits of %0 are zero and the lower bits come from %bits_0_7.


Patch is 92.21 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/116283.diff

12 Files Affected:

  • (modified) llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h (+3)
  • (modified) llvm/include/llvm/Target/GlobalISel/Combine.td (+10-1)
  • (modified) llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp (+28)
  • (modified) llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir (+46)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll (+79-91)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/combine-lshr-narrow.mir (+8-12)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll (+16-32)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll (+78-87)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll (+62-68)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll (+48-53)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll (+33-37)
  • (modified) llvm/test/CodeGen/AMDGPU/div_i128.ll (+10-20)
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 49e9d6bd73a4cc..566a5e412662b6 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -932,6 +932,9 @@ class CombinerHelper {
   // merge_values(_, undef) -> anyext
   bool matchMergeXAndUndef(const MachineInstr &MI, BuildFnTy &MatchInfo);
 
+  // merge_values(_, zero) -> zext
+  bool matchMergeXAndZero(const MachineInstr &MI, BuildFnTy &MatchInfo);
+
 private:
   /// Checks for legality of an indexed variant of \p LdSt.
   bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 95f3d637da8548..87f043979262af 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -865,6 +865,14 @@ def merge_of_x_and_undef : GICombineRule <
          [{ return Helper.matchMergeXAndUndef(*${MI}, ${matchinfo}); }]),
   (apply [{ Helper.applyBuildFn(*${MI}, ${matchinfo}); }])>;
 
+/// Transform merge_x_zero -> zext.
+def merge_of_x_and_zero : GICombineRule <
+  (defs root:$root, build_fn_matchinfo:$matchinfo),
+  (match (G_CONSTANT $zero, 0),
+         (G_MERGE_VALUES $root, $x, $zero):$MI,
+         [{ return Helper.matchMergeXAndZero(*${MI}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFn(*${MI}, ${matchinfo}); }])>;
+
 def merge_combines: GICombineGroup<[
   unmerge_anyext_build_vector,
   unmerge_merge,
@@ -873,7 +881,8 @@ def merge_combines: GICombineGroup<[
   unmerge_undef,
   unmerge_dead_to_trunc,
   unmerge_zext_to_zext,
-  merge_of_x_and_undef
+  merge_of_x_and_undef,
+  merge_of_x_and_zero
 ]>;
 
 // Under certain conditions, transform:
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
index 8f4095f01be7a3..797a1e84e21e35 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
@@ -56,3 +56,31 @@ bool CombinerHelper::matchMergeXAndUndef(const MachineInstr &MI,
   };
   return true;
 }
+
+bool CombinerHelper::matchMergeXAndZero(const MachineInstr &MI,
+                                        BuildFnTy &MatchInfo) {
+  const GMerge *Merge = cast<GMerge>(&MI);
+
+  Register Dst = Merge->getReg(0);
+  LLT DstTy = MRI.getType(Dst);
+  LLT SrcTy = MRI.getType(Merge->getSourceReg(0));
+
+  // No multi-use check. It is a constant.
+
+  //
+  //   %bits_8_15:_(s8) = G_CONSTANT i8 0
+  //   %0:_(s16) = G_MERGE_VALUES %bits_0_7:(s8), %bits_8_15:(s8)
+  //
+  // ->
+  //
+  //   %0:_(s16) = G_ZEXT %bits_0_7:(s8)
+  //
+
+  if (!isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {DstTy, SrcTy}}))
+    return false;
+
+  MatchInfo = [=](MachineIRBuilder &B) {
+    B.buildZExt(Dst, Merge->getSourceReg(0));
+  };
+  return true;
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
index d6a79780b8bb16..f427f8648a301e 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
@@ -567,3 +567,49 @@ body:             |
     $q0 = COPY %me(s128)
     $x0 = COPY %def(s64)
 ...
+# Check that we zext the merge
+---
+name:            test_merge_zero
+body:             |
+  bb.1:
+    ; CHECK-LABEL: name: test_merge_zero
+    ; CHECK: %opaque:_(s64) = COPY $x0
+    ; CHECK-NEXT: %me:_(s128) = G_ZEXT %opaque(s64)
+    ; CHECK-NEXT: $q0 = COPY %me(s128)
+    %opaque:_(s64) = COPY $x0
+    %def:_(s64) = G_CONSTANT i64 0
+    %me:_(s128) = G_MERGE_VALUES %opaque(s64), %def
+    $q0 = COPY %me(s128)
+...
+# Check that we still zext the merge, multi-use
+---
+name:            test_merge_zero_multi_use
+body:             |
+  bb.1:
+    ; CHECK-LABEL: name: test_merge_zero_multi_use
+    ; CHECK: %opaque:_(s64) = COPY $x0
+    ; CHECK-NEXT: %def:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: %me:_(s128) = G_ZEXT %opaque(s64)
+    ; CHECK-NEXT: $q0 = COPY %me(s128)
+    ; CHECK-NEXT: $x0 = COPY %def(s64)
+    %opaque:_(s64) = COPY $x0
+    %def:_(s64) = G_CONSTANT i64 0
+    %me:_(s128) = G_MERGE_VALUES %opaque(s64), %def
+    $q0 = COPY %me(s128)
+    $x0 = COPY %def(s64)
+...
+# Check that we don't zext the merge with one
+---
+name:            test_merge_one
+body:             |
+  bb.1:
+    ; CHECK-LABEL: name: test_merge_one
+    ; CHECK: %opaque:_(s64) = COPY $x0
+    ; CHECK-NEXT: %def:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: %me:_(s128) = G_MERGE_VALUES %opaque(s64), %def(s64)
+    ; CHECK-NEXT: $q0 = COPY %me(s128)
+    %opaque:_(s64) = COPY $x0
+    %def:_(s64) = G_CONSTANT i64 1
+    %me:_(s128) = G_MERGE_VALUES %opaque(s64), %def
+    $q0 = COPY %me(s128)
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
index bdfafa89cd0477..28ed88f4cf8fb8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-atomic-optimizer %s | FileCheck -check-prefix=IR %s
 ; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
@@ -16,7 +16,7 @@ declare void @llvm.amdgcn.struct.ptr.buffer.store.format.v4i32(<4 x i32>, ptr ad
 define amdgpu_cs void @atomic_add(<4 x i32> inreg %arg)  {
 ; IR-LABEL: define amdgpu_cs void @atomic_add(
 ; IR-SAME: <4 x i32> inreg [[ARG:%.*]]) {
-; IR-NEXT:  .entry:
+; IR-NEXT:  [[_ENTRY:.*:]]
 ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
 ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
 ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
@@ -26,19 +26,18 @@ define amdgpu_cs void @atomic_add(<4 x i32> inreg %arg)  {
 ; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
 ; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
 ; IR-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 0
-; IR-NEXT:    br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]]
-; IR:       9:
+; IR-NEXT:    br i1 [[TMP8]], label %[[BB9:.*]], label %[[BB11:.*]]
+; IR:       [[BB9]]:
 ; IR-NEXT:    [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 [[TMP7]], <4 x i32> [[ARG]], i32 0, i32 0, i32 0, i32 0)
-; IR-NEXT:    br label [[TMP11]]
-; IR:       11:
+; IR-NEXT:    br label %[[BB11]]
+; IR:       [[BB11]]:
 ; IR-NEXT:    ret void
 ;
 ; GCN-LABEL: atomic_add:
 ; GCN:       ; %bb.0: ; %.entry
 ; GCN-NEXT:    s_mov_b64 s[4:5], exec
-; GCN-NEXT:    s_mov_b32 s6, s5
 ; GCN-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
-; GCN-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s6, v0
+; GCN-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s5, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GCN-NEXT:    s_and_saveexec_b64 s[6:7], vcc
 ; GCN-NEXT:    s_cbranch_execz .LBB0_2
@@ -57,7 +56,7 @@ define amdgpu_cs void @atomic_add(<4 x i32> inreg %arg)  {
 define amdgpu_cs void @atomic_add_and_format(<4 x i32> inreg %arg) {
 ; IR-LABEL: define amdgpu_cs void @atomic_add_and_format(
 ; IR-SAME: <4 x i32> inreg [[ARG:%.*]]) {
-; IR-NEXT:  .entry:
+; IR-NEXT:  [[_ENTRY:.*:]]
 ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
 ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
 ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
@@ -67,12 +66,12 @@ define amdgpu_cs void @atomic_add_and_format(<4 x i32> inreg %arg) {
 ; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
 ; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
 ; IR-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 0
-; IR-NEXT:    br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]]
-; IR:       9:
+; IR-NEXT:    br i1 [[TMP8]], label %[[TMP9:.*]], label %[[BB11:.*]]
+; IR:       [[TMP9]]:
 ; IR-NEXT:    [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 [[TMP7]], <4 x i32> [[ARG]], i32 0, i32 0, i32 0, i32 0)
-; IR-NEXT:    br label [[TMP11]]
-; IR:       11:
-; IR-NEXT:    [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ]
+; IR-NEXT:    br label %[[BB11]]
+; IR:       [[BB11]]:
+; IR-NEXT:    [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], %[[TMP9]] ]
 ; IR-NEXT:    [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]])
 ; IR-NEXT:    [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP5]]
 ; IR-NEXT:    call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> [[ARG]], <4 x i32> [[ARG]], i32 [[TMP14]], i32 0, i32 0, i32 0)
@@ -81,9 +80,8 @@ define amdgpu_cs void @atomic_add_and_format(<4 x i32> inreg %arg) {
 ; GCN-LABEL: atomic_add_and_format:
 ; GCN:       ; %bb.0: ; %.entry
 ; GCN-NEXT:    s_mov_b64 s[6:7], exec
-; GCN-NEXT:    s_mov_b32 s4, s7
 ; GCN-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
-; GCN-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s4, v0
+; GCN-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GCN-NEXT:    ; implicit-def: $vgpr1
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -114,7 +112,7 @@ define amdgpu_cs void @atomic_add_and_format(<4 x i32> inreg %arg) {
 define amdgpu_cs void @atomic_sub(<4 x i32> inreg %arg)  {
 ; IR-LABEL: define amdgpu_cs void @atomic_sub(
 ; IR-SAME: <4 x i32> inreg [[ARG:%.*]]) {
-; IR-NEXT:  .entry:
+; IR-NEXT:  [[_ENTRY:.*:]]
 ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
 ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
 ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
@@ -124,19 +122,18 @@ define amdgpu_cs void @atomic_sub(<4 x i32> inreg %arg)  {
 ; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
 ; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
 ; IR-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 0
-; IR-NEXT:    br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]]
-; IR:       9:
+; IR-NEXT:    br i1 [[TMP8]], label %[[BB9:.*]], label %[[BB11:.*]]
+; IR:       [[BB9]]:
 ; IR-NEXT:    [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32 [[TMP7]], <4 x i32> [[ARG]], i32 0, i32 0, i32 0, i32 0)
-; IR-NEXT:    br label [[TMP11]]
-; IR:       11:
+; IR-NEXT:    br label %[[BB11]]
+; IR:       [[BB11]]:
 ; IR-NEXT:    ret void
 ;
 ; GCN-LABEL: atomic_sub:
 ; GCN:       ; %bb.0: ; %.entry
 ; GCN-NEXT:    s_mov_b64 s[4:5], exec
-; GCN-NEXT:    s_mov_b32 s6, s5
 ; GCN-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
-; GCN-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s6, v0
+; GCN-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s5, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GCN-NEXT:    s_and_saveexec_b64 s[6:7], vcc
 ; GCN-NEXT:    s_cbranch_execz .LBB2_2
@@ -155,7 +152,7 @@ define amdgpu_cs void @atomic_sub(<4 x i32> inreg %arg)  {
 define amdgpu_cs void @atomic_sub_and_format(<4 x i32> inreg %arg) {
 ; IR-LABEL: define amdgpu_cs void @atomic_sub_and_format(
 ; IR-SAME: <4 x i32> inreg [[ARG:%.*]]) {
-; IR-NEXT:  .entry:
+; IR-NEXT:  [[_ENTRY:.*:]]
 ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
 ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
 ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
@@ -165,12 +162,12 @@ define amdgpu_cs void @atomic_sub_and_format(<4 x i32> inreg %arg) {
 ; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
 ; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
 ; IR-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 0
-; IR-NEXT:    br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]]
-; IR:       9:
+; IR-NEXT:    br i1 [[TMP8]], label %[[TMP9:.*]], label %[[BB11:.*]]
+; IR:       [[TMP9]]:
 ; IR-NEXT:    [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32 [[TMP7]], <4 x i32> [[ARG]], i32 0, i32 0, i32 0, i32 0)
-; IR-NEXT:    br label [[TMP11]]
-; IR:       11:
-; IR-NEXT:    [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ]
+; IR-NEXT:    br label %[[BB11]]
+; IR:       [[BB11]]:
+; IR-NEXT:    [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], %[[TMP9]] ]
 ; IR-NEXT:    [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]])
 ; IR-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP13]], [[TMP5]]
 ; IR-NEXT:    call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> [[ARG]], <4 x i32> [[ARG]], i32 [[TMP14]], i32 0, i32 0, i32 0)
@@ -179,9 +176,8 @@ define amdgpu_cs void @atomic_sub_and_format(<4 x i32> inreg %arg) {
 ; GCN-LABEL: atomic_sub_and_format:
 ; GCN:       ; %bb.0: ; %.entry
 ; GCN-NEXT:    s_mov_b64 s[6:7], exec
-; GCN-NEXT:    s_mov_b32 s4, s7
 ; GCN-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
-; GCN-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s4, v0
+; GCN-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GCN-NEXT:    ; implicit-def: $vgpr1
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -212,7 +208,7 @@ define amdgpu_cs void @atomic_sub_and_format(<4 x i32> inreg %arg) {
 define amdgpu_cs void @atomic_xor(<4 x i32> inreg %arg)  {
 ; IR-LABEL: define amdgpu_cs void @atomic_xor(
 ; IR-SAME: <4 x i32> inreg [[ARG:%.*]]) {
-; IR-NEXT:  .entry:
+; IR-NEXT:  [[_ENTRY:.*:]]
 ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
 ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
 ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
@@ -223,19 +219,18 @@ define amdgpu_cs void @atomic_xor(<4 x i32> inreg %arg)  {
 ; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
 ; IR-NEXT:    [[TMP8:%.*]] = and i32 [[TMP7]], 1
 ; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
-; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
-; IR:       10:
+; IR-NEXT:    br i1 [[TMP9]], label %[[BB10:.*]], label %[[BB12:.*]]
+; IR:       [[BB10]]:
 ; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32 [[TMP8]], <4 x i32> [[ARG]], i32 0, i32 0, i32 0, i32 0)
-; IR-NEXT:    br label [[TMP12]]
-; IR:       12:
+; IR-NEXT:    br label %[[BB12]]
+; IR:       [[BB12]]:
 ; IR-NEXT:    ret void
 ;
 ; GCN-LABEL: atomic_xor:
 ; GCN:       ; %bb.0: ; %.entry
 ; GCN-NEXT:    s_mov_b64 s[4:5], exec
-; GCN-NEXT:    s_mov_b32 s6, s5
 ; GCN-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
-; GCN-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s6, v0
+; GCN-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s5, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GCN-NEXT:    s_and_saveexec_b64 s[6:7], vcc
 ; GCN-NEXT:    s_cbranch_execz .LBB4_2
@@ -255,7 +250,7 @@ define amdgpu_cs void @atomic_xor(<4 x i32> inreg %arg)  {
 define amdgpu_cs void @atomic_xor_and_format(<4 x i32> inreg %arg) {
 ; IR-LABEL: define amdgpu_cs void @atomic_xor_and_format(
 ; IR-SAME: <4 x i32> inreg [[ARG:%.*]]) {
-; IR-NEXT:  .entry:
+; IR-NEXT:  [[_ENTRY:.*:]]
 ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
 ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
 ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
@@ -266,12 +261,12 @@ define amdgpu_cs void @atomic_xor_and_format(<4 x i32> inreg %arg) {
 ; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
 ; IR-NEXT:    [[TMP8:%.*]] = and i32 [[TMP7]], 1
 ; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
-; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
-; IR:       10:
+; IR-NEXT:    br i1 [[TMP9]], label %[[TMP10:.*]], label %[[BB12:.*]]
+; IR:       [[TMP10]]:
 ; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32 [[TMP8]], <4 x i32> [[ARG]], i32 0, i32 0, i32 0, i32 0)
-; IR-NEXT:    br label [[TMP12]]
-; IR:       12:
-; IR-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
+; IR-NEXT:    br label %[[BB12]]
+; IR:       [[BB12]]:
+; IR-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP11]], %[[TMP10]] ]
 ; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
 ; IR-NEXT:    [[TMP15:%.*]] = and i32 [[TMP5]], 1
 ; IR-NEXT:    [[TMP16:%.*]] = xor i32 [[TMP14]], [[TMP15]]
@@ -281,9 +276,8 @@ define amdgpu_cs void @atomic_xor_and_format(<4 x i32> inreg %arg) {
 ; GCN-LABEL: atomic_xor_and_format:
 ; GCN:       ; %bb.0: ; %.entry
 ; GCN-NEXT:    s_mov_b64 s[6:7], exec
-; GCN-NEXT:    s_mov_b32 s4, s7
 ; GCN-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
-; GCN-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s4, v0
+; GCN-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GCN-NEXT:    ; implicit-def: $vgpr1
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -316,7 +310,7 @@ define amdgpu_cs void @atomic_xor_and_format(<4 x i32> inreg %arg) {
 define amdgpu_cs void @atomic_ptr_add(ptr addrspace(8) inreg %arg)  {
 ; IR-LABEL: define amdgpu_cs void @atomic_ptr_add(
 ; IR-SAME: ptr addrspace(8) inreg [[ARG:%.*]]) {
-; IR-NEXT:  .entry:
+; IR-NEXT:  [[_ENTRY:.*:]]
 ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
 ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
 ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
@@ -326,19 +320,18 @@ define amdgpu_cs void @atomic_ptr_add(ptr addrspace(8) inreg %arg)  {
 ; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
 ; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
 ; IR-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 0
-; IR-NEXT:    br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]]
-; IR:       9:
+; IR-NEXT:    br i1 [[TMP8]], label %[[BB9:.*]], label %[[BB11:.*]]
+; IR:       [[BB9]]:
 ; IR-NEXT:    [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add.i32(i32 [[TMP7]], ptr addrspace(8) [[ARG]], i32 0, i32 0, i32 0, i32 0)
-; IR-NEXT:    br label [[TMP11]]
-; IR:       11:
+; IR-NEXT:    br label %[[BB11]]
+; IR:       [[BB11]]:
 ; IR-NEXT:    ret void
 ;
 ; GCN-LABEL: atomic_ptr_add:
 ; GCN:       ; %bb.0: ; %.entry
 ; GCN-NEXT:    s_mov_b64 s[4:5], exec
-; GCN-NEXT:    s_mov_b32 s6, s5
 ; GCN-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
-; GCN-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s6, v0
+; GCN-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s5, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GCN-NEXT:    s_and_saveexec_b64 s[6:7], vcc
 ; GCN-NEXT:    s_cbranch_execz .LBB6_2
@@ -357,7 +350,7 @@ define amdgpu_cs void @atomic_ptr_add(ptr addrspace(8) inreg %arg)  {
 define amdgpu_cs void @atomic_ptr_add_and_format(ptr addrspace(8) inreg %arg) {
 ; IR-LABEL: define amdgpu_cs void @atomic_ptr_add_and_format(
 ; IR-SAME: ptr addrspace(8) inreg [[ARG:%.*]]) {
-; IR-NEXT:  .entry:
+; IR-NEXT:  [[_ENTRY:.*:]]
 ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
 ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
 ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
@@ -367,12 +360,12 @@ define amdgpu_cs void @atomic_ptr_add_and_format(ptr addrspace(8) inreg %arg) {
 ; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
 ; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
 ; IR-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 0
-; IR-NEXT:    br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]]
-; IR:       9:
+; IR-NEXT:    br i1 [[TMP8]], label %[[TMP9:.*]], label %[[BB11:.*]]
+; IR:       [[TMP9]]:
 ; IR-NEXT:    [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add.i32(i32 [[TMP7]], ptr addrspace(8) [[ARG]], i32 0, i32 0, i32 0, i32 0)
-; IR-NEXT:    br label [[TMP11]]
-; IR:       11:
-; IR-NEXT:    [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ]
+; IR-NEXT:    br label %[[BB11]]
+; IR:       [[BB11]]:
+; IR-NEXT:    [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], %[[TMP9]] ]
 ; IR-NEXT:    [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]])
 ; IR-NEXT:    [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP5]]
 ; IR-NEXT:    [[ARG_INT:%.*]] = ptrtoint ptr addrspace(8) [[ARG]] to i128
@@ -383,9 +376,8 @@ define amdgpu_cs void @atomic_ptr_add_and_format(ptr addrspace(8) inreg %arg) {
 ; GCN-LABEL: atomic_ptr_add_and_format:
 ; GCN:       ; %bb.0: ; %.entry
 ; GCN-NEXT:    s_mov_b64 s[6:7], exec
-; GCN-NEXT:    s_mov_b32 s4, s7
 ; GCN-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
-; GCN-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s4, v0
+; GCN-NEXT:    v_mbcn...
[truncated]

@llvmbot
Copy link
Member

llvmbot commented Nov 14, 2024

@llvm/pr-subscribers-llvm-globalisel

Author: Thorsten Schütt (tschuett)

Changes

into zext x

LegalizerHelper has two padding strategies: undef or zero.

see LegalizerHelper:273
see LegalizerHelper:315

This PR is about zero sugar and Coke Zero.

; CHECK-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES %a(s32), [[C]](s32)

Please continue padding merge values.

// %bits_8_15:(s8) = G_CONSTANT i8 0
// %0:(s16) = G_MERGE_VALUES %bits_0_7:(s8), %bits_8_15:(s8)

%bits_8_15 is defined as zero. As an optimization, we pick zext.

// %0:_(s16) = G_ZEXT %bits_0_7:(s8)

The upper bits of %0 are zero and the lower bits come from %bits_0_7.


Patch is 92.21 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/116283.diff

12 Files Affected:

  • (modified) llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h (+3)
  • (modified) llvm/include/llvm/Target/GlobalISel/Combine.td (+10-1)
  • (modified) llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp (+28)
  • (modified) llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir (+46)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll (+79-91)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/combine-lshr-narrow.mir (+8-12)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll (+16-32)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll (+78-87)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll (+62-68)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll (+48-53)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll (+33-37)
  • (modified) llvm/test/CodeGen/AMDGPU/div_i128.ll (+10-20)
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 49e9d6bd73a4cc..566a5e412662b6 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -932,6 +932,9 @@ class CombinerHelper {
   // merge_values(_, undef) -> anyext
   bool matchMergeXAndUndef(const MachineInstr &MI, BuildFnTy &MatchInfo);
 
+  // merge_values(_, zero) -> zext
+  bool matchMergeXAndZero(const MachineInstr &MI, BuildFnTy &MatchInfo);
+
 private:
   /// Checks for legality of an indexed variant of \p LdSt.
   bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 95f3d637da8548..87f043979262af 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -865,6 +865,14 @@ def merge_of_x_and_undef : GICombineRule <
          [{ return Helper.matchMergeXAndUndef(*${MI}, ${matchinfo}); }]),
   (apply [{ Helper.applyBuildFn(*${MI}, ${matchinfo}); }])>;
 
+/// Transform merge_x_zero -> zext.
+def merge_of_x_and_zero : GICombineRule <
+  (defs root:$root, build_fn_matchinfo:$matchinfo),
+  (match (G_CONSTANT $zero, 0),
+         (G_MERGE_VALUES $root, $x, $zero):$MI,
+         [{ return Helper.matchMergeXAndZero(*${MI}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFn(*${MI}, ${matchinfo}); }])>;
+
 def merge_combines: GICombineGroup<[
   unmerge_anyext_build_vector,
   unmerge_merge,
@@ -873,7 +881,8 @@ def merge_combines: GICombineGroup<[
   unmerge_undef,
   unmerge_dead_to_trunc,
   unmerge_zext_to_zext,
-  merge_of_x_and_undef
+  merge_of_x_and_undef,
+  merge_of_x_and_zero
 ]>;
 
 // Under certain conditions, transform:
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
index 8f4095f01be7a3..797a1e84e21e35 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
@@ -56,3 +56,31 @@ bool CombinerHelper::matchMergeXAndUndef(const MachineInstr &MI,
   };
   return true;
 }
+
+bool CombinerHelper::matchMergeXAndZero(const MachineInstr &MI,
+                                        BuildFnTy &MatchInfo) {
+  const GMerge *Merge = cast<GMerge>(&MI);
+
+  Register Dst = Merge->getReg(0);
+  LLT DstTy = MRI.getType(Dst);
+  LLT SrcTy = MRI.getType(Merge->getSourceReg(0));
+
+  // No multi-use check. It is a constant.
+
+  //
+  //   %bits_8_15:_(s8) = G_CONSTANT i8 0
+  //   %0:_(s16) = G_MERGE_VALUES %bits_0_7:(s8), %bits_8_15:(s8)
+  //
+  // ->
+  //
+  //   %0:_(s16) = G_ZEXT %bits_0_7:(s8)
+  //
+
+  if (!isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {DstTy, SrcTy}}))
+    return false;
+
+  MatchInfo = [=](MachineIRBuilder &B) {
+    B.buildZExt(Dst, Merge->getSourceReg(0));
+  };
+  return true;
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
index d6a79780b8bb16..f427f8648a301e 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
@@ -567,3 +567,49 @@ body:             |
     $q0 = COPY %me(s128)
     $x0 = COPY %def(s64)
 ...
+# Check that we zext the merge
+---
+name:            test_merge_zero
+body:             |
+  bb.1:
+    ; CHECK-LABEL: name: test_merge_zero
+    ; CHECK: %opaque:_(s64) = COPY $x0
+    ; CHECK-NEXT: %me:_(s128) = G_ZEXT %opaque(s64)
+    ; CHECK-NEXT: $q0 = COPY %me(s128)
+    %opaque:_(s64) = COPY $x0
+    %def:_(s64) = G_CONSTANT i64 0
+    %me:_(s128) = G_MERGE_VALUES %opaque(s64), %def
+    $q0 = COPY %me(s128)
+...
+# Check that we still zext the merge, multi-use
+---
+name:            test_merge_zero_multi_use
+body:             |
+  bb.1:
+    ; CHECK-LABEL: name: test_merge_zero_multi_use
+    ; CHECK: %opaque:_(s64) = COPY $x0
+    ; CHECK-NEXT: %def:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: %me:_(s128) = G_ZEXT %opaque(s64)
+    ; CHECK-NEXT: $q0 = COPY %me(s128)
+    ; CHECK-NEXT: $x0 = COPY %def(s64)
+    %opaque:_(s64) = COPY $x0
+    %def:_(s64) = G_CONSTANT i64 0
+    %me:_(s128) = G_MERGE_VALUES %opaque(s64), %def
+    $q0 = COPY %me(s128)
+    $x0 = COPY %def(s64)
+...
+# Check that we don't zext the merge with one
+---
+name:            test_merge_one
+body:             |
+  bb.1:
+    ; CHECK-LABEL: name: test_merge_one
+    ; CHECK: %opaque:_(s64) = COPY $x0
+    ; CHECK-NEXT: %def:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: %me:_(s128) = G_MERGE_VALUES %opaque(s64), %def(s64)
+    ; CHECK-NEXT: $q0 = COPY %me(s128)
+    %opaque:_(s64) = COPY $x0
+    %def:_(s64) = G_CONSTANT i64 1
+    %me:_(s128) = G_MERGE_VALUES %opaque(s64), %def
+    $q0 = COPY %me(s128)
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
index bdfafa89cd0477..28ed88f4cf8fb8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-atomic-optimizer %s | FileCheck -check-prefix=IR %s
 ; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
@@ -16,7 +16,7 @@ declare void @llvm.amdgcn.struct.ptr.buffer.store.format.v4i32(<4 x i32>, ptr ad
 define amdgpu_cs void @atomic_add(<4 x i32> inreg %arg)  {
 ; IR-LABEL: define amdgpu_cs void @atomic_add(
 ; IR-SAME: <4 x i32> inreg [[ARG:%.*]]) {
-; IR-NEXT:  .entry:
+; IR-NEXT:  [[_ENTRY:.*:]]
 ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
 ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
 ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
@@ -26,19 +26,18 @@ define amdgpu_cs void @atomic_add(<4 x i32> inreg %arg)  {
 ; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
 ; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
 ; IR-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 0
-; IR-NEXT:    br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]]
-; IR:       9:
+; IR-NEXT:    br i1 [[TMP8]], label %[[BB9:.*]], label %[[BB11:.*]]
+; IR:       [[BB9]]:
 ; IR-NEXT:    [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 [[TMP7]], <4 x i32> [[ARG]], i32 0, i32 0, i32 0, i32 0)
-; IR-NEXT:    br label [[TMP11]]
-; IR:       11:
+; IR-NEXT:    br label %[[BB11]]
+; IR:       [[BB11]]:
 ; IR-NEXT:    ret void
 ;
 ; GCN-LABEL: atomic_add:
 ; GCN:       ; %bb.0: ; %.entry
 ; GCN-NEXT:    s_mov_b64 s[4:5], exec
-; GCN-NEXT:    s_mov_b32 s6, s5
 ; GCN-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
-; GCN-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s6, v0
+; GCN-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s5, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GCN-NEXT:    s_and_saveexec_b64 s[6:7], vcc
 ; GCN-NEXT:    s_cbranch_execz .LBB0_2
@@ -57,7 +56,7 @@ define amdgpu_cs void @atomic_add(<4 x i32> inreg %arg)  {
 define amdgpu_cs void @atomic_add_and_format(<4 x i32> inreg %arg) {
 ; IR-LABEL: define amdgpu_cs void @atomic_add_and_format(
 ; IR-SAME: <4 x i32> inreg [[ARG:%.*]]) {
-; IR-NEXT:  .entry:
+; IR-NEXT:  [[_ENTRY:.*:]]
 ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
 ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
 ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
@@ -67,12 +66,12 @@ define amdgpu_cs void @atomic_add_and_format(<4 x i32> inreg %arg) {
 ; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
 ; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
 ; IR-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 0
-; IR-NEXT:    br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]]
-; IR:       9:
+; IR-NEXT:    br i1 [[TMP8]], label %[[TMP9:.*]], label %[[BB11:.*]]
+; IR:       [[TMP9]]:
 ; IR-NEXT:    [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 [[TMP7]], <4 x i32> [[ARG]], i32 0, i32 0, i32 0, i32 0)
-; IR-NEXT:    br label [[TMP11]]
-; IR:       11:
-; IR-NEXT:    [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ]
+; IR-NEXT:    br label %[[BB11]]
+; IR:       [[BB11]]:
+; IR-NEXT:    [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], %[[TMP9]] ]
 ; IR-NEXT:    [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]])
 ; IR-NEXT:    [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP5]]
 ; IR-NEXT:    call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> [[ARG]], <4 x i32> [[ARG]], i32 [[TMP14]], i32 0, i32 0, i32 0)
@@ -81,9 +80,8 @@ define amdgpu_cs void @atomic_add_and_format(<4 x i32> inreg %arg) {
 ; GCN-LABEL: atomic_add_and_format:
 ; GCN:       ; %bb.0: ; %.entry
 ; GCN-NEXT:    s_mov_b64 s[6:7], exec
-; GCN-NEXT:    s_mov_b32 s4, s7
 ; GCN-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
-; GCN-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s4, v0
+; GCN-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GCN-NEXT:    ; implicit-def: $vgpr1
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -114,7 +112,7 @@ define amdgpu_cs void @atomic_add_and_format(<4 x i32> inreg %arg) {
 define amdgpu_cs void @atomic_sub(<4 x i32> inreg %arg)  {
 ; IR-LABEL: define amdgpu_cs void @atomic_sub(
 ; IR-SAME: <4 x i32> inreg [[ARG:%.*]]) {
-; IR-NEXT:  .entry:
+; IR-NEXT:  [[_ENTRY:.*:]]
 ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
 ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
 ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
@@ -124,19 +122,18 @@ define amdgpu_cs void @atomic_sub(<4 x i32> inreg %arg)  {
 ; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
 ; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
 ; IR-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 0
-; IR-NEXT:    br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]]
-; IR:       9:
+; IR-NEXT:    br i1 [[TMP8]], label %[[BB9:.*]], label %[[BB11:.*]]
+; IR:       [[BB9]]:
 ; IR-NEXT:    [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32 [[TMP7]], <4 x i32> [[ARG]], i32 0, i32 0, i32 0, i32 0)
-; IR-NEXT:    br label [[TMP11]]
-; IR:       11:
+; IR-NEXT:    br label %[[BB11]]
+; IR:       [[BB11]]:
 ; IR-NEXT:    ret void
 ;
 ; GCN-LABEL: atomic_sub:
 ; GCN:       ; %bb.0: ; %.entry
 ; GCN-NEXT:    s_mov_b64 s[4:5], exec
-; GCN-NEXT:    s_mov_b32 s6, s5
 ; GCN-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
-; GCN-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s6, v0
+; GCN-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s5, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GCN-NEXT:    s_and_saveexec_b64 s[6:7], vcc
 ; GCN-NEXT:    s_cbranch_execz .LBB2_2
@@ -155,7 +152,7 @@ define amdgpu_cs void @atomic_sub(<4 x i32> inreg %arg)  {
 define amdgpu_cs void @atomic_sub_and_format(<4 x i32> inreg %arg) {
 ; IR-LABEL: define amdgpu_cs void @atomic_sub_and_format(
 ; IR-SAME: <4 x i32> inreg [[ARG:%.*]]) {
-; IR-NEXT:  .entry:
+; IR-NEXT:  [[_ENTRY:.*:]]
 ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
 ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
 ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
@@ -165,12 +162,12 @@ define amdgpu_cs void @atomic_sub_and_format(<4 x i32> inreg %arg) {
 ; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
 ; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
 ; IR-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 0
-; IR-NEXT:    br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]]
-; IR:       9:
+; IR-NEXT:    br i1 [[TMP8]], label %[[TMP9:.*]], label %[[BB11:.*]]
+; IR:       [[TMP9]]:
 ; IR-NEXT:    [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32 [[TMP7]], <4 x i32> [[ARG]], i32 0, i32 0, i32 0, i32 0)
-; IR-NEXT:    br label [[TMP11]]
-; IR:       11:
-; IR-NEXT:    [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ]
+; IR-NEXT:    br label %[[BB11]]
+; IR:       [[BB11]]:
+; IR-NEXT:    [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], %[[TMP9]] ]
 ; IR-NEXT:    [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]])
 ; IR-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP13]], [[TMP5]]
 ; IR-NEXT:    call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> [[ARG]], <4 x i32> [[ARG]], i32 [[TMP14]], i32 0, i32 0, i32 0)
@@ -179,9 +176,8 @@ define amdgpu_cs void @atomic_sub_and_format(<4 x i32> inreg %arg) {
 ; GCN-LABEL: atomic_sub_and_format:
 ; GCN:       ; %bb.0: ; %.entry
 ; GCN-NEXT:    s_mov_b64 s[6:7], exec
-; GCN-NEXT:    s_mov_b32 s4, s7
 ; GCN-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
-; GCN-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s4, v0
+; GCN-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GCN-NEXT:    ; implicit-def: $vgpr1
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -212,7 +208,7 @@ define amdgpu_cs void @atomic_sub_and_format(<4 x i32> inreg %arg) {
 define amdgpu_cs void @atomic_xor(<4 x i32> inreg %arg)  {
 ; IR-LABEL: define amdgpu_cs void @atomic_xor(
 ; IR-SAME: <4 x i32> inreg [[ARG:%.*]]) {
-; IR-NEXT:  .entry:
+; IR-NEXT:  [[_ENTRY:.*:]]
 ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
 ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
 ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
@@ -223,19 +219,18 @@ define amdgpu_cs void @atomic_xor(<4 x i32> inreg %arg)  {
 ; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
 ; IR-NEXT:    [[TMP8:%.*]] = and i32 [[TMP7]], 1
 ; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
-; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
-; IR:       10:
+; IR-NEXT:    br i1 [[TMP9]], label %[[BB10:.*]], label %[[BB12:.*]]
+; IR:       [[BB10]]:
 ; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32 [[TMP8]], <4 x i32> [[ARG]], i32 0, i32 0, i32 0, i32 0)
-; IR-NEXT:    br label [[TMP12]]
-; IR:       12:
+; IR-NEXT:    br label %[[BB12]]
+; IR:       [[BB12]]:
 ; IR-NEXT:    ret void
 ;
 ; GCN-LABEL: atomic_xor:
 ; GCN:       ; %bb.0: ; %.entry
 ; GCN-NEXT:    s_mov_b64 s[4:5], exec
-; GCN-NEXT:    s_mov_b32 s6, s5
 ; GCN-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
-; GCN-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s6, v0
+; GCN-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s5, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GCN-NEXT:    s_and_saveexec_b64 s[6:7], vcc
 ; GCN-NEXT:    s_cbranch_execz .LBB4_2
@@ -255,7 +250,7 @@ define amdgpu_cs void @atomic_xor(<4 x i32> inreg %arg)  {
 define amdgpu_cs void @atomic_xor_and_format(<4 x i32> inreg %arg) {
 ; IR-LABEL: define amdgpu_cs void @atomic_xor_and_format(
 ; IR-SAME: <4 x i32> inreg [[ARG:%.*]]) {
-; IR-NEXT:  .entry:
+; IR-NEXT:  [[_ENTRY:.*:]]
 ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
 ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
 ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
@@ -266,12 +261,12 @@ define amdgpu_cs void @atomic_xor_and_format(<4 x i32> inreg %arg) {
 ; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
 ; IR-NEXT:    [[TMP8:%.*]] = and i32 [[TMP7]], 1
 ; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
-; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
-; IR:       10:
+; IR-NEXT:    br i1 [[TMP9]], label %[[TMP10:.*]], label %[[BB12:.*]]
+; IR:       [[TMP10]]:
 ; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32 [[TMP8]], <4 x i32> [[ARG]], i32 0, i32 0, i32 0, i32 0)
-; IR-NEXT:    br label [[TMP12]]
-; IR:       12:
-; IR-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
+; IR-NEXT:    br label %[[BB12]]
+; IR:       [[BB12]]:
+; IR-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP11]], %[[TMP10]] ]
 ; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
 ; IR-NEXT:    [[TMP15:%.*]] = and i32 [[TMP5]], 1
 ; IR-NEXT:    [[TMP16:%.*]] = xor i32 [[TMP14]], [[TMP15]]
@@ -281,9 +276,8 @@ define amdgpu_cs void @atomic_xor_and_format(<4 x i32> inreg %arg) {
 ; GCN-LABEL: atomic_xor_and_format:
 ; GCN:       ; %bb.0: ; %.entry
 ; GCN-NEXT:    s_mov_b64 s[6:7], exec
-; GCN-NEXT:    s_mov_b32 s4, s7
 ; GCN-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
-; GCN-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s4, v0
+; GCN-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GCN-NEXT:    ; implicit-def: $vgpr1
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -316,7 +310,7 @@ define amdgpu_cs void @atomic_xor_and_format(<4 x i32> inreg %arg) {
 define amdgpu_cs void @atomic_ptr_add(ptr addrspace(8) inreg %arg)  {
 ; IR-LABEL: define amdgpu_cs void @atomic_ptr_add(
 ; IR-SAME: ptr addrspace(8) inreg [[ARG:%.*]]) {
-; IR-NEXT:  .entry:
+; IR-NEXT:  [[_ENTRY:.*:]]
 ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
 ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
 ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
@@ -326,19 +320,18 @@ define amdgpu_cs void @atomic_ptr_add(ptr addrspace(8) inreg %arg)  {
 ; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
 ; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
 ; IR-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 0
-; IR-NEXT:    br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]]
-; IR:       9:
+; IR-NEXT:    br i1 [[TMP8]], label %[[BB9:.*]], label %[[BB11:.*]]
+; IR:       [[BB9]]:
 ; IR-NEXT:    [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add.i32(i32 [[TMP7]], ptr addrspace(8) [[ARG]], i32 0, i32 0, i32 0, i32 0)
-; IR-NEXT:    br label [[TMP11]]
-; IR:       11:
+; IR-NEXT:    br label %[[BB11]]
+; IR:       [[BB11]]:
 ; IR-NEXT:    ret void
 ;
 ; GCN-LABEL: atomic_ptr_add:
 ; GCN:       ; %bb.0: ; %.entry
 ; GCN-NEXT:    s_mov_b64 s[4:5], exec
-; GCN-NEXT:    s_mov_b32 s6, s5
 ; GCN-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
-; GCN-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s6, v0
+; GCN-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s5, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GCN-NEXT:    s_and_saveexec_b64 s[6:7], vcc
 ; GCN-NEXT:    s_cbranch_execz .LBB6_2
@@ -357,7 +350,7 @@ define amdgpu_cs void @atomic_ptr_add(ptr addrspace(8) inreg %arg)  {
 define amdgpu_cs void @atomic_ptr_add_and_format(ptr addrspace(8) inreg %arg) {
 ; IR-LABEL: define amdgpu_cs void @atomic_ptr_add_and_format(
 ; IR-SAME: ptr addrspace(8) inreg [[ARG:%.*]]) {
-; IR-NEXT:  .entry:
+; IR-NEXT:  [[_ENTRY:.*:]]
 ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
 ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
 ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
@@ -367,12 +360,12 @@ define amdgpu_cs void @atomic_ptr_add_and_format(ptr addrspace(8) inreg %arg) {
 ; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
 ; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
 ; IR-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 0
-; IR-NEXT:    br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]]
-; IR:       9:
+; IR-NEXT:    br i1 [[TMP8]], label %[[TMP9:.*]], label %[[BB11:.*]]
+; IR:       [[TMP9]]:
 ; IR-NEXT:    [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add.i32(i32 [[TMP7]], ptr addrspace(8) [[ARG]], i32 0, i32 0, i32 0, i32 0)
-; IR-NEXT:    br label [[TMP11]]
-; IR:       11:
-; IR-NEXT:    [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ]
+; IR-NEXT:    br label %[[BB11]]
+; IR:       [[BB11]]:
+; IR-NEXT:    [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], %[[TMP9]] ]
 ; IR-NEXT:    [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]])
 ; IR-NEXT:    [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP5]]
 ; IR-NEXT:    [[ARG_INT:%.*]] = ptrtoint ptr addrspace(8) [[ARG]] to i128
@@ -383,9 +376,8 @@ define amdgpu_cs void @atomic_ptr_add_and_format(ptr addrspace(8) inreg %arg) {
 ; GCN-LABEL: atomic_ptr_add_and_format:
 ; GCN:       ; %bb.0: ; %.entry
 ; GCN-NEXT:    s_mov_b64 s[6:7], exec
-; GCN-NEXT:    s_mov_b32 s4, s7
 ; GCN-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
-; GCN-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s4, v0
+; GCN-NEXT:    v_mbcn...
[truncated]

@llvm-ci
Copy link
Collaborator

llvm-ci commented Nov 16, 2024

LLVM Buildbot has detected a new failure on builder sanitizer-x86_64-linux-bootstrap-asan running on sanitizer-buildbot2 while building llvm at step 2 "annotate".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/52/builds/3759

Here is the relevant piece of the build log, for reference:
Step 2 (annotate) failure: 'python ../sanitizer_buildbot/sanitizers/zorg/buildbot/builders/sanitizers/buildbot_selector.py' (failure)
...
llvm-lit: /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm-project/llvm/utils/lit/lit/llvm/config.py:506: note: using lld-link: /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/bin/lld-link
llvm-lit: /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm-project/llvm/utils/lit/lit/llvm/config.py:506: note: using ld64.lld: /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld64.lld
llvm-lit: /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm-project/llvm/utils/lit/lit/llvm/config.py:506: note: using wasm-ld: /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/bin/wasm-ld
llvm-lit: /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm-project/llvm/utils/lit/lit/llvm/config.py:506: note: using ld.lld: /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld.lld
llvm-lit: /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm-project/llvm/utils/lit/lit/llvm/config.py:506: note: using lld-link: /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/bin/lld-link
llvm-lit: /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm-project/llvm/utils/lit/lit/llvm/config.py:506: note: using ld64.lld: /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld64.lld
llvm-lit: /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm-project/llvm/utils/lit/lit/llvm/config.py:506: note: using wasm-ld: /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/bin/wasm-ld
llvm-lit: /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm-project/llvm/utils/lit/lit/main.py:72: note: The test suite configuration requested an individual test timeout of 0 seconds but a timeout of 900 seconds was requested on the command line. Forcing timeout to be 900 seconds.
-- Testing: 87236 of 87237 tests, 88 workers --
Testing:  0.. 10.. 20.. 30.. 40.. 50.. 60.. 70.. 80.. 90..
FAIL: lld :: ELF/lto/devirt_vcall_vis_localize.ll (85847 of 87236)
******************** TEST 'lld :: ELF/lto/devirt_vcall_vis_localize.ll' FAILED ********************
Exit Code: 1

Command Output (stderr):
--
RUN: at line 5: /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/bin/opt --thinlto-bc -o /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/tools/lld/test/ELF/lto/Output/devirt_vcall_vis_localize.ll.tmp1.o /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm-project/lld/test/ELF/lto/devirt_vcall_vis_localize.ll
+ /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/bin/opt --thinlto-bc -o /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/tools/lld/test/ELF/lto/Output/devirt_vcall_vis_localize.ll.tmp1.o /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm-project/lld/test/ELF/lto/devirt_vcall_vis_localize.ll
RUN: at line 6: /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/bin/opt --thinlto-bc -o /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/tools/lld/test/ELF/lto/Output/devirt_vcall_vis_localize.ll.tmp2.o /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm-project/lld/test/ELF/lto/Inputs/devirt_vcall_vis_shared_def.ll
+ /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/bin/opt --thinlto-bc -o /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/tools/lld/test/ELF/lto/Output/devirt_vcall_vis_localize.ll.tmp2.o /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm-project/lld/test/ELF/lto/Inputs/devirt_vcall_vis_shared_def.ll
RUN: at line 7: echo '{ global: _start; local: *; };' > /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/tools/lld/test/ELF/lto/Output/devirt_vcall_vis_localize.ll.tmp.ver
+ echo '{ global: _start; local: *; };'
RUN: at line 9: /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld.lld /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/tools/lld/test/ELF/lto/Output/devirt_vcall_vis_localize.ll.tmp1.o /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/tools/lld/test/ELF/lto/Output/devirt_vcall_vis_localize.ll.tmp2.o -o /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/tools/lld/test/ELF/lto/Output/devirt_vcall_vis_localize.ll.tmp.out --save-temps --lto-whole-program-visibility -shared    -mllvm -pass-remarks=. 2>&1 | /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/bin/count 0
+ /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld.lld /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/tools/lld/test/ELF/lto/Output/devirt_vcall_vis_localize.ll.tmp1.o /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/tools/lld/test/ELF/lto/Output/devirt_vcall_vis_localize.ll.tmp2.o -o /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/tools/lld/test/ELF/lto/Output/devirt_vcall_vis_localize.ll.tmp.out --save-temps --lto-whole-program-visibility -shared -mllvm -pass-remarks=.
+ /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/bin/count 0
Expected 0 lines, got 1.

--

********************
Testing:  0.. 10.. 20.. 30.. 40.. 50.. 60.. 70.. 80.. 90.. 
Slowest Tests:
--------------------------------------------------------------------------
231.75s: LLVM :: CodeGen/AMDGPU/sched-group-barrier-pipeline-solver.mir
114.64s: Clang :: Driver/fsanitize.c
101.56s: Clang :: Preprocessor/riscv-target-features.c
90.94s: Clang :: Analysis/a_flaky_crash.cpp
87.47s: Clang :: OpenMP/target_defaultmap_codegen_01.cpp
86.36s: Clang :: Driver/arm-cortex-cpus-2.c
86.27s: Clang :: OpenMP/target_update_codegen.cpp
85.26s: Clang :: Driver/arm-cortex-cpus-1.c
82.25s: Clang :: Analysis/runtime-regression.c
72.73s: Clang :: Preprocessor/arm-target-features.c
72.71s: Clang :: Preprocessor/aarch64-target-features.c
64.45s: Clang :: Preprocessor/predefined-arch-macros.c
62.67s: Clang :: CodeGen/AArch64/sve-intrinsics/acle_sve_reinterpret.c
61.50s: LLVM :: CodeGen/AMDGPU/memintrinsic-unroll.ll
58.02s: LLVM :: CodeGen/RISCV/attributes.ll
57.08s: Clang :: Driver/linux-ld.c
Step 10 (stage2/asan check) failure: stage2/asan check (failure)
...
llvm-lit: /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm-project/llvm/utils/lit/lit/llvm/config.py:506: note: using lld-link: /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/bin/lld-link
llvm-lit: /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm-project/llvm/utils/lit/lit/llvm/config.py:506: note: using ld64.lld: /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld64.lld
llvm-lit: /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm-project/llvm/utils/lit/lit/llvm/config.py:506: note: using wasm-ld: /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/bin/wasm-ld
llvm-lit: /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm-project/llvm/utils/lit/lit/llvm/config.py:506: note: using ld.lld: /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld.lld
llvm-lit: /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm-project/llvm/utils/lit/lit/llvm/config.py:506: note: using lld-link: /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/bin/lld-link
llvm-lit: /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm-project/llvm/utils/lit/lit/llvm/config.py:506: note: using ld64.lld: /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld64.lld
llvm-lit: /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm-project/llvm/utils/lit/lit/llvm/config.py:506: note: using wasm-ld: /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/bin/wasm-ld
llvm-lit: /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm-project/llvm/utils/lit/lit/main.py:72: note: The test suite configuration requested an individual test timeout of 0 seconds but a timeout of 900 seconds was requested on the command line. Forcing timeout to be 900 seconds.
-- Testing: 87236 of 87237 tests, 88 workers --
Testing:  0.. 10.. 20.. 30.. 40.. 50.. 60.. 70.. 80.. 90..
FAIL: lld :: ELF/lto/devirt_vcall_vis_localize.ll (85847 of 87236)
******************** TEST 'lld :: ELF/lto/devirt_vcall_vis_localize.ll' FAILED ********************
Exit Code: 1

Command Output (stderr):
--
RUN: at line 5: /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/bin/opt --thinlto-bc -o /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/tools/lld/test/ELF/lto/Output/devirt_vcall_vis_localize.ll.tmp1.o /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm-project/lld/test/ELF/lto/devirt_vcall_vis_localize.ll
+ /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/bin/opt --thinlto-bc -o /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/tools/lld/test/ELF/lto/Output/devirt_vcall_vis_localize.ll.tmp1.o /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm-project/lld/test/ELF/lto/devirt_vcall_vis_localize.ll
RUN: at line 6: /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/bin/opt --thinlto-bc -o /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/tools/lld/test/ELF/lto/Output/devirt_vcall_vis_localize.ll.tmp2.o /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm-project/lld/test/ELF/lto/Inputs/devirt_vcall_vis_shared_def.ll
+ /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/bin/opt --thinlto-bc -o /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/tools/lld/test/ELF/lto/Output/devirt_vcall_vis_localize.ll.tmp2.o /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm-project/lld/test/ELF/lto/Inputs/devirt_vcall_vis_shared_def.ll
RUN: at line 7: echo '{ global: _start; local: *; };' > /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/tools/lld/test/ELF/lto/Output/devirt_vcall_vis_localize.ll.tmp.ver
+ echo '{ global: _start; local: *; };'
RUN: at line 9: /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld.lld /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/tools/lld/test/ELF/lto/Output/devirt_vcall_vis_localize.ll.tmp1.o /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/tools/lld/test/ELF/lto/Output/devirt_vcall_vis_localize.ll.tmp2.o -o /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/tools/lld/test/ELF/lto/Output/devirt_vcall_vis_localize.ll.tmp.out --save-temps --lto-whole-program-visibility -shared    -mllvm -pass-remarks=. 2>&1 | /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/bin/count 0
+ /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld.lld /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/tools/lld/test/ELF/lto/Output/devirt_vcall_vis_localize.ll.tmp1.o /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/tools/lld/test/ELF/lto/Output/devirt_vcall_vis_localize.ll.tmp2.o -o /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/tools/lld/test/ELF/lto/Output/devirt_vcall_vis_localize.ll.tmp.out --save-temps --lto-whole-program-visibility -shared -mllvm -pass-remarks=.
+ /home/b/sanitizer-x86_64-linux-bootstrap-asan/build/llvm_build_asan/bin/count 0
Expected 0 lines, got 1.

--

********************
Testing:  0.. 10.. 20.. 30.. 40.. 50.. 60.. 70.. 80.. 90.. 
Slowest Tests:
--------------------------------------------------------------------------
231.75s: LLVM :: CodeGen/AMDGPU/sched-group-barrier-pipeline-solver.mir
114.64s: Clang :: Driver/fsanitize.c
101.56s: Clang :: Preprocessor/riscv-target-features.c
90.94s: Clang :: Analysis/a_flaky_crash.cpp
87.47s: Clang :: OpenMP/target_defaultmap_codegen_01.cpp
86.36s: Clang :: Driver/arm-cortex-cpus-2.c
86.27s: Clang :: OpenMP/target_update_codegen.cpp
85.26s: Clang :: Driver/arm-cortex-cpus-1.c
82.25s: Clang :: Analysis/runtime-regression.c
72.73s: Clang :: Preprocessor/arm-target-features.c
72.71s: Clang :: Preprocessor/aarch64-target-features.c
64.45s: Clang :: Preprocessor/predefined-arch-macros.c
62.67s: Clang :: CodeGen/AArch64/sve-intrinsics/acle_sve_reinterpret.c
61.50s: LLVM :: CodeGen/AMDGPU/memintrinsic-unroll.ll
58.02s: LLVM :: CodeGen/RISCV/attributes.ll
57.08s: Clang :: Driver/linux-ld.c

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants