[GlobalISel] Combine G_MERGE_VALUES of x and zero #116283
Conversation
into zext x

LegalizerHelper has two padding strategies: undef or zero; see LegalizerHelper:273 and LegalizerHelper:315. This PR is about zero sugar and Coke Zero.

; CHECK-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES %a(s32), [[C]](s32)

Please continue padding merge values.

// %bits_8_15:_(s8) = G_CONSTANT i8 0
// %0:_(s16) = G_MERGE_VALUES %bits_0_7:(s8), %bits_8_15:(s8)

%bits_8_15 is defined by zero. For the optimization, we pick zext:

// %0:_(s16) = G_ZEXT %bits_0_7:(s8)

The upper bits of %0 are zero, and the lower bits come from %bits_0_7.
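As a sanity check of that bit-level claim, here is a minimal standalone C++ sketch (illustrative only, not part of the patch; lo and hi mirror %bits_0_7 and %bits_8_15):

#include <cassert>
#include <cstdint>

// Exhaustive check: merging an s8 value into the low byte of an s16 whose
// high byte is the constant 0 gives exactly the zero-extension of that s8.
int main() {
  for (int v = 0; v <= 0xFF; ++v) {
    std::uint8_t lo = static_cast<std::uint8_t>(v); // %bits_0_7
    std::uint8_t hi = 0;                            // %bits_8_15 = G_CONSTANT i8 0
    std::uint16_t merged =
        static_cast<std::uint16_t>(lo) |
        (static_cast<std::uint16_t>(hi) << 8);      // G_MERGE_VALUES semantics
    std::uint16_t zext = static_cast<std::uint16_t>(lo); // G_ZEXT semantics
    assert(merged == zext);
  }
  return 0;
}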
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-backend-aarch64

Author: Thorsten Schütt (tschuett)

Patch is 92.21 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/116283.diff

12 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 49e9d6bd73a4cc..566a5e412662b6 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -932,6 +932,9 @@ class CombinerHelper {
// merge_values(_, undef) -> anyext
bool matchMergeXAndUndef(const MachineInstr &MI, BuildFnTy &MatchInfo);
+ // merge_values(_, zero) -> zext
+ bool matchMergeXAndZero(const MachineInstr &MI, BuildFnTy &MatchInfo);
+
private:
/// Checks for legality of an indexed variant of \p LdSt.
bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 95f3d637da8548..87f043979262af 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -865,6 +865,14 @@ def merge_of_x_and_undef : GICombineRule <
[{ return Helper.matchMergeXAndUndef(*${MI}, ${matchinfo}); }]),
(apply [{ Helper.applyBuildFn(*${MI}, ${matchinfo}); }])>;
+/// Transform merge_x_zero -> zext.
+def merge_of_x_and_zero : GICombineRule <
+ (defs root:$root, build_fn_matchinfo:$matchinfo),
+ (match (G_CONSTANT $zero, 0),
+ (G_MERGE_VALUES $root, $x, $zero):$MI,
+ [{ return Helper.matchMergeXAndZero(*${MI}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFn(*${MI}, ${matchinfo}); }])>;
+
def merge_combines: GICombineGroup<[
unmerge_anyext_build_vector,
unmerge_merge,
@@ -873,7 +881,8 @@ def merge_combines: GICombineGroup<[
unmerge_undef,
unmerge_dead_to_trunc,
unmerge_zext_to_zext,
- merge_of_x_and_undef
+ merge_of_x_and_undef,
+ merge_of_x_and_zero
]>;
// Under certain conditions, transform:
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
index 8f4095f01be7a3..797a1e84e21e35 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
@@ -56,3 +56,31 @@ bool CombinerHelper::matchMergeXAndUndef(const MachineInstr &MI,
};
return true;
}
+
+bool CombinerHelper::matchMergeXAndZero(const MachineInstr &MI,
+ BuildFnTy &MatchInfo) {
+ const GMerge *Merge = cast<GMerge>(&MI);
+
+ Register Dst = Merge->getReg(0);
+ LLT DstTy = MRI.getType(Dst);
+ LLT SrcTy = MRI.getType(Merge->getSourceReg(0));
+
+ // No multi-use check. It is a constant.
+
+ //
+ // %bits_8_15:_(s8) = G_CONSTANT i8 0
+ // %0:_(s16) = G_MERGE_VALUES %bits_0_7:(s8), %bits_8_15:(s8)
+ //
+ // ->
+ //
+ // %0:_(s16) = G_ZEXT %bits_0_7:(s8)
+ //
+
+ if (!isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {DstTy, SrcTy}}))
+ return false;
+
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildZExt(Dst, Merge->getSourceReg(0));
+ };
+ return true;
+}
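The hunk above follows the usual CombinerHelper split: the match function only records a deferred build function, and the combiner runs it once it commits the rewrite. A minimal plain-C++ sketch of that pattern (no LLVM types; matchMergeOfXAndZero and BuildFn here are stand-ins, not the real signatures):

#include <cstdio>
#include <functional>

using BuildFn = std::function<void()>;

// Match step: reject unless the high part is the constant 0; otherwise
// record what to build later instead of mutating anything now.
static bool matchMergeOfXAndZero(int highConst, BuildFn &matchInfo) {
  if (highConst != 0)
    return false;
  matchInfo = [] { std::puts("replace G_MERGE_VALUES with G_ZEXT"); };
  return true;
}

int main() {
  BuildFn build;
  if (matchMergeOfXAndZero(0, build))
    build(); // apply step, analogous to applyBuildFn
  return 0;
}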
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
index d6a79780b8bb16..f427f8648a301e 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
@@ -567,3 +567,49 @@ body: |
$q0 = COPY %me(s128)
$x0 = COPY %def(s64)
...
+# Check that we zext the merge
+---
+name: test_merge_zero
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: test_merge_zero
+ ; CHECK: %opaque:_(s64) = COPY $x0
+ ; CHECK-NEXT: %me:_(s128) = G_ZEXT %opaque(s64)
+ ; CHECK-NEXT: $q0 = COPY %me(s128)
+ %opaque:_(s64) = COPY $x0
+ %def:_(s64) = G_CONSTANT i64 0
+ %me:_(s128) = G_MERGE_VALUES %opaque(s64), %def
+ $q0 = COPY %me(s128)
+...
+# Check that we still zext the merge, multi-use
+---
+name: test_merge_zero_multi_use
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: test_merge_zero_multi_use
+ ; CHECK: %opaque:_(s64) = COPY $x0
+ ; CHECK-NEXT: %def:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: %me:_(s128) = G_ZEXT %opaque(s64)
+ ; CHECK-NEXT: $q0 = COPY %me(s128)
+ ; CHECK-NEXT: $x0 = COPY %def(s64)
+ %opaque:_(s64) = COPY $x0
+ %def:_(s64) = G_CONSTANT i64 0
+ %me:_(s128) = G_MERGE_VALUES %opaque(s64), %def
+ $q0 = COPY %me(s128)
+ $x0 = COPY %def(s64)
+...
+# Check that we don't zext the merge with one
+---
+name: test_merge_one
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: test_merge_one
+ ; CHECK: %opaque:_(s64) = COPY $x0
+ ; CHECK-NEXT: %def:_(s64) = G_CONSTANT i64 1
+ ; CHECK-NEXT: %me:_(s128) = G_MERGE_VALUES %opaque(s64), %def(s64)
+ ; CHECK-NEXT: $q0 = COPY %me(s128)
+ %opaque:_(s64) = COPY $x0
+ %def:_(s64) = G_CONSTANT i64 1
+ %me:_(s128) = G_MERGE_VALUES %opaque(s64), %def
+ $q0 = COPY %me(s128)
+...
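The test_merge_one case above is the negative test: a nonzero high constant must not fold. A tiny standalone check (plain C++, illustrative only) of why:

#include <cstdint>
#include <cstdio>

// With a high byte of 1 (as in test_merge_one), the merged value has upper
// bits set, so it is not the zero-extension of the low byte.
int main() {
  std::uint8_t lo = 0x7f;
  std::uint16_t merged =
      static_cast<std::uint16_t>(lo) | (std::uint16_t{1} << 8); // 0x017f
  std::uint16_t zext = lo;                                      // 0x007f
  std::printf("merged=0x%04x zext=0x%04x\n", merged, zext);
  return 0;
}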
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
index bdfafa89cd0477..28ed88f4cf8fb8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-atomic-optimizer %s | FileCheck -check-prefix=IR %s
; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
@@ -16,7 +16,7 @@ declare void @llvm.amdgcn.struct.ptr.buffer.store.format.v4i32(<4 x i32>, ptr ad
define amdgpu_cs void @atomic_add(<4 x i32> inreg %arg) {
; IR-LABEL: define amdgpu_cs void @atomic_add(
; IR-SAME: <4 x i32> inreg [[ARG:%.*]]) {
-; IR-NEXT: .entry:
+; IR-NEXT: [[_ENTRY:.*:]]
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
@@ -26,19 +26,18 @@ define amdgpu_cs void @atomic_add(<4 x i32> inreg %arg) {
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 0
-; IR-NEXT: br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]]
-; IR: 9:
+; IR-NEXT: br i1 [[TMP8]], label %[[BB9:.*]], label %[[BB11:.*]]
+; IR: [[BB9]]:
; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 [[TMP7]], <4 x i32> [[ARG]], i32 0, i32 0, i32 0, i32 0)
-; IR-NEXT: br label [[TMP11]]
-; IR: 11:
+; IR-NEXT: br label %[[BB11]]
+; IR: [[BB11]]:
; IR-NEXT: ret void
;
; GCN-LABEL: atomic_add:
; GCN: ; %bb.0: ; %.entry
; GCN-NEXT: s_mov_b64 s[4:5], exec
-; GCN-NEXT: s_mov_b32 s6, s5
; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
-; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s6, v0
+; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GCN-NEXT: s_cbranch_execz .LBB0_2
@@ -57,7 +56,7 @@ define amdgpu_cs void @atomic_add(<4 x i32> inreg %arg) {
define amdgpu_cs void @atomic_add_and_format(<4 x i32> inreg %arg) {
; IR-LABEL: define amdgpu_cs void @atomic_add_and_format(
; IR-SAME: <4 x i32> inreg [[ARG:%.*]]) {
-; IR-NEXT: .entry:
+; IR-NEXT: [[_ENTRY:.*:]]
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
@@ -67,12 +66,12 @@ define amdgpu_cs void @atomic_add_and_format(<4 x i32> inreg %arg) {
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 0
-; IR-NEXT: br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]]
-; IR: 9:
+; IR-NEXT: br i1 [[TMP8]], label %[[TMP9:.*]], label %[[BB11:.*]]
+; IR: [[TMP9]]:
; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 [[TMP7]], <4 x i32> [[ARG]], i32 0, i32 0, i32 0, i32 0)
-; IR-NEXT: br label [[TMP11]]
-; IR: 11:
-; IR-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ]
+; IR-NEXT: br label %[[BB11]]
+; IR: [[BB11]]:
+; IR-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], %[[TMP9]] ]
; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]])
; IR-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP5]]
; IR-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> [[ARG]], <4 x i32> [[ARG]], i32 [[TMP14]], i32 0, i32 0, i32 0)
@@ -81,9 +80,8 @@ define amdgpu_cs void @atomic_add_and_format(<4 x i32> inreg %arg) {
; GCN-LABEL: atomic_add_and_format:
; GCN: ; %bb.0: ; %.entry
; GCN-NEXT: s_mov_b64 s[6:7], exec
-; GCN-NEXT: s_mov_b32 s4, s7
; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
-; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s4, v0
+; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: ; implicit-def: $vgpr1
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -114,7 +112,7 @@ define amdgpu_cs void @atomic_add_and_format(<4 x i32> inreg %arg) {
define amdgpu_cs void @atomic_sub(<4 x i32> inreg %arg) {
; IR-LABEL: define amdgpu_cs void @atomic_sub(
; IR-SAME: <4 x i32> inreg [[ARG:%.*]]) {
-; IR-NEXT: .entry:
+; IR-NEXT: [[_ENTRY:.*:]]
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
@@ -124,19 +122,18 @@ define amdgpu_cs void @atomic_sub(<4 x i32> inreg %arg) {
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 0
-; IR-NEXT: br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]]
-; IR: 9:
+; IR-NEXT: br i1 [[TMP8]], label %[[BB9:.*]], label %[[BB11:.*]]
+; IR: [[BB9]]:
; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32 [[TMP7]], <4 x i32> [[ARG]], i32 0, i32 0, i32 0, i32 0)
-; IR-NEXT: br label [[TMP11]]
-; IR: 11:
+; IR-NEXT: br label %[[BB11]]
+; IR: [[BB11]]:
; IR-NEXT: ret void
;
; GCN-LABEL: atomic_sub:
; GCN: ; %bb.0: ; %.entry
; GCN-NEXT: s_mov_b64 s[4:5], exec
-; GCN-NEXT: s_mov_b32 s6, s5
; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
-; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s6, v0
+; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GCN-NEXT: s_cbranch_execz .LBB2_2
@@ -155,7 +152,7 @@ define amdgpu_cs void @atomic_sub(<4 x i32> inreg %arg) {
define amdgpu_cs void @atomic_sub_and_format(<4 x i32> inreg %arg) {
; IR-LABEL: define amdgpu_cs void @atomic_sub_and_format(
; IR-SAME: <4 x i32> inreg [[ARG:%.*]]) {
-; IR-NEXT: .entry:
+; IR-NEXT: [[_ENTRY:.*:]]
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
@@ -165,12 +162,12 @@ define amdgpu_cs void @atomic_sub_and_format(<4 x i32> inreg %arg) {
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 0
-; IR-NEXT: br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]]
-; IR: 9:
+; IR-NEXT: br i1 [[TMP8]], label %[[TMP9:.*]], label %[[BB11:.*]]
+; IR: [[TMP9]]:
; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32 [[TMP7]], <4 x i32> [[ARG]], i32 0, i32 0, i32 0, i32 0)
-; IR-NEXT: br label [[TMP11]]
-; IR: 11:
-; IR-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ]
+; IR-NEXT: br label %[[BB11]]
+; IR: [[BB11]]:
+; IR-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], %[[TMP9]] ]
; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]])
; IR-NEXT: [[TMP14:%.*]] = sub i32 [[TMP13]], [[TMP5]]
; IR-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> [[ARG]], <4 x i32> [[ARG]], i32 [[TMP14]], i32 0, i32 0, i32 0)
@@ -179,9 +176,8 @@ define amdgpu_cs void @atomic_sub_and_format(<4 x i32> inreg %arg) {
; GCN-LABEL: atomic_sub_and_format:
; GCN: ; %bb.0: ; %.entry
; GCN-NEXT: s_mov_b64 s[6:7], exec
-; GCN-NEXT: s_mov_b32 s4, s7
; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
-; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s4, v0
+; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: ; implicit-def: $vgpr1
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -212,7 +208,7 @@ define amdgpu_cs void @atomic_sub_and_format(<4 x i32> inreg %arg) {
define amdgpu_cs void @atomic_xor(<4 x i32> inreg %arg) {
; IR-LABEL: define amdgpu_cs void @atomic_xor(
; IR-SAME: <4 x i32> inreg [[ARG:%.*]]) {
-; IR-NEXT: .entry:
+; IR-NEXT: [[_ENTRY:.*:]]
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
@@ -223,19 +219,18 @@ define amdgpu_cs void @atomic_xor(<4 x i32> inreg %arg) {
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = and i32 [[TMP7]], 1
; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
-; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
-; IR: 10:
+; IR-NEXT: br i1 [[TMP9]], label %[[BB10:.*]], label %[[BB12:.*]]
+; IR: [[BB10]]:
; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32 [[TMP8]], <4 x i32> [[ARG]], i32 0, i32 0, i32 0, i32 0)
-; IR-NEXT: br label [[TMP12]]
-; IR: 12:
+; IR-NEXT: br label %[[BB12]]
+; IR: [[BB12]]:
; IR-NEXT: ret void
;
; GCN-LABEL: atomic_xor:
; GCN: ; %bb.0: ; %.entry
; GCN-NEXT: s_mov_b64 s[4:5], exec
-; GCN-NEXT: s_mov_b32 s6, s5
; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
-; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s6, v0
+; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GCN-NEXT: s_cbranch_execz .LBB4_2
@@ -255,7 +250,7 @@ define amdgpu_cs void @atomic_xor(<4 x i32> inreg %arg) {
define amdgpu_cs void @atomic_xor_and_format(<4 x i32> inreg %arg) {
; IR-LABEL: define amdgpu_cs void @atomic_xor_and_format(
; IR-SAME: <4 x i32> inreg [[ARG:%.*]]) {
-; IR-NEXT: .entry:
+; IR-NEXT: [[_ENTRY:.*:]]
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
@@ -266,12 +261,12 @@ define amdgpu_cs void @atomic_xor_and_format(<4 x i32> inreg %arg) {
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = and i32 [[TMP7]], 1
; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
-; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
-; IR: 10:
+; IR-NEXT: br i1 [[TMP9]], label %[[TMP10:.*]], label %[[BB12:.*]]
+; IR: [[TMP10]]:
; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32 [[TMP8]], <4 x i32> [[ARG]], i32 0, i32 0, i32 0, i32 0)
-; IR-NEXT: br label [[TMP12]]
-; IR: 12:
-; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
+; IR-NEXT: br label %[[BB12]]
+; IR: [[BB12]]:
+; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP11]], %[[TMP10]] ]
; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT: [[TMP15:%.*]] = and i32 [[TMP5]], 1
; IR-NEXT: [[TMP16:%.*]] = xor i32 [[TMP14]], [[TMP15]]
@@ -281,9 +276,8 @@ define amdgpu_cs void @atomic_xor_and_format(<4 x i32> inreg %arg) {
; GCN-LABEL: atomic_xor_and_format:
; GCN: ; %bb.0: ; %.entry
; GCN-NEXT: s_mov_b64 s[6:7], exec
-; GCN-NEXT: s_mov_b32 s4, s7
; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
-; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s4, v0
+; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: ; implicit-def: $vgpr1
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -316,7 +310,7 @@ define amdgpu_cs void @atomic_xor_and_format(<4 x i32> inreg %arg) {
define amdgpu_cs void @atomic_ptr_add(ptr addrspace(8) inreg %arg) {
; IR-LABEL: define amdgpu_cs void @atomic_ptr_add(
; IR-SAME: ptr addrspace(8) inreg [[ARG:%.*]]) {
-; IR-NEXT: .entry:
+; IR-NEXT: [[_ENTRY:.*:]]
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
@@ -326,19 +320,18 @@ define amdgpu_cs void @atomic_ptr_add(ptr addrspace(8) inreg %arg) {
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 0
-; IR-NEXT: br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]]
-; IR: 9:
+; IR-NEXT: br i1 [[TMP8]], label %[[BB9:.*]], label %[[BB11:.*]]
+; IR: [[BB9]]:
; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add.i32(i32 [[TMP7]], ptr addrspace(8) [[ARG]], i32 0, i32 0, i32 0, i32 0)
-; IR-NEXT: br label [[TMP11]]
-; IR: 11:
+; IR-NEXT: br label %[[BB11]]
+; IR: [[BB11]]:
; IR-NEXT: ret void
;
; GCN-LABEL: atomic_ptr_add:
; GCN: ; %bb.0: ; %.entry
; GCN-NEXT: s_mov_b64 s[4:5], exec
-; GCN-NEXT: s_mov_b32 s6, s5
; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
-; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s6, v0
+; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GCN-NEXT: s_cbranch_execz .LBB6_2
@@ -357,7 +350,7 @@ define amdgpu_cs void @atomic_ptr_add(ptr addrspace(8) inreg %arg) {
define amdgpu_cs void @atomic_ptr_add_and_format(ptr addrspace(8) inreg %arg) {
; IR-LABEL: define amdgpu_cs void @atomic_ptr_add_and_format(
; IR-SAME: ptr addrspace(8) inreg [[ARG:%.*]]) {
-; IR-NEXT: .entry:
+; IR-NEXT: [[_ENTRY:.*:]]
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
@@ -367,12 +360,12 @@ define amdgpu_cs void @atomic_ptr_add_and_format(ptr addrspace(8) inreg %arg) {
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 0
-; IR-NEXT: br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]]
-; IR: 9:
+; IR-NEXT: br i1 [[TMP8]], label %[[TMP9:.*]], label %[[BB11:.*]]
+; IR: [[TMP9]]:
; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add.i32(i32 [[TMP7]], ptr addrspace(8) [[ARG]], i32 0, i32 0, i32 0, i32 0)
-; IR-NEXT: br label [[TMP11]]
-; IR: 11:
-; IR-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ]
+; IR-NEXT: br label %[[BB11]]
+; IR: [[BB11]]:
+; IR-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], %[[TMP9]] ]
; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]])
; IR-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP5]]
; IR-NEXT: [[ARG_INT:%.*]] = ptrtoint ptr addrspace(8) [[ARG]] to i128
@@ -383,9 +376,8 @@ define amdgpu_cs void @atomic_ptr_add_and_format(ptr addrspace(8) inreg %arg) {
; GCN-LABEL: atomic_ptr_add_and_format:
; GCN: ; %bb.0: ; %.entry
; GCN-NEXT: s_mov_b64 s[6:7], exec
-; GCN-NEXT: s_mov_b32 s4, s7
; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
-; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s4, v0
+; GCN-NEXT: v_mbcn...
[truncated]
@llvm/pr-subscribers-llvm-globalisel
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/52/builds/3759. Here is the relevant piece of the build log for reference.