From 1565efb81c99476b88af6079831447f9af85b6af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= Date: Thu, 24 Oct 2024 10:31:28 +0200 Subject: [PATCH 1/6] [GlobalISel] Combine G_MERGE_VALUES of x and undef into zext x ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s32), [[DEF]](s32) Please continue padding merge values. // %bits_8_15:_(s8) = G_IMPLICIT_DEF // %0:_(s16) = G_MERGE_VALUES %bits_0_7:(s8), %bits_8_15:(s8) %bits_8_15 is defined by undef. Its value is undefined and we can pick an arbitrary value. For optimization, we pick zero. // %0:_(s16) = G_ZEXT %bits_0_7:(s8) The upper bits of %0 are zero and the lower bits come from %bits_0_7. --- .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 3 + .../include/llvm/Target/GlobalISel/Combine.td | 11 ++- llvm/lib/CodeGen/GlobalISel/CMakeLists.txt | 1 + .../GlobalISel/CombinerHelperArtifacts.cpp | 57 ++++++++++++ .../AArch64/GlobalISel/combine-unmerge.mir | 50 +++++++++-- llvm/test/CodeGen/AArch64/bswap.ll | 27 ++---- llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll | 24 +++--- llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll | 86 ++++++++++--------- .../CodeGen/AMDGPU/GlobalISel/sext_inreg.ll | 36 ++++---- llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll | 40 +++++---- llvm/test/CodeGen/AMDGPU/fptoi.i128.ll | 4 +- 11 files changed, 222 insertions(+), 117 deletions(-) create mode 100644 llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index b09981eaef506..ff97e5f15bb1b 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -925,6 +925,9 @@ class CombinerHelper { bool matchUnmergeValuesAnyExtBuildVector(const MachineInstr &MI, BuildFnTy &MatchInfo); + // merge_values(_, undef) -> zext + bool matchMergeXAndUndef(const MachineInstr &MI, BuildFnTy &MatchInfo); + private: /// Checks for legality of an 
indexed variant of \p LdSt. bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const; diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 80a22c35ebcef..2651a6927424b 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -856,6 +856,14 @@ def unmerge_zext_to_zext : GICombineRule< (apply [{ Helper.applyCombineUnmergeZExtToZExt(*${d}); }]) >; +/// Transform merge_x_undef -> zext. +def merge_of_x_and_undef : GICombineRule < + (defs root:$root, build_fn_matchinfo:$matchinfo), + (match (G_IMPLICIT_DEF $undef), + (G_MERGE_VALUES $root, $x, $undef):$MI, + [{ return Helper.matchMergeXAndUndef(*${MI}, ${matchinfo}); }]), + (apply [{ Helper.applyBuildFn(*${MI}, ${matchinfo}); }])>; + def merge_combines: GICombineGroup<[ unmerge_anyext_build_vector, unmerge_merge, @@ -863,7 +871,8 @@ def merge_combines: GICombineGroup<[ unmerge_cst, unmerge_undef, unmerge_dead_to_trunc, - unmerge_zext_to_zext + unmerge_zext_to_zext, + merge_of_x_and_undef ]>; // Under certain conditions, transform: diff --git a/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt b/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt index af1717dbf76f3..a45024d120be6 100644 --- a/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt +++ b/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt @@ -6,6 +6,7 @@ add_llvm_component_library(LLVMGlobalISel GlobalISel.cpp Combiner.cpp CombinerHelper.cpp + CombinerHelperArtifacts.cpp CombinerHelperCasts.cpp CombinerHelperCompares.cpp CombinerHelperVectorOps.cpp diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp new file mode 100644 index 0000000000000..29875b04c3798 --- /dev/null +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp @@ -0,0 +1,57 @@ +//===- CombinerHelperArtifacts.cpp-----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements CombinerHelper for legalization artifacts. +// +//===----------------------------------------------------------------------===// +// +// G_MERGE_VALUES +// +//===----------------------------------------------------------------------===// +#include "llvm/CodeGen/GlobalISel/CombinerHelper.h" +#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" +#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/LowLevelTypeUtils.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetOpcodes.h" +#include "llvm/Support/Casting.h" + +#define DEBUG_TYPE "gi-combiner" + +using namespace llvm; + +bool CombinerHelper::matchMergeXAndUndef(const MachineInstr &MI, + BuildFnTy &MatchInfo) { + const GMerge *Merge = cast(&MI); + + Register Dst = Merge->getReg(0); + Register Undef = Merge->getSourceReg(1); + LLT DstTy = MRI.getType(Dst); + LLT SrcTy = MRI.getType(Merge->getSourceReg(0)); + + // + // %bits_8_15:_(s8) = G_IMPLICIT_DEF + // %0:_(s16) = G_MERGE_VALUES %bits_0_7:(s8), %bits_8_15:(s8) + // + // -> + // + // %0:_(s16) = G_ZEXT %bits_0_7:(s8) + // + + if (!MRI.hasOneNonDBGUse(Undef) || + !isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {DstTy, SrcTy}})) + return false; + + MatchInfo = [=](MachineIRBuilder &B) { + B.buildZExt(Dst, Merge->getSourceReg(0)); + }; + return true; +} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir index 7566d38e6c6cf..67cbdd19a0568 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir +++ 
b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir @@ -10,9 +10,9 @@ body: | bb.1: ; CHECK-LABEL: name: test_combine_unmerge_merge ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: $w0 = COPY [[DEF]](s32) - ; CHECK-NEXT: $w1 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $w1 = COPY [[C]](s32) %0:_(s32) = G_IMPLICIT_DEF %1:_(s32) = G_IMPLICIT_DEF %2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32) @@ -115,9 +115,11 @@ body: | bb.1: ; CHECK-LABEL: name: test_combine_unmerge_bitcast_merge ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: $w0 = COPY [[DEF]](s32) - ; CHECK-NEXT: $w1 = COPY [[DEF1]](s32) + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[DEF]](s32) + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[ZEXT]](s64) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>) + ; CHECK-NEXT: $w0 = COPY [[UV]](s32) + ; CHECK-NEXT: $w1 = COPY [[UV1]](s32) %0:_(s32) = G_IMPLICIT_DEF %1:_(s32) = G_IMPLICIT_DEF %2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32) @@ -136,9 +138,8 @@ body: | bb.1: ; CHECK-LABEL: name: test_combine_unmerge_merge_incompatible_types ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[DEF]](s32), [[DEF1]](s32) - ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[MV]](s64) + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[DEF]](s32) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[ZEXT]](s64) ; CHECK-NEXT: $h0 = COPY [[UV]](s16) ; CHECK-NEXT: $h1 = COPY [[UV1]](s16) ; CHECK-NEXT: $h2 = COPY [[UV2]](s16) @@ -539,3 +540,36 @@ body: | $q0 = 
COPY %un1(s128) $q1 = COPY %un2(s128) ... + +# Check that we zext the merge +--- +name: test_merge_undef +body: | + bb.1: + ; CHECK-LABEL: name: test_merge_undef + ; CHECK: %opaque:_(s64) = COPY $x0 + ; CHECK-NEXT: %me:_(s128) = G_ZEXT %opaque(s64) + ; CHECK-NEXT: $q0 = COPY %me(s128) + %opaque:_(s64) = COPY $x0 + %def:_(s64) = G_IMPLICIT_DEF + %me:_(s128) = G_MERGE_VALUES %opaque(s64), %def + $q0 = COPY %me(s128) +... + +# Check that we don't zext the merge, multi-use +--- +name: test_merge_undef_multi_use +body: | + bb.1: + ; CHECK-LABEL: name: test_merge_undef_multi_use + ; CHECK: %opaque:_(s64) = COPY $x0 + ; CHECK-NEXT: %def:_(s64) = G_IMPLICIT_DEF + ; CHECK-NEXT: %me:_(s128) = G_MERGE_VALUES %opaque(s64), %def(s64) + ; CHECK-NEXT: $q0 = COPY %me(s128) + ; CHECK-NEXT: $x0 = COPY %def(s64) + %opaque:_(s64) = COPY $x0 + %def:_(s64) = G_IMPLICIT_DEF + %me:_(s128) = G_MERGE_VALUES %opaque(s64), %def + $q0 = COPY %me(s128) + $x0 = COPY %def(s64) +... diff --git a/llvm/test/CodeGen/AArch64/bswap.ll b/llvm/test/CodeGen/AArch64/bswap.ll index 74e4a167ae14c..afc1d932840ff 100644 --- a/llvm/test/CodeGen/AArch64/bswap.ll +++ b/llvm/test/CodeGen/AArch64/bswap.ll @@ -45,25 +45,14 @@ define i64 @bswap_i16_to_i64_anyext(i16 %a) { ; The zext here is optimised to an any_extend during isel.. 
define i128 @bswap_i16_to_i128_anyext(i16 %a) { -; CHECK-SD-LABEL: bswap_i16_to_i128_anyext: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, w0 -; CHECK-SD-NEXT: mov x0, xzr -; CHECK-SD-NEXT: rev w8, w8 -; CHECK-SD-NEXT: lsr w8, w8, #16 -; CHECK-SD-NEXT: lsl x1, x8, #48 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: bswap_i16_to_i128_anyext: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, w0 -; CHECK-GI-NEXT: mov x0, xzr -; CHECK-GI-NEXT: rev w8, w8 -; CHECK-GI-NEXT: lsr w8, w8, #16 -; CHECK-GI-NEXT: bfi x8, x8, #32, #32 -; CHECK-GI-NEXT: and x8, x8, #0xffff -; CHECK-GI-NEXT: lsl x1, x8, #48 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: bswap_i16_to_i128_anyext: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: mov x0, xzr +; CHECK-NEXT: rev w8, w8 +; CHECK-NEXT: lsr w8, w8, #16 +; CHECK-NEXT: lsl x1, x8, #48 +; CHECK-NEXT: ret %3 = call i16 @llvm.bswap.i16(i16 %a) %4 = zext i16 %3 to i128 %5 = shl i128 %4, 112 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll index 493e8cef63890..bda97c980acee 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -1884,22 +1884,22 @@ define amdgpu_ps i65 @s_ashr_i65(i65 inreg %value, i65 inreg %amount) { define amdgpu_ps i65 @s_ashr_i65_33(i65 inreg %value) { ; GCN-LABEL: s_ashr_i65_33: ; GCN: ; %bb.0: -; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 -; GCN-NEXT: s_lshr_b32 s0, s1, 1 -; GCN-NEXT: s_mov_b32 s1, 0 -; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 31 -; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GCN-NEXT: s_ashr_i32 s2, s3, 1 +; GCN-NEXT: s_mov_b32 s3, 0 +; GCN-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000 +; GCN-NEXT: s_lshr_b32 s2, s1, 1 +; GCN-NEXT: s_lshl_b64 s[0:1], s[4:5], 31 +; GCN-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GCN-NEXT: s_ashr_i32 s2, s5, 1 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_ashr_i65_33: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 -; 
GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1 -; GFX10PLUS-NEXT: s_mov_b32 s1, 0 -; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 31 -; GFX10PLUS-NEXT: s_ashr_i32 s2, s3, 1 -; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX10PLUS-NEXT: s_mov_b32 s3, 0 +; GFX10PLUS-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000 +; GFX10PLUS-NEXT: s_lshr_b32 s2, s1, 1 +; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[4:5], 31 +; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX10PLUS-NEXT: s_ashr_i32 s2, s5, 1 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr i65 %value, 33 ret i65 %result diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll index cc185aff9eff2..9f678a5e46fbb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -1574,8 +1574,8 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) { ; GFX6-LABEL: v_lshr_i65: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 1, v2 ; GFX6-NEXT: v_mov_b32_e32 v5, 0 +; GFX6-NEXT: v_and_b32_e32 v4, 1, v2 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xffffffc0, v3 ; GFX6-NEXT: v_lshr_b64 v[6:7], v[0:1], v3 @@ -1596,8 +1596,8 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) { ; GFX8-LABEL: v_lshr_i65: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v4, 1, v2 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: v_and_b32_e32 v4, 1, v2 ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xffffffc0, v3 ; GFX8-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] @@ -1618,8 +1618,8 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) { ; GFX9-LABEL: v_lshr_i65: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v4, 1, v2 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_and_b32_e32 v4, 1, v2 ; GFX9-NEXT: v_sub_u32_e32 v8, 64, v3 ; GFX9-NEXT: v_add_u32_e32 v2, 
0xffffffc0, v3 ; GFX9-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] @@ -1688,8 +1688,8 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 +; GFX6-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 31 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 @@ -1700,8 +1700,8 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v3 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 @@ -1712,8 +1712,8 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v3 ; GFX9-NEXT: v_or_b32_e32 v0, v2, v0 @@ -1749,20 +1749,22 @@ define i65 @v_lshr_i65_33(i65 %value) { define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) { ; GCN-LABEL: s_lshr_i65: ; GCN: ; %bb.0: -; GCN-NEXT: s_and_b64 s[4:5], s[2:3], 1 -; GCN-NEXT: s_sub_i32 s10, s3, 64 -; GCN-NEXT: s_sub_i32 s8, 64, s3 -; GCN-NEXT: s_cmp_lt_u32 s3, 64 +; GCN-NEXT: s_mov_b32 s4, s3 +; GCN-NEXT: s_mov_b32 s3, 0 +; GCN-NEXT: s_and_b64 s[2:3], s[2:3], 1 +; GCN-NEXT: s_sub_i32 s10, s4, 64 +; GCN-NEXT: s_sub_i32 s8, 64, s4 +; GCN-NEXT: s_cmp_lt_u32 s4, 64 ; GCN-NEXT: s_cselect_b32 s11, 1, 0 -; GCN-NEXT: s_cmp_eq_u32 s3, 0 +; GCN-NEXT: s_cmp_eq_u32 s4, 0 ; GCN-NEXT: s_cselect_b32 s12, 1, 0 -; GCN-NEXT: s_lshr_b64 s[6:7], s[4:5], s3 -; GCN-NEXT: s_lshr_b64 s[2:3], s[0:1], s3 
-; GCN-NEXT: s_lshl_b64 s[8:9], s[4:5], s8 -; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GCN-NEXT: s_lshr_b64 s[4:5], s[4:5], s10 +; GCN-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 +; GCN-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 +; GCN-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 +; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GCN-NEXT: s_cmp_lg_u32 s11, 0 -; GCN-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; GCN-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GCN-NEXT: s_cmp_lg_u32 s12, 0 ; GCN-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GCN-NEXT: s_cmp_lg_u32 s11, 0 @@ -1771,24 +1773,26 @@ define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) { ; ; GFX10PLUS-LABEL: s_lshr_i65: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_and_b64 s[4:5], s[2:3], 1 -; GFX10PLUS-NEXT: s_sub_i32 s10, s3, 64 -; GFX10PLUS-NEXT: s_sub_i32 s2, 64, s3 -; GFX10PLUS-NEXT: s_cmp_lt_u32 s3, 64 +; GFX10PLUS-NEXT: s_mov_b32 s4, s3 +; GFX10PLUS-NEXT: s_mov_b32 s3, 0 +; GFX10PLUS-NEXT: s_sub_i32 s10, s4, 64 +; GFX10PLUS-NEXT: s_and_b64 s[2:3], s[2:3], 1 +; GFX10PLUS-NEXT: s_sub_i32 s5, 64, s4 +; GFX10PLUS-NEXT: s_cmp_lt_u32 s4, 64 ; GFX10PLUS-NEXT: s_cselect_b32 s11, 1, 0 -; GFX10PLUS-NEXT: s_cmp_eq_u32 s3, 0 +; GFX10PLUS-NEXT: s_cmp_eq_u32 s4, 0 ; GFX10PLUS-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10PLUS-NEXT: s_lshr_b64 s[6:7], s[0:1], s3 -; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[4:5], s2 -; GFX10PLUS-NEXT: s_lshr_b64 s[2:3], s[4:5], s3 +; GFX10PLUS-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 +; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[2:3], s5 +; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[2:3], s4 ; GFX10PLUS-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[4:5], s10 +; GFX10PLUS-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0 -; GFX10PLUS-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] +; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] ; GFX10PLUS-NEXT: s_cmp_lg_u32 s12, 0 -; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] 
+; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0 -; GFX10PLUS-NEXT: s_cselect_b32 s2, s2, 0 +; GFX10PLUS-NEXT: s_cselect_b32 s2, s4, 0 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i65 %value, %amount ret i65 %result @@ -1797,22 +1801,22 @@ define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) { define amdgpu_ps i65 @s_lshr_i65_33(i65 inreg %value) { ; GCN-LABEL: s_lshr_i65_33: ; GCN: ; %bb.0: -; GCN-NEXT: s_and_b64 s[2:3], s[2:3], 1 -; GCN-NEXT: s_lshr_b32 s0, s1, 1 -; GCN-NEXT: s_mov_b32 s1, 0 -; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 31 -; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GCN-NEXT: s_lshr_b32 s2, s3, 1 +; GCN-NEXT: s_mov_b32 s3, 0 +; GCN-NEXT: s_and_b64 s[4:5], s[2:3], 1 +; GCN-NEXT: s_lshr_b32 s2, s1, 1 +; GCN-NEXT: s_lshl_b64 s[0:1], s[4:5], 31 +; GCN-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GCN-NEXT: s_lshr_b32 s2, s5, 1 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_lshr_i65_33: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_and_b64 s[2:3], s[2:3], 1 -; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1 -; GFX10PLUS-NEXT: s_mov_b32 s1, 0 -; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 31 -; GFX10PLUS-NEXT: s_lshr_b32 s2, s3, 1 -; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX10PLUS-NEXT: s_mov_b32 s3, 0 +; GFX10PLUS-NEXT: s_and_b64 s[4:5], s[2:3], 1 +; GFX10PLUS-NEXT: s_lshr_b32 s2, s1, 1 +; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[4:5], 31 +; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX10PLUS-NEXT: s_lshr_b32 s2, s5, 1 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i65 %value, 33 ret i65 %result diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll index bac80f0777c02..ac6660b76ded9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll @@ -1440,6 +1440,7 @@ define i65 @v_sext_inreg_i65_22(i65 %value) { ; GFX6-LABEL: 
v_sext_inreg_i65_22: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v3, 0 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 22 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 10, v1 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1455,6 +1456,7 @@ define i65 @v_sext_inreg_i65_22(i65 %value) { ; GFX8-LABEL: v_sext_inreg_i65_22: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 22, v[2:3] ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 10, v1 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1470,6 +1472,7 @@ define i65 @v_sext_inreg_i65_22(i65 %value) { ; GFX9-LABEL: v_sext_inreg_i65_22: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 22, v[2:3] ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 10, v1 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1484,6 +1487,7 @@ define i65 @v_sext_inreg_i65_22(i65 %value) { ; GFX10PLUS-LABEL: v_sext_inreg_i65_22: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: v_mov_b32_e32 v3, 0 ; GFX10PLUS-NEXT: v_lshlrev_b64 v[2:3], 22, v[2:3] ; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v3, 10, v1 ; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 0, v[0:1] @@ -1555,29 +1559,29 @@ define i65 @v_sext_inreg_i65_33(i65 %value) { define amdgpu_ps i65 @s_sext_inreg_i65_18(i65 inreg %value) { ; GCN-LABEL: s_sext_inreg_i65_18: ; GCN: ; %bb.0: -; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], 18 -; GCN-NEXT: s_lshr_b32 s4, s1, 14 -; GCN-NEXT: s_mov_b32 s5, 0 -; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 +; GCN-NEXT: s_mov_b32 s3, 0 +; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 18 +; GCN-NEXT: s_lshr_b32 s2, s1, 14 +; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[2:3] +; GCN-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 ; GCN-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x2e0000 -; GCN-NEXT: s_lshl_b32 s7, s2, 14 -; GCN-NEXT: s_mov_b32 s6, s5 +; GCN-NEXT: 
s_lshl_b32 s7, s4, 14 +; GCN-NEXT: s_mov_b32 s6, s3 ; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 18 +; GCN-NEXT: s_ashr_i64 s[2:3], s[4:5], 18 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_sext_inreg_i65_18: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 18 -; GFX10PLUS-NEXT: s_lshr_b32 s4, s1, 14 -; GFX10PLUS-NEXT: s_mov_b32 s5, 0 +; GFX10PLUS-NEXT: s_mov_b32 s3, 0 +; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 18 +; GFX10PLUS-NEXT: s_lshr_b32 s2, s1, 14 ; GFX10PLUS-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x2e0000 -; GFX10PLUS-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX10PLUS-NEXT: s_mov_b32 s6, s5 -; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 -; GFX10PLUS-NEXT: s_lshl_b32 s7, s2, 14 -; GFX10PLUS-NEXT: s_ashr_i64 s[2:3], s[2:3], 18 +; GFX10PLUS-NEXT: s_or_b64 s[4:5], s[4:5], s[2:3] +; GFX10PLUS-NEXT: s_mov_b32 s6, s3 +; GFX10PLUS-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 +; GFX10PLUS-NEXT: s_lshl_b32 s7, s4, 14 +; GFX10PLUS-NEXT: s_ashr_i64 s[2:3], s[4:5], 18 ; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] ; GFX10PLUS-NEXT: ; return to shader part epilog %shl = shl i65 %value, 18 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll index b12e915c7d21b..3e0b29665aa88 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll @@ -1720,20 +1720,22 @@ define i65 @v_shl_i65_33(i65 %value) { define amdgpu_ps i65 @s_shl_i65(i65 inreg %value, i65 inreg %amount) { ; GCN-LABEL: s_shl_i65: ; GCN: ; %bb.0: -; GCN-NEXT: s_sub_i32 s10, s3, 64 -; GCN-NEXT: s_sub_i32 s6, 64, s3 -; GCN-NEXT: s_cmp_lt_u32 s3, 64 +; GCN-NEXT: s_mov_b32 s4, s3 +; GCN-NEXT: s_sub_i32 s10, s4, 64 +; GCN-NEXT: s_sub_i32 s5, 64, s4 +; GCN-NEXT: s_cmp_lt_u32 s4, 64 +; GCN-NEXT: s_mov_b32 s3, 0 ; GCN-NEXT: s_cselect_b32 s11, 1, 0 -; GCN-NEXT: s_cmp_eq_u32 s3, 0 +; GCN-NEXT: s_cmp_eq_u32 s4, 0 ; GCN-NEXT: s_cselect_b32 s12, 1, 0 -; 
GCN-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 -; GCN-NEXT: s_lshl_b64 s[8:9], s[2:3], s3 -; GCN-NEXT: s_lshl_b64 s[4:5], s[0:1], s3 -; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GCN-NEXT: s_lshl_b64 s[6:7], s[0:1], s4 +; GCN-NEXT: s_lshr_b64 s[8:9], s[0:1], s5 +; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], s4 +; GCN-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] ; GCN-NEXT: s_lshl_b64 s[8:9], s[0:1], s10 ; GCN-NEXT: s_cmp_lg_u32 s11, 0 -; GCN-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 -; GCN-NEXT: s_cselect_b32 s3, s6, s8 +; GCN-NEXT: s_cselect_b64 s[0:1], s[6:7], 0 +; GCN-NEXT: s_cselect_b32 s3, s4, s8 ; GCN-NEXT: s_cmp_lg_u32 s12, 0 ; GCN-NEXT: s_cselect_b32 s2, s2, s3 ; GCN-NEXT: ; return to shader part epilog @@ -1741,19 +1743,21 @@ define amdgpu_ps i65 @s_shl_i65(i65 inreg %value, i65 inreg %amount) { ; GFX10PLUS-LABEL: s_shl_i65: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_sub_i32 s10, s3, 64 -; GFX10PLUS-NEXT: s_sub_i32 s4, 64, s3 +; GFX10PLUS-NEXT: s_sub_i32 s5, 64, s3 ; GFX10PLUS-NEXT: s_cmp_lt_u32 s3, 64 +; GFX10PLUS-NEXT: s_mov_b32 s4, s3 ; GFX10PLUS-NEXT: s_cselect_b32 s11, 1, 0 ; GFX10PLUS-NEXT: s_cmp_eq_u32 s3, 0 +; GFX10PLUS-NEXT: s_mov_b32 s3, 0 ; GFX10PLUS-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 -; GFX10PLUS-NEXT: s_lshl_b64 s[6:7], s[2:3], s3 -; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[0:1], s3 -; GFX10PLUS-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX10PLUS-NEXT: s_lshl_b64 s[6:7], s[0:1], s10 +; GFX10PLUS-NEXT: s_lshr_b64 s[6:7], s[0:1], s5 +; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[2:3], s4 +; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[0:1], s4 +; GFX10PLUS-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[0:1], s10 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0 -; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[8:9], 0 -; GFX10PLUS-NEXT: s_cselect_b32 s3, s4, s6 +; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 +; GFX10PLUS-NEXT: s_cselect_b32 s3, s6, s8 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s12, 0 ; GFX10PLUS-NEXT: s_cselect_b32 s2, s2, s3 
; GFX10PLUS-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index 6f4f7c27a5147..9220e48cd4b07 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -1604,8 +1604,8 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; GISEL: ; %bb.0: ; %fp-to-i-entry ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v4, v0 -; GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v4 ; GISEL-NEXT: v_mov_b32_e32 v6, 0 +; GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v4 ; GISEL-NEXT: v_lshrrev_b64 v[0:1], 7, v[5:6] ; GISEL-NEXT: v_mov_b32_e32 v1, 0x7f ; GISEL-NEXT: s_mov_b64 s[4:5], 0 @@ -1955,8 +1955,8 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; GISEL: ; %bb.0: ; %fp-to-i-entry ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v4, v0 -; GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v4 ; GISEL-NEXT: v_mov_b32_e32 v6, 0 +; GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v4 ; GISEL-NEXT: v_lshrrev_b64 v[0:1], 7, v[5:6] ; GISEL-NEXT: v_mov_b32_e32 v1, 0x7f ; GISEL-NEXT: s_mov_b64 s[4:5], 0 From ad9a2838ca608fb7b9ff7e8a8451259d97d28f8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= Date: Fri, 25 Oct 2024 06:38:17 +0200 Subject: [PATCH 2/6] address review comments --- llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp | 8 +++++--- llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp index 29875b04c3798..3c6bcd9cc144f 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp @@ -33,10 +33,13 @@ bool CombinerHelper::matchMergeXAndUndef(const MachineInstr &MI, const GMerge *Merge = cast(&MI); Register Dst = Merge->getReg(0); - Register Undef = 
Merge->getSourceReg(1); LLT DstTy = MRI.getType(Dst); LLT SrcTy = MRI.getType(Merge->getSourceReg(0)); + // Otherwise, we would miscompile. + if (Merge->getNumSources() > 2) + return false; + // // %bits_8_15:_(s8) = G_IMPLICIT_DEF // %0:_(s16) = G_MERGE_VALUES %bits_0_7:(s8), %bits_8_15:(s8) @@ -46,8 +49,7 @@ bool CombinerHelper::matchMergeXAndUndef(const MachineInstr &MI, // %0:_(s16) = G_ZEXT %bits_0_7:(s8) // - if (!MRI.hasOneNonDBGUse(Undef) || - !isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {DstTy, SrcTy}})) + if (!isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {DstTy, SrcTy}})) return false; MatchInfo = [=](MachineIRBuilder &B) { diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir index 67cbdd19a0568..6a62e01029c1c 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir @@ -564,7 +564,7 @@ body: | ; CHECK-LABEL: name: test_merge_undef_multi_use ; CHECK: %opaque:_(s64) = COPY $x0 ; CHECK-NEXT: %def:_(s64) = G_IMPLICIT_DEF - ; CHECK-NEXT: %me:_(s128) = G_MERGE_VALUES %opaque(s64), %def(s64) + ; CHECK-NEXT: %me:_(s128) = G_ZEXT %opaque(s64) ; CHECK-NEXT: $q0 = COPY %me(s128) ; CHECK-NEXT: $x0 = COPY %def(s64) %opaque:_(s64) = COPY $x0 From 795621113a128c155d7941078133fdd63e5640ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= Date: Fri, 25 Oct 2024 08:10:47 +0200 Subject: [PATCH 3/6] address review comments --- .../GlobalISel/CombinerHelperArtifacts.cpp | 9 +- .../AArch64/GlobalISel/combine-unmerge.mir | 26 +++--- llvm/test/CodeGen/AArch64/bswap.ll | 26 ++++-- llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll | 24 +++--- llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll | 86 +++++++++---------- .../CodeGen/AMDGPU/GlobalISel/sext_inreg.ll | 36 ++++---- llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll | 40 ++++----- llvm/test/CodeGen/AMDGPU/fptoi.i128.ll | 4 +- 8 files changed, 121 insertions(+), 130 
deletions(-) diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp index 3c6bcd9cc144f..8f4095f01be7a 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp @@ -37,8 +37,7 @@ bool CombinerHelper::matchMergeXAndUndef(const MachineInstr &MI, LLT SrcTy = MRI.getType(Merge->getSourceReg(0)); // Otherwise, we would miscompile. - if (Merge->getNumSources() > 2) - return false; + assert(Merge->getNumSources() == 2 && "Unexpected number of operands"); // // %bits_8_15:_(s8) = G_IMPLICIT_DEF @@ -46,14 +45,14 @@ bool CombinerHelper::matchMergeXAndUndef(const MachineInstr &MI, // // -> // - // %0:_(s16) = G_ZEXT %bits_0_7:(s8) + // %0:_(s16) = G_ANYEXT %bits_0_7:(s8) // - if (!isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {DstTy, SrcTy}})) + if (!isLegalOrBeforeLegalizer({TargetOpcode::G_ANYEXT, {DstTy, SrcTy}})) return false; MatchInfo = [=](MachineIRBuilder &B) { - B.buildZExt(Dst, Merge->getSourceReg(0)); + B.buildAnyExt(Dst, Merge->getSourceReg(0)); }; return true; } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir index 6a62e01029c1c..4e9adf847260b 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir @@ -10,9 +10,8 @@ body: | bb.1: ; CHECK-LABEL: name: test_combine_unmerge_merge ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: $w0 = COPY [[DEF]](s32) - ; CHECK-NEXT: $w1 = COPY [[C]](s32) + ; CHECK-NEXT: $w1 = COPY [[DEF]](s32) %0:_(s32) = G_IMPLICIT_DEF %1:_(s32) = G_IMPLICIT_DEF %2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32) @@ -115,11 +114,8 @@ body: | bb.1: ; CHECK-LABEL: name: test_combine_unmerge_bitcast_merge ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: 
[[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[DEF]](s32) - ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[ZEXT]](s64) - ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>) - ; CHECK-NEXT: $w0 = COPY [[UV]](s32) - ; CHECK-NEXT: $w1 = COPY [[UV1]](s32) + ; CHECK-NEXT: $w0 = COPY [[DEF]](s32) + ; CHECK-NEXT: $w1 = COPY [[DEF]](s32) %0:_(s32) = G_IMPLICIT_DEF %1:_(s32) = G_IMPLICIT_DEF %2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32) @@ -137,13 +133,11 @@ name: test_combine_unmerge_merge_incompatible_types body: | bb.1: ; CHECK-LABEL: name: test_combine_unmerge_merge_incompatible_types - ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[DEF]](s32) - ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[ZEXT]](s64) - ; CHECK-NEXT: $h0 = COPY [[UV]](s16) - ; CHECK-NEXT: $h1 = COPY [[UV1]](s16) - ; CHECK-NEXT: $h2 = COPY [[UV2]](s16) - ; CHECK-NEXT: $h3 = COPY [[UV3]](s16) + ; CHECK: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; CHECK-NEXT: $h0 = COPY [[DEF]](s16) + ; CHECK-NEXT: $h1 = COPY [[DEF]](s16) + ; CHECK-NEXT: $h2 = COPY [[DEF]](s16) + ; CHECK-NEXT: $h3 = COPY [[DEF]](s16) %0:_(s32) = G_IMPLICIT_DEF %1:_(s32) = G_IMPLICIT_DEF %2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32) @@ -548,7 +542,7 @@ body: | bb.1: ; CHECK-LABEL: name: test_merge_undef ; CHECK: %opaque:_(s64) = COPY $x0 - ; CHECK-NEXT: %me:_(s128) = G_ZEXT %opaque(s64) + ; CHECK-NEXT: %me:_(s128) = G_ANYEXT %opaque(s64) ; CHECK-NEXT: $q0 = COPY %me(s128) %opaque:_(s64) = COPY $x0 %def:_(s64) = G_IMPLICIT_DEF @@ -564,7 +558,7 @@ body: | ; CHECK-LABEL: name: test_merge_undef_multi_use ; CHECK: %opaque:_(s64) = COPY $x0 ; CHECK-NEXT: %def:_(s64) = G_IMPLICIT_DEF - ; CHECK-NEXT: %me:_(s128) = G_ZEXT %opaque(s64) + ; CHECK-NEXT: %me:_(s128) = G_ANYEXT %opaque(s64) ; CHECK-NEXT: $q0 = COPY %me(s128) ; CHECK-NEXT: $x0 = COPY %def(s64) 
%opaque:_(s64) = COPY $x0 diff --git a/llvm/test/CodeGen/AArch64/bswap.ll b/llvm/test/CodeGen/AArch64/bswap.ll index afc1d932840ff..e86f55d63f754 100644 --- a/llvm/test/CodeGen/AArch64/bswap.ll +++ b/llvm/test/CodeGen/AArch64/bswap.ll @@ -45,14 +45,24 @@ define i64 @bswap_i16_to_i64_anyext(i16 %a) { ; The zext here is optimised to an any_extend during isel.. define i128 @bswap_i16_to_i128_anyext(i16 %a) { -; CHECK-LABEL: bswap_i16_to_i128_anyext: -; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: mov x0, xzr -; CHECK-NEXT: rev w8, w8 -; CHECK-NEXT: lsr w8, w8, #16 -; CHECK-NEXT: lsl x1, x8, #48 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: bswap_i16_to_i128_anyext: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: mov w8, w0 +; CHECK-SD-NEXT: mov x0, xzr +; CHECK-SD-NEXT: rev w8, w8 +; CHECK-SD-NEXT: lsr w8, w8, #16 +; CHECK-SD-NEXT: lsl x1, x8, #48 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: bswap_i16_to_i128_anyext: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, w0 +; CHECK-GI-NEXT: mov x0, xzr +; CHECK-GI-NEXT: rev w8, w8 +; CHECK-GI-NEXT: lsr w8, w8, #16 +; CHECK-GI-NEXT: and x8, x8, #0xffff +; CHECK-GI-NEXT: lsl x1, x8, #48 +; CHECK-GI-NEXT: ret %3 = call i16 @llvm.bswap.i16(i16 %a) %4 = zext i16 %3 to i128 %5 = shl i128 %4, 112 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll index bda97c980acee..493e8cef63890 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -1884,22 +1884,22 @@ define amdgpu_ps i65 @s_ashr_i65(i65 inreg %value, i65 inreg %amount) { define amdgpu_ps i65 @s_ashr_i65_33(i65 inreg %value) { ; GCN-LABEL: s_ashr_i65_33: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b32 s3, 0 -; GCN-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000 -; GCN-NEXT: s_lshr_b32 s2, s1, 1 -; GCN-NEXT: s_lshl_b64 s[0:1], s[4:5], 31 -; GCN-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GCN-NEXT: s_ashr_i32 s2, s5, 1 +; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 +; GCN-NEXT: s_lshr_b32 s0, 
s1, 1 +; GCN-NEXT: s_mov_b32 s1, 0 +; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 31 +; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GCN-NEXT: s_ashr_i32 s2, s3, 1 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_ashr_i65_33: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_mov_b32 s3, 0 -; GFX10PLUS-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000 -; GFX10PLUS-NEXT: s_lshr_b32 s2, s1, 1 -; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[4:5], 31 -; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX10PLUS-NEXT: s_ashr_i32 s2, s5, 1 +; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 +; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1 +; GFX10PLUS-NEXT: s_mov_b32 s1, 0 +; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 31 +; GFX10PLUS-NEXT: s_ashr_i32 s2, s3, 1 +; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr i65 %value, 33 ret i65 %result diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll index 9f678a5e46fbb..cc185aff9eff2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -1574,8 +1574,8 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) { ; GFX6-LABEL: v_lshr_i65: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, 0 ; GFX6-NEXT: v_and_b32_e32 v4, 1, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, 0 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xffffffc0, v3 ; GFX6-NEXT: v_lshr_b64 v[6:7], v[0:1], v3 @@ -1596,8 +1596,8 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) { ; GFX8-LABEL: v_lshr_i65: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: v_and_b32_e32 v4, 1, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xffffffc0, v3 ; GFX8-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] @@ -1618,8 +1618,8 @@ define i65 
@v_lshr_i65(i65 %value, i65 %amount) { ; GFX9-LABEL: v_lshr_i65: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_and_b32_e32 v4, 1, v2 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_sub_u32_e32 v8, 64, v3 ; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffc0, v3 ; GFX9-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] @@ -1688,8 +1688,8 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 31 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 @@ -1700,8 +1700,8 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v3 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 @@ -1712,8 +1712,8 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v3 ; GFX9-NEXT: v_or_b32_e32 v0, v2, v0 @@ -1749,22 +1749,20 @@ define i65 @v_lshr_i65_33(i65 %value) { define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) { ; GCN-LABEL: s_lshr_i65: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b32 s4, s3 -; GCN-NEXT: s_mov_b32 s3, 0 -; GCN-NEXT: s_and_b64 s[2:3], s[2:3], 1 -; GCN-NEXT: s_sub_i32 s10, s4, 64 -; GCN-NEXT: s_sub_i32 s8, 64, s4 -; GCN-NEXT: s_cmp_lt_u32 s4, 64 +; GCN-NEXT: s_and_b64 s[4:5], s[2:3], 1 +; GCN-NEXT: s_sub_i32 s10, 
s3, 64 +; GCN-NEXT: s_sub_i32 s8, 64, s3 +; GCN-NEXT: s_cmp_lt_u32 s3, 64 ; GCN-NEXT: s_cselect_b32 s11, 1, 0 -; GCN-NEXT: s_cmp_eq_u32 s4, 0 +; GCN-NEXT: s_cmp_eq_u32 s3, 0 ; GCN-NEXT: s_cselect_b32 s12, 1, 0 -; GCN-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 -; GCN-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 -; GCN-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 -; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 +; GCN-NEXT: s_lshr_b64 s[6:7], s[4:5], s3 +; GCN-NEXT: s_lshr_b64 s[2:3], s[0:1], s3 +; GCN-NEXT: s_lshl_b64 s[8:9], s[4:5], s8 +; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] +; GCN-NEXT: s_lshr_b64 s[4:5], s[4:5], s10 ; GCN-NEXT: s_cmp_lg_u32 s11, 0 -; GCN-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] +; GCN-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] ; GCN-NEXT: s_cmp_lg_u32 s12, 0 ; GCN-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GCN-NEXT: s_cmp_lg_u32 s11, 0 @@ -1773,26 +1771,24 @@ define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) { ; ; GFX10PLUS-LABEL: s_lshr_i65: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_mov_b32 s4, s3 -; GFX10PLUS-NEXT: s_mov_b32 s3, 0 -; GFX10PLUS-NEXT: s_sub_i32 s10, s4, 64 -; GFX10PLUS-NEXT: s_and_b64 s[2:3], s[2:3], 1 -; GFX10PLUS-NEXT: s_sub_i32 s5, 64, s4 -; GFX10PLUS-NEXT: s_cmp_lt_u32 s4, 64 +; GFX10PLUS-NEXT: s_and_b64 s[4:5], s[2:3], 1 +; GFX10PLUS-NEXT: s_sub_i32 s10, s3, 64 +; GFX10PLUS-NEXT: s_sub_i32 s2, 64, s3 +; GFX10PLUS-NEXT: s_cmp_lt_u32 s3, 64 ; GFX10PLUS-NEXT: s_cselect_b32 s11, 1, 0 -; GFX10PLUS-NEXT: s_cmp_eq_u32 s4, 0 +; GFX10PLUS-NEXT: s_cmp_eq_u32 s3, 0 ; GFX10PLUS-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10PLUS-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 -; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[2:3], s5 -; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[2:3], s4 +; GFX10PLUS-NEXT: s_lshr_b64 s[6:7], s[0:1], s3 +; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[4:5], s2 +; GFX10PLUS-NEXT: s_lshr_b64 s[2:3], s[4:5], s3 ; GFX10PLUS-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GFX10PLUS-NEXT: s_lshr_b64 s[2:3], s[2:3], 
s10 +; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[4:5], s10 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0 -; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] +; GFX10PLUS-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] ; GFX10PLUS-NEXT: s_cmp_lg_u32 s12, 0 -; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] +; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] ; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0 -; GFX10PLUS-NEXT: s_cselect_b32 s2, s4, 0 +; GFX10PLUS-NEXT: s_cselect_b32 s2, s2, 0 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i65 %value, %amount ret i65 %result @@ -1801,22 +1797,22 @@ define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) { define amdgpu_ps i65 @s_lshr_i65_33(i65 inreg %value) { ; GCN-LABEL: s_lshr_i65_33: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b32 s3, 0 -; GCN-NEXT: s_and_b64 s[4:5], s[2:3], 1 -; GCN-NEXT: s_lshr_b32 s2, s1, 1 -; GCN-NEXT: s_lshl_b64 s[0:1], s[4:5], 31 -; GCN-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GCN-NEXT: s_lshr_b32 s2, s5, 1 +; GCN-NEXT: s_and_b64 s[2:3], s[2:3], 1 +; GCN-NEXT: s_lshr_b32 s0, s1, 1 +; GCN-NEXT: s_mov_b32 s1, 0 +; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 31 +; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GCN-NEXT: s_lshr_b32 s2, s3, 1 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_lshr_i65_33: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_mov_b32 s3, 0 -; GFX10PLUS-NEXT: s_and_b64 s[4:5], s[2:3], 1 -; GFX10PLUS-NEXT: s_lshr_b32 s2, s1, 1 -; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[4:5], 31 -; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX10PLUS-NEXT: s_lshr_b32 s2, s5, 1 +; GFX10PLUS-NEXT: s_and_b64 s[2:3], s[2:3], 1 +; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1 +; GFX10PLUS-NEXT: s_mov_b32 s1, 0 +; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 31 +; GFX10PLUS-NEXT: s_lshr_b32 s2, s3, 1 +; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i65 %value, 33 ret i65 %result diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll index ac6660b76ded9..bac80f0777c02 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll @@ -1440,7 +1440,6 @@ define i65 @v_sext_inreg_i65_22(i65 %value) { ; GFX6-LABEL: v_sext_inreg_i65_22: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v3, 0 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 22 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 10, v1 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1456,7 +1455,6 @@ define i65 @v_sext_inreg_i65_22(i65 %value) { ; GFX8-LABEL: v_sext_inreg_i65_22: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 22, v[2:3] ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 10, v1 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1472,7 +1470,6 @@ define i65 @v_sext_inreg_i65_22(i65 %value) { ; GFX9-LABEL: v_sext_inreg_i65_22: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 22, v[2:3] ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 10, v1 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1487,7 +1484,6 @@ define i65 @v_sext_inreg_i65_22(i65 %value) { ; GFX10PLUS-LABEL: v_sext_inreg_i65_22: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_mov_b32_e32 v3, 0 ; GFX10PLUS-NEXT: v_lshlrev_b64 v[2:3], 22, v[2:3] ; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v3, 10, v1 ; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 0, v[0:1] @@ -1559,29 +1555,29 @@ define i65 @v_sext_inreg_i65_33(i65 %value) { define amdgpu_ps i65 @s_sext_inreg_i65_18(i65 inreg %value) { ; GCN-LABEL: s_sext_inreg_i65_18: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b32 s3, 0 -; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 18 -; GCN-NEXT: s_lshr_b32 s2, s1, 14 -; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[2:3] -; GCN-NEXT: s_bfe_i64 s[4:5], s[4:5], 
0x10000 +; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], 18 +; GCN-NEXT: s_lshr_b32 s4, s1, 14 +; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] +; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 ; GCN-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x2e0000 -; GCN-NEXT: s_lshl_b32 s7, s4, 14 -; GCN-NEXT: s_mov_b32 s6, s3 +; GCN-NEXT: s_lshl_b32 s7, s2, 14 +; GCN-NEXT: s_mov_b32 s6, s5 ; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GCN-NEXT: s_ashr_i64 s[2:3], s[4:5], 18 +; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 18 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_sext_inreg_i65_18: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_mov_b32 s3, 0 -; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 18 -; GFX10PLUS-NEXT: s_lshr_b32 s2, s1, 14 +; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 18 +; GFX10PLUS-NEXT: s_lshr_b32 s4, s1, 14 +; GFX10PLUS-NEXT: s_mov_b32 s5, 0 ; GFX10PLUS-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x2e0000 -; GFX10PLUS-NEXT: s_or_b64 s[4:5], s[4:5], s[2:3] -; GFX10PLUS-NEXT: s_mov_b32 s6, s3 -; GFX10PLUS-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 -; GFX10PLUS-NEXT: s_lshl_b32 s7, s4, 14 -; GFX10PLUS-NEXT: s_ashr_i64 s[2:3], s[4:5], 18 +; GFX10PLUS-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] +; GFX10PLUS-NEXT: s_mov_b32 s6, s5 +; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 +; GFX10PLUS-NEXT: s_lshl_b32 s7, s2, 14 +; GFX10PLUS-NEXT: s_ashr_i64 s[2:3], s[2:3], 18 ; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] ; GFX10PLUS-NEXT: ; return to shader part epilog %shl = shl i65 %value, 18 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll index 3e0b29665aa88..b12e915c7d21b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll @@ -1720,22 +1720,20 @@ define i65 @v_shl_i65_33(i65 %value) { define amdgpu_ps i65 @s_shl_i65(i65 inreg %value, i65 inreg %amount) { ; GCN-LABEL: s_shl_i65: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b32 s4, s3 -; GCN-NEXT: s_sub_i32 s10, s4, 64 -; 
GCN-NEXT: s_sub_i32 s5, 64, s4 -; GCN-NEXT: s_cmp_lt_u32 s4, 64 -; GCN-NEXT: s_mov_b32 s3, 0 +; GCN-NEXT: s_sub_i32 s10, s3, 64 +; GCN-NEXT: s_sub_i32 s6, 64, s3 +; GCN-NEXT: s_cmp_lt_u32 s3, 64 ; GCN-NEXT: s_cselect_b32 s11, 1, 0 -; GCN-NEXT: s_cmp_eq_u32 s4, 0 +; GCN-NEXT: s_cmp_eq_u32 s3, 0 ; GCN-NEXT: s_cselect_b32 s12, 1, 0 -; GCN-NEXT: s_lshl_b64 s[6:7], s[0:1], s4 -; GCN-NEXT: s_lshr_b64 s[8:9], s[0:1], s5 -; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], s4 -; GCN-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; GCN-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 +; GCN-NEXT: s_lshl_b64 s[8:9], s[2:3], s3 +; GCN-NEXT: s_lshl_b64 s[4:5], s[0:1], s3 +; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GCN-NEXT: s_lshl_b64 s[8:9], s[0:1], s10 ; GCN-NEXT: s_cmp_lg_u32 s11, 0 -; GCN-NEXT: s_cselect_b64 s[0:1], s[6:7], 0 -; GCN-NEXT: s_cselect_b32 s3, s4, s8 +; GCN-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 +; GCN-NEXT: s_cselect_b32 s3, s6, s8 ; GCN-NEXT: s_cmp_lg_u32 s12, 0 ; GCN-NEXT: s_cselect_b32 s2, s2, s3 ; GCN-NEXT: ; return to shader part epilog @@ -1743,21 +1741,19 @@ define amdgpu_ps i65 @s_shl_i65(i65 inreg %value, i65 inreg %amount) { ; GFX10PLUS-LABEL: s_shl_i65: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_sub_i32 s10, s3, 64 -; GFX10PLUS-NEXT: s_sub_i32 s5, 64, s3 +; GFX10PLUS-NEXT: s_sub_i32 s4, 64, s3 ; GFX10PLUS-NEXT: s_cmp_lt_u32 s3, 64 -; GFX10PLUS-NEXT: s_mov_b32 s4, s3 ; GFX10PLUS-NEXT: s_cselect_b32 s11, 1, 0 ; GFX10PLUS-NEXT: s_cmp_eq_u32 s3, 0 -; GFX10PLUS-NEXT: s_mov_b32 s3, 0 ; GFX10PLUS-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10PLUS-NEXT: s_lshr_b64 s[6:7], s[0:1], s5 -; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[2:3], s4 -; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[0:1], s4 -; GFX10PLUS-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[0:1], s10 +; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 +; GFX10PLUS-NEXT: s_lshl_b64 s[6:7], s[2:3], s3 +; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[0:1], s3 +; GFX10PLUS-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX10PLUS-NEXT: 
s_lshl_b64 s[6:7], s[0:1], s10 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0 -; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 -; GFX10PLUS-NEXT: s_cselect_b32 s3, s6, s8 +; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[8:9], 0 +; GFX10PLUS-NEXT: s_cselect_b32 s3, s4, s6 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s12, 0 ; GFX10PLUS-NEXT: s_cselect_b32 s2, s2, s3 ; GFX10PLUS-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index 9220e48cd4b07..6f4f7c27a5147 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -1604,8 +1604,8 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; GISEL: ; %bb.0: ; %fp-to-i-entry ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v4, v0 -; GISEL-NEXT: v_mov_b32_e32 v6, 0 ; GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v4 +; GISEL-NEXT: v_mov_b32_e32 v6, 0 ; GISEL-NEXT: v_lshrrev_b64 v[0:1], 7, v[5:6] ; GISEL-NEXT: v_mov_b32_e32 v1, 0x7f ; GISEL-NEXT: s_mov_b64 s[4:5], 0 @@ -1955,8 +1955,8 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; GISEL: ; %bb.0: ; %fp-to-i-entry ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v4, v0 -; GISEL-NEXT: v_mov_b32_e32 v6, 0 ; GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v4 +; GISEL-NEXT: v_mov_b32_e32 v6, 0 ; GISEL-NEXT: v_lshrrev_b64 v[0:1], 7, v[5:6] ; GISEL-NEXT: v_mov_b32_e32 v1, 0x7f ; GISEL-NEXT: s_mov_b64 s[4:5], 0 From dfcefca7b6e0766335e4349bc2f5390699ac4533 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= Date: Fri, 25 Oct 2024 09:02:54 +0200 Subject: [PATCH 4/6] back to zext --- .../include/llvm/Target/GlobalISel/Combine.td | 2 +- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 7 +- .../GlobalISel/CombinerHelperArtifacts.cpp | 6 +- .../AArch64/GlobalISel/combine-unmerge.mir | 28 +++--- llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll | 13 +-- llvm/test/CodeGen/AArch64/bswap.ll | 26 ++---- .../CodeGen/AArch64/extract-vector-elt.ll 
| 15 +--- llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll | 24 +++--- .../combine-amdgpu-cvt-f32-ubyte.mir | 6 +- llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll | 86 ++++++++++--------- .../CodeGen/AMDGPU/GlobalISel/sext_inreg.ll | 36 ++++---- llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll | 40 +++++---- llvm/test/CodeGen/AMDGPU/fptoi.i128.ll | 4 +- .../CodeGen/AMDGPU/shrink-add-sub-constant.ll | 7 +- 14 files changed, 147 insertions(+), 153 deletions(-) diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 2651a6927424b..643898ce36674 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -420,7 +420,7 @@ def binop_right_undef_to_undef: GICombineRule< def unary_undef_to_zero: GICombineRule< (defs root:$root), - (match (wip_match_opcode G_ABS):$root, + (match (wip_match_opcode G_ABS, G_ZEXT):$root, [{ return Helper.matchOperandIsUndef(*${root}, 1); }]), (apply [{ Helper.replaceInstWithConstant(*${root}, 0); }])>; diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index ede8d82fc1a35..077175f2e0452 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -2944,8 +2944,11 @@ void CombinerHelper::replaceInstWithFConstant(MachineInstr &MI, double C) { void CombinerHelper::replaceInstWithConstant(MachineInstr &MI, int64_t C) { assert(MI.getNumDefs() == 1 && "Expected only one def?"); - Builder.buildConstant(MI.getOperand(0), C); - MI.eraseFromParent(); + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + if (isLegalOrBeforeLegalizer({TargetOpcode::G_CONSTANT, {DstTy}})) { + Builder.buildConstant(MI.getOperand(0), C); + MI.eraseFromParent(); + } } void CombinerHelper::replaceInstWithConstant(MachineInstr &MI, APInt C) { diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp 
index 8f4095f01be7a..047e411eb76cf 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp @@ -45,14 +45,14 @@ bool CombinerHelper::matchMergeXAndUndef(const MachineInstr &MI, // // -> // - // %0:_(s16) = G_ANYEXT %bits_0_7:(s8) + // %0:_(s16) = G_ZEXT %bits_0_7:(s8) // - if (!isLegalOrBeforeLegalizer({TargetOpcode::G_ANYEXT, {DstTy, SrcTy}})) + if (!isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {DstTy, SrcTy}})) return false; MatchInfo = [=](MachineIRBuilder &B) { - B.buildAnyExt(Dst, Merge->getSourceReg(0)); + B.buildZExt(Dst, Merge->getSourceReg(0)); }; return true; } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir index 4e9adf847260b..e4e7f315397ff 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir @@ -9,9 +9,9 @@ name: test_combine_unmerge_merge body: | bb.1: ; CHECK-LABEL: name: test_combine_unmerge_merge - ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: $w0 = COPY [[DEF]](s32) - ; CHECK-NEXT: $w1 = COPY [[DEF]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: $w0 = COPY [[C]](s32) + ; CHECK-NEXT: $w1 = COPY [[C]](s32) %0:_(s32) = G_IMPLICIT_DEF %1:_(s32) = G_IMPLICIT_DEF %2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32) @@ -113,9 +113,11 @@ name: test_combine_unmerge_bitcast_merge body: | bb.1: ; CHECK-LABEL: name: test_combine_unmerge_bitcast_merge - ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: $w0 = COPY [[DEF]](s32) - ; CHECK-NEXT: $w1 = COPY [[DEF]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[C]](s64) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>) + ; CHECK-NEXT: $w0 = COPY [[UV]](s32) + ; CHECK-NEXT: $w1 = COPY [[UV1]](s32) %0:_(s32) = 
G_IMPLICIT_DEF %1:_(s32) = G_IMPLICIT_DEF %2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32) @@ -133,11 +135,11 @@ name: test_combine_unmerge_merge_incompatible_types body: | bb.1: ; CHECK-LABEL: name: test_combine_unmerge_merge_incompatible_types - ; CHECK: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF - ; CHECK-NEXT: $h0 = COPY [[DEF]](s16) - ; CHECK-NEXT: $h1 = COPY [[DEF]](s16) - ; CHECK-NEXT: $h2 = COPY [[DEF]](s16) - ; CHECK-NEXT: $h3 = COPY [[DEF]](s16) + ; CHECK: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 0 + ; CHECK-NEXT: $h0 = COPY [[C]](s16) + ; CHECK-NEXT: $h1 = COPY [[C]](s16) + ; CHECK-NEXT: $h2 = COPY [[C]](s16) + ; CHECK-NEXT: $h3 = COPY [[C]](s16) %0:_(s32) = G_IMPLICIT_DEF %1:_(s32) = G_IMPLICIT_DEF %2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32) @@ -542,7 +544,7 @@ body: | bb.1: ; CHECK-LABEL: name: test_merge_undef ; CHECK: %opaque:_(s64) = COPY $x0 - ; CHECK-NEXT: %me:_(s128) = G_ANYEXT %opaque(s64) + ; CHECK-NEXT: %me:_(s128) = G_ZEXT %opaque(s64) ; CHECK-NEXT: $q0 = COPY %me(s128) %opaque:_(s64) = COPY $x0 %def:_(s64) = G_IMPLICIT_DEF @@ -558,7 +560,7 @@ body: | ; CHECK-LABEL: name: test_merge_undef_multi_use ; CHECK: %opaque:_(s64) = COPY $x0 ; CHECK-NEXT: %def:_(s64) = G_IMPLICIT_DEF - ; CHECK-NEXT: %me:_(s128) = G_ANYEXT %opaque(s64) + ; CHECK-NEXT: %me:_(s128) = G_ZEXT %opaque(s64) ; CHECK-NEXT: $q0 = COPY %me(s128) ; CHECK-NEXT: $x0 = COPY %def(s64) %opaque:_(s64) = COPY $x0 diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll index a39c2b5d14ddd..98c1a1bef569a 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll @@ -322,17 +322,18 @@ define void @typei1_orig(i64 %a, ptr %p, ptr %q) { ; ; CHECK-GI-LABEL: typei1_orig: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ldr q1, [x2] +; CHECK-GI-NEXT: ldr q0, [x2] ; CHECK-GI-NEXT: cmp x0, #0 -; CHECK-GI-NEXT: movi v0.2d, #0xffffffffffffffff ; CHECK-GI-NEXT: cset w8, gt -; CHECK-GI-NEXT: neg v1.8h, v1.8h -; 
CHECK-GI-NEXT: dup v2.8h, w8 +; CHECK-GI-NEXT: neg v0.8h, v0.8h +; CHECK-GI-NEXT: dup v1.8h, w8 +; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: mul v1.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: cmeq v0.8h, v0.8h, #0 ; CHECK-GI-NEXT: mvn v0.16b, v0.16b -; CHECK-GI-NEXT: mul v1.8h, v1.8h, v2.8h ; CHECK-GI-NEXT: cmeq v1.8h, v1.8h, #0 ; CHECK-GI-NEXT: mvn v1.16b, v1.16b -; CHECK-GI-NEXT: uzp1 v0.16b, v1.16b, v0.16b +; CHECK-GI-NEXT: uzp1 v0.16b, v0.16b, v1.16b ; CHECK-GI-NEXT: shl v0.16b, v0.16b, #7 ; CHECK-GI-NEXT: sshr v0.16b, v0.16b, #7 ; CHECK-GI-NEXT: str q0, [x1] diff --git a/llvm/test/CodeGen/AArch64/bswap.ll b/llvm/test/CodeGen/AArch64/bswap.ll index e86f55d63f754..afc1d932840ff 100644 --- a/llvm/test/CodeGen/AArch64/bswap.ll +++ b/llvm/test/CodeGen/AArch64/bswap.ll @@ -45,24 +45,14 @@ define i64 @bswap_i16_to_i64_anyext(i16 %a) { ; The zext here is optimised to an any_extend during isel.. define i128 @bswap_i16_to_i128_anyext(i16 %a) { -; CHECK-SD-LABEL: bswap_i16_to_i128_anyext: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, w0 -; CHECK-SD-NEXT: mov x0, xzr -; CHECK-SD-NEXT: rev w8, w8 -; CHECK-SD-NEXT: lsr w8, w8, #16 -; CHECK-SD-NEXT: lsl x1, x8, #48 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: bswap_i16_to_i128_anyext: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, w0 -; CHECK-GI-NEXT: mov x0, xzr -; CHECK-GI-NEXT: rev w8, w8 -; CHECK-GI-NEXT: lsr w8, w8, #16 -; CHECK-GI-NEXT: and x8, x8, #0xffff -; CHECK-GI-NEXT: lsl x1, x8, #48 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: bswap_i16_to_i128_anyext: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: mov x0, xzr +; CHECK-NEXT: rev w8, w8 +; CHECK-NEXT: lsr w8, w8, #16 +; CHECK-NEXT: lsl x1, x8, #48 +; CHECK-NEXT: ret %3 = call i16 @llvm.bswap.i16(i16 %a) %4 = zext i16 %3 to i128 %5 = shl i128 %4, 112 diff --git a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll index 5e5fdd6d31705..e89e1516fb1f5 100644 --- 
a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll @@ -8,17 +8,10 @@ ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for extract_v4i32_vector_extract_const define i64 @extract_v2i64_undef_index(<2 x i64> %a, i32 %c) { -; CHECK-SD-LABEL: extract_v2i64_undef_index: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: fmov x0, d0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: extract_v2i64_undef_index: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: str q0, [sp, #-16]! -; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 -; CHECK-GI-NEXT: ldr x0, [sp], #16 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: extract_v2i64_undef_index: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret entry: %d = extractelement <2 x i64> %a, i32 undef ret i64 %d diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll index 493e8cef63890..bda97c980acee 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -1884,22 +1884,22 @@ define amdgpu_ps i65 @s_ashr_i65(i65 inreg %value, i65 inreg %amount) { define amdgpu_ps i65 @s_ashr_i65_33(i65 inreg %value) { ; GCN-LABEL: s_ashr_i65_33: ; GCN: ; %bb.0: -; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 -; GCN-NEXT: s_lshr_b32 s0, s1, 1 -; GCN-NEXT: s_mov_b32 s1, 0 -; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 31 -; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GCN-NEXT: s_ashr_i32 s2, s3, 1 +; GCN-NEXT: s_mov_b32 s3, 0 +; GCN-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000 +; GCN-NEXT: s_lshr_b32 s2, s1, 1 +; GCN-NEXT: s_lshl_b64 s[0:1], s[4:5], 31 +; GCN-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GCN-NEXT: s_ashr_i32 s2, s5, 1 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_ashr_i65_33: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 -; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1 -; GFX10PLUS-NEXT: s_mov_b32 s1, 0 -; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], 
s[2:3], 31 -; GFX10PLUS-NEXT: s_ashr_i32 s2, s3, 1 -; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX10PLUS-NEXT: s_mov_b32 s3, 0 +; GFX10PLUS-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000 +; GFX10PLUS-NEXT: s_lshr_b32 s2, s1, 1 +; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[4:5], 31 +; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX10PLUS-NEXT: s_ashr_i32 s2, s5, 1 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr i65 %value, 33 ret i65 %result diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-amdgpu-cvt-f32-ubyte.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-amdgpu-cvt-f32-ubyte.mir index 7893bfa1d38f0..9b39afd32ac37 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-amdgpu-cvt-f32-ubyte.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-amdgpu-cvt-f32-ubyte.mir @@ -261,8 +261,7 @@ body: | ; CHECK-LABEL: name: cvt_f32_ubyte0_zext_lshr_16 ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: %shift:_(s16) = G_IMPLICIT_DEF - ; CHECK-NEXT: %zext:_(s32) = G_ZEXT %shift(s16) + ; CHECK-NEXT: %zext:_(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: %result:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 %zext ; CHECK-NEXT: $vgpr0 = COPY %result(s32) %arg:_(s32) = COPY $vgpr0 @@ -284,8 +283,7 @@ body: | ; CHECK-LABEL: name: cvt_f32_ubyte0_zext_lshr_24 ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: %shift:_(s16) = G_IMPLICIT_DEF - ; CHECK-NEXT: %zext:_(s32) = G_ZEXT %shift(s16) + ; CHECK-NEXT: %zext:_(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: %result:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 %zext ; CHECK-NEXT: $vgpr0 = COPY %result(s32) %arg:_(s32) = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll index cc185aff9eff2..9f678a5e46fbb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -1574,8 +1574,8 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) { ; GFX6-LABEL: v_lshr_i65: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 1, v2 ; GFX6-NEXT: v_mov_b32_e32 v5, 0 +; GFX6-NEXT: v_and_b32_e32 v4, 1, v2 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xffffffc0, v3 ; GFX6-NEXT: v_lshr_b64 v[6:7], v[0:1], v3 @@ -1596,8 +1596,8 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) { ; GFX8-LABEL: v_lshr_i65: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v4, 1, v2 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: v_and_b32_e32 v4, 1, v2 ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xffffffc0, v3 ; GFX8-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] @@ -1618,8 +1618,8 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) { ; GFX9-LABEL: v_lshr_i65: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v4, 1, v2 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_and_b32_e32 v4, 1, v2 ; GFX9-NEXT: v_sub_u32_e32 v8, 64, v3 ; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffc0, v3 ; GFX9-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] @@ -1688,8 +1688,8 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 +; GFX6-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 31 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 @@ -1700,8 +1700,8 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v3 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 @@ -1712,8 +1712,8 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v3 ; GFX9-NEXT: v_or_b32_e32 v0, v2, v0 @@ -1749,20 +1749,22 @@ define i65 @v_lshr_i65_33(i65 %value) { define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) { ; GCN-LABEL: s_lshr_i65: ; GCN: ; %bb.0: -; GCN-NEXT: s_and_b64 s[4:5], s[2:3], 1 -; GCN-NEXT: s_sub_i32 s10, s3, 64 -; GCN-NEXT: s_sub_i32 s8, 64, s3 -; GCN-NEXT: s_cmp_lt_u32 s3, 64 +; GCN-NEXT: s_mov_b32 s4, s3 +; GCN-NEXT: s_mov_b32 s3, 0 +; GCN-NEXT: s_and_b64 s[2:3], s[2:3], 1 +; GCN-NEXT: s_sub_i32 s10, s4, 64 +; GCN-NEXT: s_sub_i32 s8, 64, s4 +; GCN-NEXT: s_cmp_lt_u32 s4, 64 ; GCN-NEXT: s_cselect_b32 s11, 1, 0 -; GCN-NEXT: s_cmp_eq_u32 s3, 0 +; GCN-NEXT: s_cmp_eq_u32 s4, 0 ; GCN-NEXT: s_cselect_b32 s12, 1, 0 -; GCN-NEXT: s_lshr_b64 s[6:7], s[4:5], s3 -; GCN-NEXT: s_lshr_b64 s[2:3], s[0:1], s3 -; GCN-NEXT: s_lshl_b64 s[8:9], s[4:5], s8 -; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GCN-NEXT: s_lshr_b64 s[4:5], s[4:5], s10 +; GCN-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 +; GCN-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 +; GCN-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 +; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GCN-NEXT: s_cmp_lg_u32 s11, 0 -; GCN-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; GCN-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GCN-NEXT: s_cmp_lg_u32 s12, 0 ; GCN-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GCN-NEXT: s_cmp_lg_u32 s11, 0 @@ -1771,24 +1773,26 @@ define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) { ; ; GFX10PLUS-LABEL: s_lshr_i65: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_and_b64 s[4:5], s[2:3], 1 -; GFX10PLUS-NEXT: s_sub_i32 s10, s3, 64 -; GFX10PLUS-NEXT: s_sub_i32 s2, 64, s3 -; GFX10PLUS-NEXT: s_cmp_lt_u32 s3, 64 +; GFX10PLUS-NEXT: s_mov_b32 s4, s3 +; GFX10PLUS-NEXT: 
s_mov_b32 s3, 0 +; GFX10PLUS-NEXT: s_sub_i32 s10, s4, 64 +; GFX10PLUS-NEXT: s_and_b64 s[2:3], s[2:3], 1 +; GFX10PLUS-NEXT: s_sub_i32 s5, 64, s4 +; GFX10PLUS-NEXT: s_cmp_lt_u32 s4, 64 ; GFX10PLUS-NEXT: s_cselect_b32 s11, 1, 0 -; GFX10PLUS-NEXT: s_cmp_eq_u32 s3, 0 +; GFX10PLUS-NEXT: s_cmp_eq_u32 s4, 0 ; GFX10PLUS-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10PLUS-NEXT: s_lshr_b64 s[6:7], s[0:1], s3 -; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[4:5], s2 -; GFX10PLUS-NEXT: s_lshr_b64 s[2:3], s[4:5], s3 +; GFX10PLUS-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 +; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[2:3], s5 +; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[2:3], s4 ; GFX10PLUS-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[4:5], s10 +; GFX10PLUS-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0 -; GFX10PLUS-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] +; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] ; GFX10PLUS-NEXT: s_cmp_lg_u32 s12, 0 -; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] +; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0 -; GFX10PLUS-NEXT: s_cselect_b32 s2, s2, 0 +; GFX10PLUS-NEXT: s_cselect_b32 s2, s4, 0 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i65 %value, %amount ret i65 %result @@ -1797,22 +1801,22 @@ define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) { define amdgpu_ps i65 @s_lshr_i65_33(i65 inreg %value) { ; GCN-LABEL: s_lshr_i65_33: ; GCN: ; %bb.0: -; GCN-NEXT: s_and_b64 s[2:3], s[2:3], 1 -; GCN-NEXT: s_lshr_b32 s0, s1, 1 -; GCN-NEXT: s_mov_b32 s1, 0 -; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 31 -; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GCN-NEXT: s_lshr_b32 s2, s3, 1 +; GCN-NEXT: s_mov_b32 s3, 0 +; GCN-NEXT: s_and_b64 s[4:5], s[2:3], 1 +; GCN-NEXT: s_lshr_b32 s2, s1, 1 +; GCN-NEXT: s_lshl_b64 s[0:1], s[4:5], 31 +; GCN-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GCN-NEXT: s_lshr_b32 s2, s5, 1 ; GCN-NEXT: ; return to shader part 
epilog ; ; GFX10PLUS-LABEL: s_lshr_i65_33: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_and_b64 s[2:3], s[2:3], 1 -; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1 -; GFX10PLUS-NEXT: s_mov_b32 s1, 0 -; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 31 -; GFX10PLUS-NEXT: s_lshr_b32 s2, s3, 1 -; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX10PLUS-NEXT: s_mov_b32 s3, 0 +; GFX10PLUS-NEXT: s_and_b64 s[4:5], s[2:3], 1 +; GFX10PLUS-NEXT: s_lshr_b32 s2, s1, 1 +; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[4:5], 31 +; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX10PLUS-NEXT: s_lshr_b32 s2, s5, 1 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i65 %value, 33 ret i65 %result diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll index bac80f0777c02..ac6660b76ded9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll @@ -1440,6 +1440,7 @@ define i65 @v_sext_inreg_i65_22(i65 %value) { ; GFX6-LABEL: v_sext_inreg_i65_22: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v3, 0 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 22 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 10, v1 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1455,6 +1456,7 @@ define i65 @v_sext_inreg_i65_22(i65 %value) { ; GFX8-LABEL: v_sext_inreg_i65_22: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 22, v[2:3] ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 10, v1 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1470,6 +1472,7 @@ define i65 @v_sext_inreg_i65_22(i65 %value) { ; GFX9-LABEL: v_sext_inreg_i65_22: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 22, v[2:3] ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 10, v1 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1484,6 +1487,7 @@ define i65 
@v_sext_inreg_i65_22(i65 %value) { ; GFX10PLUS-LABEL: v_sext_inreg_i65_22: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: v_mov_b32_e32 v3, 0 ; GFX10PLUS-NEXT: v_lshlrev_b64 v[2:3], 22, v[2:3] ; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v3, 10, v1 ; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 0, v[0:1] @@ -1555,29 +1559,29 @@ define i65 @v_sext_inreg_i65_33(i65 %value) { define amdgpu_ps i65 @s_sext_inreg_i65_18(i65 inreg %value) { ; GCN-LABEL: s_sext_inreg_i65_18: ; GCN: ; %bb.0: -; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], 18 -; GCN-NEXT: s_lshr_b32 s4, s1, 14 -; GCN-NEXT: s_mov_b32 s5, 0 -; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 +; GCN-NEXT: s_mov_b32 s3, 0 +; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 18 +; GCN-NEXT: s_lshr_b32 s2, s1, 14 +; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[2:3] +; GCN-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 ; GCN-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x2e0000 -; GCN-NEXT: s_lshl_b32 s7, s2, 14 -; GCN-NEXT: s_mov_b32 s6, s5 +; GCN-NEXT: s_lshl_b32 s7, s4, 14 +; GCN-NEXT: s_mov_b32 s6, s3 ; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 18 +; GCN-NEXT: s_ashr_i64 s[2:3], s[4:5], 18 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_sext_inreg_i65_18: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 18 -; GFX10PLUS-NEXT: s_lshr_b32 s4, s1, 14 -; GFX10PLUS-NEXT: s_mov_b32 s5, 0 +; GFX10PLUS-NEXT: s_mov_b32 s3, 0 +; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 18 +; GFX10PLUS-NEXT: s_lshr_b32 s2, s1, 14 ; GFX10PLUS-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x2e0000 -; GFX10PLUS-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX10PLUS-NEXT: s_mov_b32 s6, s5 -; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 -; GFX10PLUS-NEXT: s_lshl_b32 s7, s2, 14 -; GFX10PLUS-NEXT: s_ashr_i64 s[2:3], s[2:3], 18 +; GFX10PLUS-NEXT: s_or_b64 s[4:5], s[4:5], s[2:3] +; GFX10PLUS-NEXT: s_mov_b32 s6, s3 +; GFX10PLUS-NEXT: s_bfe_i64 s[4:5], 
s[4:5], 0x10000 +; GFX10PLUS-NEXT: s_lshl_b32 s7, s4, 14 +; GFX10PLUS-NEXT: s_ashr_i64 s[2:3], s[4:5], 18 ; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] ; GFX10PLUS-NEXT: ; return to shader part epilog %shl = shl i65 %value, 18 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll index b12e915c7d21b..3e0b29665aa88 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll @@ -1720,20 +1720,22 @@ define i65 @v_shl_i65_33(i65 %value) { define amdgpu_ps i65 @s_shl_i65(i65 inreg %value, i65 inreg %amount) { ; GCN-LABEL: s_shl_i65: ; GCN: ; %bb.0: -; GCN-NEXT: s_sub_i32 s10, s3, 64 -; GCN-NEXT: s_sub_i32 s6, 64, s3 -; GCN-NEXT: s_cmp_lt_u32 s3, 64 +; GCN-NEXT: s_mov_b32 s4, s3 +; GCN-NEXT: s_sub_i32 s10, s4, 64 +; GCN-NEXT: s_sub_i32 s5, 64, s4 +; GCN-NEXT: s_cmp_lt_u32 s4, 64 +; GCN-NEXT: s_mov_b32 s3, 0 ; GCN-NEXT: s_cselect_b32 s11, 1, 0 -; GCN-NEXT: s_cmp_eq_u32 s3, 0 +; GCN-NEXT: s_cmp_eq_u32 s4, 0 ; GCN-NEXT: s_cselect_b32 s12, 1, 0 -; GCN-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 -; GCN-NEXT: s_lshl_b64 s[8:9], s[2:3], s3 -; GCN-NEXT: s_lshl_b64 s[4:5], s[0:1], s3 -; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GCN-NEXT: s_lshl_b64 s[6:7], s[0:1], s4 +; GCN-NEXT: s_lshr_b64 s[8:9], s[0:1], s5 +; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], s4 +; GCN-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] ; GCN-NEXT: s_lshl_b64 s[8:9], s[0:1], s10 ; GCN-NEXT: s_cmp_lg_u32 s11, 0 -; GCN-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 -; GCN-NEXT: s_cselect_b32 s3, s6, s8 +; GCN-NEXT: s_cselect_b64 s[0:1], s[6:7], 0 +; GCN-NEXT: s_cselect_b32 s3, s4, s8 ; GCN-NEXT: s_cmp_lg_u32 s12, 0 ; GCN-NEXT: s_cselect_b32 s2, s2, s3 ; GCN-NEXT: ; return to shader part epilog @@ -1741,19 +1743,21 @@ define amdgpu_ps i65 @s_shl_i65(i65 inreg %value, i65 inreg %amount) { ; GFX10PLUS-LABEL: s_shl_i65: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_sub_i32 s10, s3, 64 -; GFX10PLUS-NEXT: s_sub_i32 s4, 64, s3 +; GFX10PLUS-NEXT: 
s_sub_i32 s5, 64, s3 ; GFX10PLUS-NEXT: s_cmp_lt_u32 s3, 64 +; GFX10PLUS-NEXT: s_mov_b32 s4, s3 ; GFX10PLUS-NEXT: s_cselect_b32 s11, 1, 0 ; GFX10PLUS-NEXT: s_cmp_eq_u32 s3, 0 +; GFX10PLUS-NEXT: s_mov_b32 s3, 0 ; GFX10PLUS-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 -; GFX10PLUS-NEXT: s_lshl_b64 s[6:7], s[2:3], s3 -; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[0:1], s3 -; GFX10PLUS-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX10PLUS-NEXT: s_lshl_b64 s[6:7], s[0:1], s10 +; GFX10PLUS-NEXT: s_lshr_b64 s[6:7], s[0:1], s5 +; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[2:3], s4 +; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[0:1], s4 +; GFX10PLUS-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[0:1], s10 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0 -; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[8:9], 0 -; GFX10PLUS-NEXT: s_cselect_b32 s3, s4, s6 +; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 +; GFX10PLUS-NEXT: s_cselect_b32 s3, s6, s8 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s12, 0 ; GFX10PLUS-NEXT: s_cselect_b32 s2, s2, s3 ; GFX10PLUS-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index 6f4f7c27a5147..9220e48cd4b07 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -1604,8 +1604,8 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; GISEL: ; %bb.0: ; %fp-to-i-entry ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v4, v0 -; GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v4 ; GISEL-NEXT: v_mov_b32_e32 v6, 0 +; GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v4 ; GISEL-NEXT: v_lshrrev_b64 v[0:1], 7, v[5:6] ; GISEL-NEXT: v_mov_b32_e32 v1, 0x7f ; GISEL-NEXT: s_mov_b64 s[4:5], 0 @@ -1955,8 +1955,8 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; GISEL: ; %bb.0: ; %fp-to-i-entry ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v4, v0 -; GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v4 ; 
GISEL-NEXT: v_mov_b32_e32 v6, 0 +; GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v4 ; GISEL-NEXT: v_lshrrev_b64 v[0:1], 7, v[5:6] ; GISEL-NEXT: v_mov_b32_e32 v1, 0x7f ; GISEL-NEXT: s_mov_b64 s[4:5], 0 diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll index b3c06756a8987..c499d457558a8 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -4074,14 +4074,12 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-GISEL-NEXT: v_not_b32_e32 v2, 31 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; VI-GISEL-NEXT: s_and_b32 s0, 0xffff, s0 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-GISEL-NEXT: v_or_b32_e32 v2, s0, v2 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; @@ -4191,15 +4189,12 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] -; VI-GISEL-NEXT: s_and_b32 s2, 0xffff, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; VI-GISEL-NEXT: s_lshl_b32 s0, s2, 16 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: v_add_u16_e32 v2, 0xffe0, v3 -; VI-GISEL-NEXT: v_or_b32_e32 v2, s0, v2 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; From d654ee9fb939b897ae8b57ae8a0386c9a5064674 Mon Sep 17 00:00:00 
2001 From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= Date: Fri, 25 Oct 2024 15:18:08 +0200 Subject: [PATCH 5/6] anyext --- .../GlobalISel/CombinerHelperArtifacts.cpp | 6 +- .../AArch64/GlobalISel/combine-unmerge.mir | 28 +++--- llvm/test/CodeGen/AArch64/bswap.ll | 26 ++++-- llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll | 24 +++--- llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll | 86 +++++++++---------- .../CodeGen/AMDGPU/GlobalISel/sext_inreg.ll | 36 ++++---- llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll | 40 ++++----- llvm/test/CodeGen/AMDGPU/fptoi.i128.ll | 4 +- 8 files changed, 123 insertions(+), 127 deletions(-) diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp index 047e411eb76cf..8f4095f01be7a 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp @@ -45,14 +45,14 @@ bool CombinerHelper::matchMergeXAndUndef(const MachineInstr &MI, // // -> // - // %0:_(s16) = G_ZEXT %bits_0_7:(s8) + // %0:_(s16) = G_ANYEXT %bits_0_7:(s8) // - if (!isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {DstTy, SrcTy}})) + if (!isLegalOrBeforeLegalizer({TargetOpcode::G_ANYEXT, {DstTy, SrcTy}})) return false; MatchInfo = [=](MachineIRBuilder &B) { - B.buildZExt(Dst, Merge->getSourceReg(0)); + B.buildAnyExt(Dst, Merge->getSourceReg(0)); }; return true; } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir index e4e7f315397ff..4e9adf847260b 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir @@ -9,9 +9,9 @@ name: test_combine_unmerge_merge body: | bb.1: ; CHECK-LABEL: name: test_combine_unmerge_merge - ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: $w0 = COPY [[C]](s32) - ; CHECK-NEXT: $w1 = COPY [[C]](s32) + ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: $w0 = 
COPY [[DEF]](s32) + ; CHECK-NEXT: $w1 = COPY [[DEF]](s32) %0:_(s32) = G_IMPLICIT_DEF %1:_(s32) = G_IMPLICIT_DEF %2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32) @@ -113,11 +113,9 @@ name: test_combine_unmerge_bitcast_merge body: | bb.1: ; CHECK-LABEL: name: test_combine_unmerge_bitcast_merge - ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[C]](s64) - ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>) - ; CHECK-NEXT: $w0 = COPY [[UV]](s32) - ; CHECK-NEXT: $w1 = COPY [[UV1]](s32) + ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: $w0 = COPY [[DEF]](s32) + ; CHECK-NEXT: $w1 = COPY [[DEF]](s32) %0:_(s32) = G_IMPLICIT_DEF %1:_(s32) = G_IMPLICIT_DEF %2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32) @@ -135,11 +133,11 @@ name: test_combine_unmerge_merge_incompatible_types body: | bb.1: ; CHECK-LABEL: name: test_combine_unmerge_merge_incompatible_types - ; CHECK: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 0 - ; CHECK-NEXT: $h0 = COPY [[C]](s16) - ; CHECK-NEXT: $h1 = COPY [[C]](s16) - ; CHECK-NEXT: $h2 = COPY [[C]](s16) - ; CHECK-NEXT: $h3 = COPY [[C]](s16) + ; CHECK: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; CHECK-NEXT: $h0 = COPY [[DEF]](s16) + ; CHECK-NEXT: $h1 = COPY [[DEF]](s16) + ; CHECK-NEXT: $h2 = COPY [[DEF]](s16) + ; CHECK-NEXT: $h3 = COPY [[DEF]](s16) %0:_(s32) = G_IMPLICIT_DEF %1:_(s32) = G_IMPLICIT_DEF %2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32) @@ -544,7 +542,7 @@ body: | bb.1: ; CHECK-LABEL: name: test_merge_undef ; CHECK: %opaque:_(s64) = COPY $x0 - ; CHECK-NEXT: %me:_(s128) = G_ZEXT %opaque(s64) + ; CHECK-NEXT: %me:_(s128) = G_ANYEXT %opaque(s64) ; CHECK-NEXT: $q0 = COPY %me(s128) %opaque:_(s64) = COPY $x0 %def:_(s64) = G_IMPLICIT_DEF @@ -560,7 +558,7 @@ body: | ; CHECK-LABEL: name: test_merge_undef_multi_use ; CHECK: %opaque:_(s64) = COPY $x0 ; CHECK-NEXT: %def:_(s64) = G_IMPLICIT_DEF - ; CHECK-NEXT: %me:_(s128) = G_ZEXT %opaque(s64) + ; 
CHECK-NEXT: %me:_(s128) = G_ANYEXT %opaque(s64) ; CHECK-NEXT: $q0 = COPY %me(s128) ; CHECK-NEXT: $x0 = COPY %def(s64) %opaque:_(s64) = COPY $x0 diff --git a/llvm/test/CodeGen/AArch64/bswap.ll b/llvm/test/CodeGen/AArch64/bswap.ll index afc1d932840ff..e86f55d63f754 100644 --- a/llvm/test/CodeGen/AArch64/bswap.ll +++ b/llvm/test/CodeGen/AArch64/bswap.ll @@ -45,14 +45,24 @@ define i64 @bswap_i16_to_i64_anyext(i16 %a) { ; The zext here is optimised to an any_extend during isel.. define i128 @bswap_i16_to_i128_anyext(i16 %a) { -; CHECK-LABEL: bswap_i16_to_i128_anyext: -; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: mov x0, xzr -; CHECK-NEXT: rev w8, w8 -; CHECK-NEXT: lsr w8, w8, #16 -; CHECK-NEXT: lsl x1, x8, #48 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: bswap_i16_to_i128_anyext: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: mov w8, w0 +; CHECK-SD-NEXT: mov x0, xzr +; CHECK-SD-NEXT: rev w8, w8 +; CHECK-SD-NEXT: lsr w8, w8, #16 +; CHECK-SD-NEXT: lsl x1, x8, #48 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: bswap_i16_to_i128_anyext: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, w0 +; CHECK-GI-NEXT: mov x0, xzr +; CHECK-GI-NEXT: rev w8, w8 +; CHECK-GI-NEXT: lsr w8, w8, #16 +; CHECK-GI-NEXT: and x8, x8, #0xffff +; CHECK-GI-NEXT: lsl x1, x8, #48 +; CHECK-GI-NEXT: ret %3 = call i16 @llvm.bswap.i16(i16 %a) %4 = zext i16 %3 to i128 %5 = shl i128 %4, 112 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll index bda97c980acee..493e8cef63890 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -1884,22 +1884,22 @@ define amdgpu_ps i65 @s_ashr_i65(i65 inreg %value, i65 inreg %amount) { define amdgpu_ps i65 @s_ashr_i65_33(i65 inreg %value) { ; GCN-LABEL: s_ashr_i65_33: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b32 s3, 0 -; GCN-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000 -; GCN-NEXT: s_lshr_b32 s2, s1, 1 -; GCN-NEXT: s_lshl_b64 s[0:1], s[4:5], 31 -; GCN-NEXT: s_or_b64 s[0:1], s[2:3], 
s[0:1] -; GCN-NEXT: s_ashr_i32 s2, s5, 1 +; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 +; GCN-NEXT: s_lshr_b32 s0, s1, 1 +; GCN-NEXT: s_mov_b32 s1, 0 +; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 31 +; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GCN-NEXT: s_ashr_i32 s2, s3, 1 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_ashr_i65_33: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_mov_b32 s3, 0 -; GFX10PLUS-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000 -; GFX10PLUS-NEXT: s_lshr_b32 s2, s1, 1 -; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[4:5], 31 -; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX10PLUS-NEXT: s_ashr_i32 s2, s5, 1 +; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 +; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1 +; GFX10PLUS-NEXT: s_mov_b32 s1, 0 +; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 31 +; GFX10PLUS-NEXT: s_ashr_i32 s2, s3, 1 +; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr i65 %value, 33 ret i65 %result diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll index 9f678a5e46fbb..cc185aff9eff2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -1574,8 +1574,8 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) { ; GFX6-LABEL: v_lshr_i65: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, 0 ; GFX6-NEXT: v_and_b32_e32 v4, 1, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, 0 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xffffffc0, v3 ; GFX6-NEXT: v_lshr_b64 v[6:7], v[0:1], v3 @@ -1596,8 +1596,8 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) { ; GFX8-LABEL: v_lshr_i65: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: v_and_b32_e32 v4, 1, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v3 ; GFX8-NEXT: 
v_add_u32_e32 v2, vcc, 0xffffffc0, v3 ; GFX8-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] @@ -1618,8 +1618,8 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) { ; GFX9-LABEL: v_lshr_i65: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_and_b32_e32 v4, 1, v2 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_sub_u32_e32 v8, 64, v3 ; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffc0, v3 ; GFX9-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] @@ -1688,8 +1688,8 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 31 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 @@ -1700,8 +1700,8 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v3 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 @@ -1712,8 +1712,8 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v3 ; GFX9-NEXT: v_or_b32_e32 v0, v2, v0 @@ -1749,22 +1749,20 @@ define i65 @v_lshr_i65_33(i65 %value) { define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) { ; GCN-LABEL: s_lshr_i65: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b32 s4, s3 -; GCN-NEXT: s_mov_b32 s3, 0 -; GCN-NEXT: s_and_b64 s[2:3], s[2:3], 1 -; GCN-NEXT: s_sub_i32 s10, s4, 64 -; GCN-NEXT: 
s_sub_i32 s8, 64, s4 -; GCN-NEXT: s_cmp_lt_u32 s4, 64 +; GCN-NEXT: s_and_b64 s[4:5], s[2:3], 1 +; GCN-NEXT: s_sub_i32 s10, s3, 64 +; GCN-NEXT: s_sub_i32 s8, 64, s3 +; GCN-NEXT: s_cmp_lt_u32 s3, 64 ; GCN-NEXT: s_cselect_b32 s11, 1, 0 -; GCN-NEXT: s_cmp_eq_u32 s4, 0 +; GCN-NEXT: s_cmp_eq_u32 s3, 0 ; GCN-NEXT: s_cselect_b32 s12, 1, 0 -; GCN-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 -; GCN-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 -; GCN-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 -; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 +; GCN-NEXT: s_lshr_b64 s[6:7], s[4:5], s3 +; GCN-NEXT: s_lshr_b64 s[2:3], s[0:1], s3 +; GCN-NEXT: s_lshl_b64 s[8:9], s[4:5], s8 +; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] +; GCN-NEXT: s_lshr_b64 s[4:5], s[4:5], s10 ; GCN-NEXT: s_cmp_lg_u32 s11, 0 -; GCN-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] +; GCN-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] ; GCN-NEXT: s_cmp_lg_u32 s12, 0 ; GCN-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GCN-NEXT: s_cmp_lg_u32 s11, 0 @@ -1773,26 +1771,24 @@ define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) { ; ; GFX10PLUS-LABEL: s_lshr_i65: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_mov_b32 s4, s3 -; GFX10PLUS-NEXT: s_mov_b32 s3, 0 -; GFX10PLUS-NEXT: s_sub_i32 s10, s4, 64 -; GFX10PLUS-NEXT: s_and_b64 s[2:3], s[2:3], 1 -; GFX10PLUS-NEXT: s_sub_i32 s5, 64, s4 -; GFX10PLUS-NEXT: s_cmp_lt_u32 s4, 64 +; GFX10PLUS-NEXT: s_and_b64 s[4:5], s[2:3], 1 +; GFX10PLUS-NEXT: s_sub_i32 s10, s3, 64 +; GFX10PLUS-NEXT: s_sub_i32 s2, 64, s3 +; GFX10PLUS-NEXT: s_cmp_lt_u32 s3, 64 ; GFX10PLUS-NEXT: s_cselect_b32 s11, 1, 0 -; GFX10PLUS-NEXT: s_cmp_eq_u32 s4, 0 +; GFX10PLUS-NEXT: s_cmp_eq_u32 s3, 0 ; GFX10PLUS-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10PLUS-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 -; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[2:3], s5 -; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[2:3], s4 +; GFX10PLUS-NEXT: s_lshr_b64 s[6:7], s[0:1], s3 +; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[4:5], s2 +; GFX10PLUS-NEXT: 
s_lshr_b64 s[2:3], s[4:5], s3 ; GFX10PLUS-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GFX10PLUS-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 +; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[4:5], s10 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0 -; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] +; GFX10PLUS-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] ; GFX10PLUS-NEXT: s_cmp_lg_u32 s12, 0 -; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] +; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] ; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0 -; GFX10PLUS-NEXT: s_cselect_b32 s2, s4, 0 +; GFX10PLUS-NEXT: s_cselect_b32 s2, s2, 0 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i65 %value, %amount ret i65 %result @@ -1801,22 +1797,22 @@ define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) { define amdgpu_ps i65 @s_lshr_i65_33(i65 inreg %value) { ; GCN-LABEL: s_lshr_i65_33: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b32 s3, 0 -; GCN-NEXT: s_and_b64 s[4:5], s[2:3], 1 -; GCN-NEXT: s_lshr_b32 s2, s1, 1 -; GCN-NEXT: s_lshl_b64 s[0:1], s[4:5], 31 -; GCN-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GCN-NEXT: s_lshr_b32 s2, s5, 1 +; GCN-NEXT: s_and_b64 s[2:3], s[2:3], 1 +; GCN-NEXT: s_lshr_b32 s0, s1, 1 +; GCN-NEXT: s_mov_b32 s1, 0 +; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 31 +; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GCN-NEXT: s_lshr_b32 s2, s3, 1 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_lshr_i65_33: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_mov_b32 s3, 0 -; GFX10PLUS-NEXT: s_and_b64 s[4:5], s[2:3], 1 -; GFX10PLUS-NEXT: s_lshr_b32 s2, s1, 1 -; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[4:5], 31 -; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX10PLUS-NEXT: s_lshr_b32 s2, s5, 1 +; GFX10PLUS-NEXT: s_and_b64 s[2:3], s[2:3], 1 +; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1 +; GFX10PLUS-NEXT: s_mov_b32 s1, 0 +; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 31 +; GFX10PLUS-NEXT: s_lshr_b32 s2, s3, 1 +; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX10PLUS-NEXT: ; 
return to shader part epilog %result = lshr i65 %value, 33 ret i65 %result diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll index ac6660b76ded9..bac80f0777c02 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll @@ -1440,7 +1440,6 @@ define i65 @v_sext_inreg_i65_22(i65 %value) { ; GFX6-LABEL: v_sext_inreg_i65_22: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v3, 0 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 22 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 10, v1 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1456,7 +1455,6 @@ define i65 @v_sext_inreg_i65_22(i65 %value) { ; GFX8-LABEL: v_sext_inreg_i65_22: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 22, v[2:3] ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 10, v1 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1472,7 +1470,6 @@ define i65 @v_sext_inreg_i65_22(i65 %value) { ; GFX9-LABEL: v_sext_inreg_i65_22: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 22, v[2:3] ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 10, v1 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1487,7 +1484,6 @@ define i65 @v_sext_inreg_i65_22(i65 %value) { ; GFX10PLUS-LABEL: v_sext_inreg_i65_22: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_mov_b32_e32 v3, 0 ; GFX10PLUS-NEXT: v_lshlrev_b64 v[2:3], 22, v[2:3] ; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v3, 10, v1 ; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 0, v[0:1] @@ -1559,29 +1555,29 @@ define i65 @v_sext_inreg_i65_33(i65 %value) { define amdgpu_ps i65 @s_sext_inreg_i65_18(i65 inreg %value) { ; GCN-LABEL: s_sext_inreg_i65_18: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b32 s3, 0 -; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 18 -; GCN-NEXT: s_lshr_b32 s2, s1, 
14 -; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[2:3] -; GCN-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 +; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], 18 +; GCN-NEXT: s_lshr_b32 s4, s1, 14 +; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] +; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 ; GCN-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x2e0000 -; GCN-NEXT: s_lshl_b32 s7, s4, 14 -; GCN-NEXT: s_mov_b32 s6, s3 +; GCN-NEXT: s_lshl_b32 s7, s2, 14 +; GCN-NEXT: s_mov_b32 s6, s5 ; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GCN-NEXT: s_ashr_i64 s[2:3], s[4:5], 18 +; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 18 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_sext_inreg_i65_18: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_mov_b32 s3, 0 -; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 18 -; GFX10PLUS-NEXT: s_lshr_b32 s2, s1, 14 +; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 18 +; GFX10PLUS-NEXT: s_lshr_b32 s4, s1, 14 +; GFX10PLUS-NEXT: s_mov_b32 s5, 0 ; GFX10PLUS-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x2e0000 -; GFX10PLUS-NEXT: s_or_b64 s[4:5], s[4:5], s[2:3] -; GFX10PLUS-NEXT: s_mov_b32 s6, s3 -; GFX10PLUS-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 -; GFX10PLUS-NEXT: s_lshl_b32 s7, s4, 14 -; GFX10PLUS-NEXT: s_ashr_i64 s[2:3], s[4:5], 18 +; GFX10PLUS-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] +; GFX10PLUS-NEXT: s_mov_b32 s6, s5 +; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 +; GFX10PLUS-NEXT: s_lshl_b32 s7, s2, 14 +; GFX10PLUS-NEXT: s_ashr_i64 s[2:3], s[2:3], 18 ; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] ; GFX10PLUS-NEXT: ; return to shader part epilog %shl = shl i65 %value, 18 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll index 3e0b29665aa88..b12e915c7d21b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll @@ -1720,22 +1720,20 @@ define i65 @v_shl_i65_33(i65 %value) { define amdgpu_ps i65 @s_shl_i65(i65 inreg %value, i65 inreg %amount) { ; GCN-LABEL: s_shl_i65: ; 
GCN: ; %bb.0: -; GCN-NEXT: s_mov_b32 s4, s3 -; GCN-NEXT: s_sub_i32 s10, s4, 64 -; GCN-NEXT: s_sub_i32 s5, 64, s4 -; GCN-NEXT: s_cmp_lt_u32 s4, 64 -; GCN-NEXT: s_mov_b32 s3, 0 +; GCN-NEXT: s_sub_i32 s10, s3, 64 +; GCN-NEXT: s_sub_i32 s6, 64, s3 +; GCN-NEXT: s_cmp_lt_u32 s3, 64 ; GCN-NEXT: s_cselect_b32 s11, 1, 0 -; GCN-NEXT: s_cmp_eq_u32 s4, 0 +; GCN-NEXT: s_cmp_eq_u32 s3, 0 ; GCN-NEXT: s_cselect_b32 s12, 1, 0 -; GCN-NEXT: s_lshl_b64 s[6:7], s[0:1], s4 -; GCN-NEXT: s_lshr_b64 s[8:9], s[0:1], s5 -; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], s4 -; GCN-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; GCN-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 +; GCN-NEXT: s_lshl_b64 s[8:9], s[2:3], s3 +; GCN-NEXT: s_lshl_b64 s[4:5], s[0:1], s3 +; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GCN-NEXT: s_lshl_b64 s[8:9], s[0:1], s10 ; GCN-NEXT: s_cmp_lg_u32 s11, 0 -; GCN-NEXT: s_cselect_b64 s[0:1], s[6:7], 0 -; GCN-NEXT: s_cselect_b32 s3, s4, s8 +; GCN-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 +; GCN-NEXT: s_cselect_b32 s3, s6, s8 ; GCN-NEXT: s_cmp_lg_u32 s12, 0 ; GCN-NEXT: s_cselect_b32 s2, s2, s3 ; GCN-NEXT: ; return to shader part epilog @@ -1743,21 +1741,19 @@ define amdgpu_ps i65 @s_shl_i65(i65 inreg %value, i65 inreg %amount) { ; GFX10PLUS-LABEL: s_shl_i65: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_sub_i32 s10, s3, 64 -; GFX10PLUS-NEXT: s_sub_i32 s5, 64, s3 +; GFX10PLUS-NEXT: s_sub_i32 s4, 64, s3 ; GFX10PLUS-NEXT: s_cmp_lt_u32 s3, 64 -; GFX10PLUS-NEXT: s_mov_b32 s4, s3 ; GFX10PLUS-NEXT: s_cselect_b32 s11, 1, 0 ; GFX10PLUS-NEXT: s_cmp_eq_u32 s3, 0 -; GFX10PLUS-NEXT: s_mov_b32 s3, 0 ; GFX10PLUS-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10PLUS-NEXT: s_lshr_b64 s[6:7], s[0:1], s5 -; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[2:3], s4 -; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[0:1], s4 -; GFX10PLUS-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[0:1], s10 +; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 +; GFX10PLUS-NEXT: s_lshl_b64 s[6:7], s[2:3], s3 +; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], 
s[0:1], s3 +; GFX10PLUS-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX10PLUS-NEXT: s_lshl_b64 s[6:7], s[0:1], s10 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0 -; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 -; GFX10PLUS-NEXT: s_cselect_b32 s3, s6, s8 +; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[8:9], 0 +; GFX10PLUS-NEXT: s_cselect_b32 s3, s4, s6 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s12, 0 ; GFX10PLUS-NEXT: s_cselect_b32 s2, s2, s3 ; GFX10PLUS-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index 9220e48cd4b07..6f4f7c27a5147 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -1604,8 +1604,8 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; GISEL: ; %bb.0: ; %fp-to-i-entry ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v4, v0 -; GISEL-NEXT: v_mov_b32_e32 v6, 0 ; GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v4 +; GISEL-NEXT: v_mov_b32_e32 v6, 0 ; GISEL-NEXT: v_lshrrev_b64 v[0:1], 7, v[5:6] ; GISEL-NEXT: v_mov_b32_e32 v1, 0x7f ; GISEL-NEXT: s_mov_b64 s[4:5], 0 @@ -1955,8 +1955,8 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; GISEL: ; %bb.0: ; %fp-to-i-entry ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v4, v0 -; GISEL-NEXT: v_mov_b32_e32 v6, 0 ; GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v4 +; GISEL-NEXT: v_mov_b32_e32 v6, 0 ; GISEL-NEXT: v_lshrrev_b64 v[0:1], 7, v[5:6] ; GISEL-NEXT: v_mov_b32_e32 v1, 0x7f ; GISEL-NEXT: s_mov_b64 s[4:5], 0 From d9baace55e4196b844e8459777e971d8b7e95d2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= Date: Tue, 5 Nov 2024 20:50:09 +0100 Subject: [PATCH 6/6] address review comments --- .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 2 +- llvm/include/llvm/Target/GlobalISel/Combine.td | 4 ++-- llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 7 ++----- .../AArch64/GlobalISel/combine-unmerge.mir | 4 ++-- 
llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll | 13 ++++++------- llvm/test/CodeGen/AArch64/extract-vector-elt.ll | 15 +++++++++++---- .../GlobalISel/combine-amdgpu-cvt-f32-ubyte.mir | 6 ++++-- .../CodeGen/AMDGPU/shrink-add-sub-constant.ll | 7 ++++++- 8 files changed, 34 insertions(+), 24 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index ff97e5f15bb1b..15b06fe0fb87f 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -925,7 +925,7 @@ class CombinerHelper { bool matchUnmergeValuesAnyExtBuildVector(const MachineInstr &MI, BuildFnTy &MatchInfo); - // merge_values(_, undef) -> zext + // merge_values(_, undef) -> anyext bool matchMergeXAndUndef(const MachineInstr &MI, BuildFnTy &MatchInfo); private: diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 643898ce36674..37825f3906db7 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -420,7 +420,7 @@ def binop_right_undef_to_undef: GICombineRule< def unary_undef_to_zero: GICombineRule< (defs root:$root), - (match (wip_match_opcode G_ABS, G_ZEXT):$root, + (match (wip_match_opcode G_ABS):$root, [{ return Helper.matchOperandIsUndef(*${root}, 1); }]), (apply [{ Helper.replaceInstWithConstant(*${root}, 0); }])>; @@ -856,7 +856,7 @@ def unmerge_zext_to_zext : GICombineRule< (apply [{ Helper.applyCombineUnmergeZExtToZExt(*${d}); }]) >; -/// Transform merge_x_undef -> zext. +/// Transform merge_x_undef -> anyext. 
def merge_of_x_and_undef : GICombineRule < (defs root:$root, build_fn_matchinfo:$matchinfo), (match (G_IMPLICIT_DEF $undef), diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 077175f2e0452..ede8d82fc1a35 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -2944,11 +2944,8 @@ void CombinerHelper::replaceInstWithFConstant(MachineInstr &MI, double C) { void CombinerHelper::replaceInstWithConstant(MachineInstr &MI, int64_t C) { assert(MI.getNumDefs() == 1 && "Expected only one def?"); - LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); - if (isLegalOrBeforeLegalizer({TargetOpcode::G_CONSTANT, {DstTy}})) { - Builder.buildConstant(MI.getOperand(0), C); - MI.eraseFromParent(); - } + Builder.buildConstant(MI.getOperand(0), C); + MI.eraseFromParent(); } void CombinerHelper::replaceInstWithConstant(MachineInstr &MI, APInt C) { diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir index 4e9adf847260b..d6a79780b8bb1 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir @@ -535,7 +535,7 @@ body: | $q1 = COPY %un2(s128) ... -# Check that we zext the merge +# Check that we anyext the merge --- name: test_merge_undef body: | @@ -550,7 +550,7 @@ body: | $q0 = COPY %me(s128) ... 
-# Check that we don't zext the merge, multi-use +# Check that we don't anyext the merge, multi-use --- name: test_merge_undef_multi_use body: | diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll index 98c1a1bef569a..a39c2b5d14ddd 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll @@ -322,18 +322,17 @@ define void @typei1_orig(i64 %a, ptr %p, ptr %q) { ; ; CHECK-GI-LABEL: typei1_orig: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ldr q0, [x2] +; CHECK-GI-NEXT: ldr q1, [x2] ; CHECK-GI-NEXT: cmp x0, #0 +; CHECK-GI-NEXT: movi v0.2d, #0xffffffffffffffff ; CHECK-GI-NEXT: cset w8, gt -; CHECK-GI-NEXT: neg v0.8h, v0.8h -; CHECK-GI-NEXT: dup v1.8h, w8 -; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h -; CHECK-GI-NEXT: mul v1.8h, v0.8h, v1.8h -; CHECK-GI-NEXT: cmeq v0.8h, v0.8h, #0 +; CHECK-GI-NEXT: neg v1.8h, v1.8h +; CHECK-GI-NEXT: dup v2.8h, w8 ; CHECK-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-GI-NEXT: mul v1.8h, v1.8h, v2.8h ; CHECK-GI-NEXT: cmeq v1.8h, v1.8h, #0 ; CHECK-GI-NEXT: mvn v1.16b, v1.16b -; CHECK-GI-NEXT: uzp1 v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: uzp1 v0.16b, v1.16b, v0.16b ; CHECK-GI-NEXT: shl v0.16b, v0.16b, #7 ; CHECK-GI-NEXT: sshr v0.16b, v0.16b, #7 ; CHECK-GI-NEXT: str q0, [x1] diff --git a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll index e89e1516fb1f5..5e5fdd6d31705 100644 --- a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll @@ -8,10 +8,17 @@ ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for extract_v4i32_vector_extract_const define i64 @extract_v2i64_undef_index(<2 x i64> %a, i32 %c) { -; CHECK-LABEL: extract_v2i64_undef_index: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov x0, d0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: extract_v2i64_undef_index: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov x0, d0 +; CHECK-SD-NEXT: 
ret +; +; CHECK-GI-LABEL: extract_v2i64_undef_index: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: str q0, [sp, #-16]! +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: ldr x0, [sp], #16 +; CHECK-GI-NEXT: ret entry: %d = extractelement <2 x i64> %a, i32 undef ret i64 %d diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-amdgpu-cvt-f32-ubyte.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-amdgpu-cvt-f32-ubyte.mir index 9b39afd32ac37..7893bfa1d38f0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-amdgpu-cvt-f32-ubyte.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-amdgpu-cvt-f32-ubyte.mir @@ -261,7 +261,8 @@ body: | ; CHECK-LABEL: name: cvt_f32_ubyte0_zext_lshr_16 ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: %zext:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: %shift:_(s16) = G_IMPLICIT_DEF + ; CHECK-NEXT: %zext:_(s32) = G_ZEXT %shift(s16) ; CHECK-NEXT: %result:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 %zext ; CHECK-NEXT: $vgpr0 = COPY %result(s32) %arg:_(s32) = COPY $vgpr0 @@ -283,7 +284,8 @@ body: | ; CHECK-LABEL: name: cvt_f32_ubyte0_zext_lshr_24 ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: %zext:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: %shift:_(s16) = G_IMPLICIT_DEF + ; CHECK-NEXT: %zext:_(s32) = G_ZEXT %shift(s16) ; CHECK-NEXT: %result:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 %zext ; CHECK-NEXT: $vgpr0 = COPY %result(s32) %arg:_(s32) = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll index c499d457558a8..b3c06756a8987 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -4074,12 +4074,14 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: 
v_add_u32_e32 v0, vcc, v0, v2 ; VI-GISEL-NEXT: v_not_b32_e32 v2, 31 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: s_and_b32 s0, 0xffff, s0 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-GISEL-NEXT: v_or_b32_e32 v2, s0, v2 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; @@ -4189,12 +4191,15 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] +; VI-GISEL-NEXT: s_and_b32 s2, 0xffff, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-GISEL-NEXT: s_lshl_b32 s0, s2, 16 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: v_add_u16_e32 v2, 0xffe0, v3 +; VI-GISEL-NEXT: v_or_b32_e32 v2, s0, v2 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ;