diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 76d51ab819f441..b6a1dc66affa72 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -388,9 +388,6 @@ class CombinerHelper { /// Transform anyext(trunc(x)) to x. bool matchCombineAnyExtTrunc(MachineInstr &MI, Register &Reg); - /// Transform zext(trunc(x)) to x. - bool matchCombineZextTrunc(MachineInstr &MI, Register &Reg); - /// Transform trunc (shl x, K) to shl (trunc x), K /// if K < VT.getScalarSizeInBits(). /// @@ -918,6 +915,13 @@ class CombinerHelper { bool matchCanonicalizeICmp(const MachineInstr &MI, BuildFnTy &MatchInfo); bool matchCanonicalizeFCmp(const MachineInstr &MI, BuildFnTy &MatchInfo); + /// Transform zext of truncate to x or and(x, mask). + bool matchCombineZextTrunc(const MachineInstr &ZextMI, + const MachineInstr &TruncMI, BuildFnTy &MatchInfo); + + /// Transform zext(trunc(x)) to x. + bool matchCombineZextTrunc(MachineInstr &MI, Register &Reg); + private: /// Checks for legality of an indexed variant of \p LdSt. bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const; diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 77cb4370b54664..66a5337de85b93 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -758,15 +758,6 @@ def anyext_trunc_fold: GICombineRule < (apply [{ Helper.replaceSingleDefInstWithReg(*${root}, ${matchinfo}); }]) >; -// Fold (zext (trunc x)) -> x if the source type is same as the destination type -// and truncated bits are known to be zero. -def zext_trunc_fold: GICombineRule < - (defs root:$root, register_matchinfo:$matchinfo), - (match (wip_match_opcode G_ZEXT):$root, - [{ return Helper.matchCombineZextTrunc(*${root}, ${matchinfo}); }]), - (apply [{ Helper.replaceSingleDefInstWithReg(*${root}, ${matchinfo}); }]) ->; - def not_cmp_fold_matchinfo : GIDefMatchData<"SmallVector<Register, 4>">; def not_cmp_fold : GICombineRule< (defs root:$d, not_cmp_fold_matchinfo:$info), @@ -1791,6 +1782,25 @@ class integer_of_opcode<Instruction castOpcode> : GICombineRule < def integer_of_truncate : integer_of_opcode<G_TRUNC>; +// Transform zext of truncate to x or and(x, mask). +def zext_of_truncate : GICombineRule < + (defs root:$root, build_fn_matchinfo:$matchinfo), + (match (G_TRUNC $trunc, $src):$TruncMI, + (G_ZEXT $root, $trunc):$ZextMI, + [{ return Helper.matchCombineZextTrunc(*${ZextMI}, *${TruncMI}, ${matchinfo}); }]), + (apply [{ Helper.applyBuildFn(*${ZextMI}, ${matchinfo}); }])>; + + +// Fold (zext (trunc x)) -> x if the source type is the same as the destination type +// and the truncated bits are known to be zero.
+def zext_trunc_fold: GICombineRule < + (defs root:$root, register_matchinfo:$matchinfo), + (match (G_TRUNC $trunc, $src):$TruncMI, + (G_ZEXT $root, $trunc):$ZextMI, + [{ return Helper.matchCombineZextTrunc(*${ZextMI}, ${matchinfo}); }]), + (apply [{ Helper.replaceSingleDefInstWithReg(*${ZextMI}, ${matchinfo}); }]) +>; + def cast_combines: GICombineGroup<[ truncate_of_zext, truncate_of_sext, @@ -1812,7 +1822,9 @@ def cast_combines: GICombineGroup<[ narrow_binop_and, narrow_binop_or, narrow_binop_xor, - integer_of_truncate + integer_of_truncate, + zext_of_truncate, + zext_trunc_fold ]>; def canonicalize_icmp : GICombineRule< @@ -1869,7 +1881,6 @@ def const_combines : GICombineGroup<[constant_fold_fp_ops, const_ptradd_to_i2p, def known_bits_simplifications : GICombineGroup<[ redundant_and, redundant_sext_inreg, redundant_or, urem_pow2_to_mask, - zext_trunc_fold, sext_inreg_to_zext_inreg]>; def width_reduction_combines : GICombineGroup<[reduce_shl_of_extend, diff --git a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp index 547529bbe699ab..5addf93599085a 100644 --- a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp @@ -333,8 +333,10 @@ MachineInstrBuilder CSEMIRBuilder::buildConstant(const DstOp &Res, // For vectors, CSE the element only for now. LLT Ty = Res.getLLTTy(*getMRI()); - if (Ty.isVector()) + if (Ty.isFixedVector()) return buildSplatBuildVector(Res, buildConstant(Ty.getElementType(), Val)); + if (Ty.isScalableVector()) + return buildSplatVector(Res, buildConstant(Ty.getElementType(), Val)); FoldingSetNodeID ID; GISelInstProfileBuilder ProfBuilder(ID, *getMRI()); diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 14e94d48bf8362..9d2c31760e969f 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -2526,20 +2526,6 @@ bool CombinerHelper::matchCombineAnyExtTrunc(MachineInstr &MI, Register &Reg) { m_GTrunc(m_all_of(m_Reg(Reg), m_SpecificType(DstTy)))); } -bool CombinerHelper::matchCombineZextTrunc(MachineInstr &MI, Register &Reg) { - assert(MI.getOpcode() == TargetOpcode::G_ZEXT && "Expected a G_ZEXT"); - Register DstReg = MI.getOperand(0).getReg(); - Register SrcReg = MI.getOperand(1).getReg(); - LLT DstTy = MRI.getType(DstReg); - if (mi_match(SrcReg, MRI, - m_GTrunc(m_all_of(m_Reg(Reg), m_SpecificType(DstTy))))) { - unsigned DstSize = DstTy.getScalarSizeInBits(); - unsigned SrcSize = MRI.getType(SrcReg).getScalarSizeInBits(); - return KB->getKnownBits(Reg).countMinLeadingZeros() >= DstSize - SrcSize; - } - return false; -} - static LLT getMidVTForTruncRightShiftCombine(LLT ShiftTy, LLT TruncTy) { const unsigned ShiftSize = ShiftTy.getScalarSizeInBits(); const unsigned TruncSize = TruncTy.getScalarSizeInBits(); diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp index 30557e6a2304e6..2428b1e3822cde 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp @@ -13,6 +13,7 @@ #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/LowLevelTypeUtils.h" @@ -24,6 +25,7 @@ 
#define DEBUG_TYPE "gi-combiner" using namespace llvm; +using namespace MIPatternMatch; bool CombinerHelper::matchSextOfTrunc(const MachineOperand &MO, BuildFnTy &MatchInfo) { @@ -359,3 +361,78 @@ bool CombinerHelper::matchCastOfInteger(const MachineInstr &CastMI, return false; } } + +bool CombinerHelper::matchCombineZextTrunc(const MachineInstr &ZextMI, + const MachineInstr &TruncMI, + BuildFnTy &MatchInfo) { + const GZext *Zext = cast<GZext>(&ZextMI); + const GTrunc *Trunc = cast<GTrunc>(&TruncMI); + + Register Dst = Zext->getReg(0); + Register Mid = Zext->getSrcReg(); + Register Src = Trunc->getSrcReg(); + + LLT DstTy = MRI.getType(Dst); + LLT SrcTy = MRI.getType(Src); + + if (!MRI.hasOneNonDBGUse(Mid)) + return false; + + unsigned DstSize = DstTy.getScalarSizeInBits(); + unsigned MidSize = MRI.getType(Mid).getScalarSizeInBits(); + unsigned SrcSize = SrcTy.getScalarSizeInBits(); + + // Are the truncated bits known to be zero? + if (DstTy == SrcTy && + (KB->getKnownBits(Src).countMinLeadingZeros() >= DstSize - MidSize)) { + MatchInfo = [=](MachineIRBuilder &B) { B.buildCopy(Dst, Src); }; + return true; + } + + // If the sizes are just right we can convert this into a logical + // 'and', which will be much cheaper than the pair of casts. + + // Since we're only actually extending zero bits, the result is: + // SrcSize < DstSize: zext(Src & mask) + // SrcSize == DstSize: Src & mask + // SrcSize > DstSize: trunc(Src) & mask + + if (DstSize == SrcSize) { + // Src & mask. + + if (!isLegalOrBeforeLegalizer({TargetOpcode::G_AND, {DstTy}}) || + !isConstantLegalOrBeforeLegalizer(DstTy)) + return false; + + // Build the mask. + APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize)); + + MatchInfo = [=](MachineIRBuilder &B) { + auto Mask = B.buildConstant(DstTy, AndValue); + B.buildAnd(Dst, Src, Mask); + }; + return true; + } + + return false; +} + +bool CombinerHelper::matchCombineZextTrunc(MachineInstr &MI, Register &Reg) { + GZext *Zext = cast<GZext>(&MI); + + Register DstReg = Zext->getReg(0); + Register SrcReg = Zext->getSrcReg(); + LLT DstTy = MRI.getType(DstReg); + + if (!MRI.hasOneNonDBGUse(SrcReg)) + return false; + + if (mi_match(SrcReg, MRI, + m_GTrunc(m_all_of(m_Reg(Reg), m_SpecificType(DstTy))))) { + unsigned DstSize = DstTy.getScalarSizeInBits(); + unsigned SrcSize = MRI.getType(SrcReg).getScalarSizeInBits(); + return KB->getKnownBits(Reg).countMinLeadingZeros() >= DstSize - SrcSize; + } + + return false; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index b2a3f9392157d1..ccc6bd9ce9219b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -168,6 +168,7 @@ def AMDGPUPostLegalizerCombiner: GICombiner< def AMDGPURegBankCombiner : GICombiner< "AMDGPURegBankCombinerImpl", [unmerge_merge, unmerge_cst, unmerge_undef, - zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain, + zext_trunc_fold, + int_minmax_to_med3, ptr_add_immed_chain, fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp]> { } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-extract-vec-elt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-extract-vec-elt.mir index 83b5c388520eb3..c5f41836c8e7a1 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-extract-vec-elt.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-extract-vec-elt.mir @@ -49,8 +49,8 @@ body: | ; CHECK: liveins: $x0, $x1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %arg1:_(s64) = COPY $x0 - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %arg1(s64) - ; CHECK-NEXT: %zext:_(s64) =
G_ZEXT [[TRUNC]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295 + ; CHECK-NEXT: %zext:_(s64) = G_AND %arg1, [[C]] ; CHECK-NEXT: $x0 = COPY %zext(s64) ; CHECK-NEXT: RET_ReallyLR implicit $x0 %arg1:_(s64) = COPY $x0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-with-flags.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-with-flags.mir index 8cb44605246ffa..e0ffbb8e51da38 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-with-flags.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-with-flags.mir @@ -10,7 +10,9 @@ body: | ; CHECK: liveins: $w0, $w1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 - ; CHECK-NEXT: $x1 = COPY [[COPY]](s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]] + ; CHECK-NEXT: $x1 = COPY [[AND]](s64) %0:_(s64) = COPY $x0 %2:_(s32) = nuw G_TRUNC %0 %3:_(s64) = G_ZEXT %2 @@ -25,9 +27,9 @@ body: | ; CHECK: liveins: $w0, $w1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = nsw G_TRUNC [[COPY]](s64) - ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s32) - ; CHECK-NEXT: $x1 = COPY [[ZEXT]](s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]] + ; CHECK-NEXT: $x1 = COPY [[AND]](s64) %0:_(s64) = COPY $x0 %2:_(s32) = nsw G_TRUNC %0 %3:_(s64) = G_ZEXT %2 @@ -42,9 +44,9 @@ body: | ; CHECK: liveins: $w0, $w1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) - ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s32) - ; CHECK-NEXT: $x1 = COPY [[ZEXT]](s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]] + ; CHECK-NEXT: $x1 = COPY [[AND]](s64) %0:_(s64) = COPY $x0 %2:_(s32) = G_TRUNC %0 %3:_(s64) = G_ZEXT %2 @@ -300,7 +302,10 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 ; CHECK-NEXT: %sv0:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR [[COPY]](s64) - ; CHECK-NEXT: $z0 = COPY %sv0(<vscale x 2 x s64>) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295 + ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR [[C]](s64) + ; CHECK-NEXT: %z:_(<vscale x 2 x s64>) = G_AND %sv0, [[SPLAT_VECTOR]] + ; CHECK-NEXT: $z0 = COPY %z(<vscale x 2 x s64>) %0:_(s64) = COPY $x0 %1:_(s64) = COPY $x1 %sv0:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR %0:_(s64) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-extending-loads.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-extending-loads.mir index 47c85f76785935..48cc1660fe0030 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-extending-loads.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-extending-loads.mir @@ -165,9 +165,9 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 ; CHECK-NEXT: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[COPY]](p0) :: (load (s8)) - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[SEXTLOAD]](s32) - ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s8) - ; CHECK-NEXT: $w0 = COPY [[ZEXT]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SEXTLOAD]], [[C]] + ; CHECK-NEXT: $w0 = COPY [[AND]](s32) ; CHECK-NEXT: $w1 = COPY [[SEXTLOAD]](s32) %0:_(p0) = COPY $x0 %1:_(s8) = G_LOAD %0 :: (load (s8)) diff --git a/llvm/test/CodeGen/AArch64/addsub_ext.ll
b/llvm/test/CodeGen/AArch64/addsub_ext.ll index 04a98bd5088803..81eae261f26477 100644 --- a/llvm/test/CodeGen/AArch64/addsub_ext.ll +++ b/llvm/test/CodeGen/AArch64/addsub_ext.ll @@ -24,22 +24,12 @@ define i32 @add_z_shli8i32(i8 %v, i32 %lhs) minsize { } define i64 @add_z_i8i64(i8 %v, i64 %lhs) minsize { -; CHECK-LABEL: add_z_i8i64: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: add x0, x1, w0, uxtb -; CHECK-NEXT: ret %vz = zext i8 %v to i64 %r = add i64 %lhs, %vz ret i64 %r } define i64 @add_z_shli8i64(i8 %v, i64 %lhs) minsize { -; CHECK-LABEL: add_z_shli8i64: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: add x0, x1, w0, uxtb #3 -; CHECK-NEXT: ret %vz = zext i8 %v to i64 %s = shl i64 %vz, 3 %r = add i64 %lhs, %s @@ -112,22 +102,12 @@ define i32 @add_z_shli16i32(i16 %v, i32 %lhs) minsize { } define i64 @add_z_i16i64(i16 %v, i64 %lhs) minsize { -; CHECK-LABEL: add_z_i16i64: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: add x0, x1, w0, uxth -; CHECK-NEXT: ret %vz = zext i16 %v to i64 %r = add i64 %lhs, %vz ret i64 %r } define i64 @add_z_shli16i64(i16 %v, i64 %lhs) minsize { -; CHECK-LABEL: add_z_shli16i64: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: add x0, x1, w0, uxth #3 -; CHECK-NEXT: ret %vz = zext i16 %v to i64 %s = shl i64 %vz, 3 %r = add i64 %lhs, %s @@ -242,22 +222,12 @@ define i32 @sub_z_shli8i32(i8 %v, i32 %lhs) minsize { } define i64 @sub_z_i8i64(i8 %v, i64 %lhs) minsize { -; CHECK-LABEL: sub_z_i8i64: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: sub x0, x1, w0, uxtb -; CHECK-NEXT: ret %vz = zext i8 %v to i64 %r = sub i64 %lhs, %vz ret i64 %r } define i64 @sub_z_shli8i64(i8 %v, i64 %lhs) minsize { -; CHECK-LABEL: sub_z_shli8i64: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: sub x0, x1, w0, uxtb #3 -; CHECK-NEXT: ret %vz = zext i8 %v to i64 %s = shl i64 %vz, 3 %r = sub i64 %lhs, %s @@ -330,22 +300,12 @@ define i32 @sub_z_shli16i32(i16 %v, i32 %lhs) minsize { } define i64 @sub_z_i16i64(i16 %v, i64 %lhs) minsize { -; CHECK-LABEL: sub_z_i16i64: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: sub x0, x1, w0, uxth -; CHECK-NEXT: ret %vz = zext i16 %v to i64 %r = sub i64 %lhs, %vz ret i64 %r } define i64 @sub_z_shli16i64(i16 %v, i64 %lhs) minsize { -; CHECK-LABEL: sub_z_shli16i64: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: sub x0, x1, w0, uxth #3 -; CHECK-NEXT: ret %vz = zext i16 %v to i64 %s = shl i64 %vz, 3 %r = sub i64 %lhs, %s @@ -444,7 +404,7 @@ define i32 @cmp_s_i8i32(i8 %v, i32 %lhs) minsize { ; CHECK-NEXT: cmp w1, w0, uxtb ; CHECK-NEXT: b.ge .LBB40_2 ; CHECK-NEXT: // %bb.1: // %then -; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB40_2: // %end ; CHECK-NEXT: mov w0, w1 @@ -465,7 +425,7 @@ define i64 @cmp_s_i8i64(i8 %v, i64 %lhs) minsize { ; CHECK-NEXT: cmp x1, w0, sxtb ; CHECK-NEXT: b.ge .LBB41_2 ; CHECK-NEXT: // %bb.1: // %then -; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB41_2: // %end ; CHECK-NEXT: mov x0, x1 @@ -485,7 +445,7 @@ define i32 @cmp_s_i16i32(i16 %v, i32 %lhs) minsize { ; CHECK-NEXT: cmp w1, w0, uxth ; CHECK-NEXT: b.ge .LBB42_2 ; CHECK-NEXT: // %bb.1: // %then -; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret ; CHECK-NEXT: 
.LBB42_2: // %end ; CHECK-NEXT: mov w0, w1 @@ -506,7 +466,7 @@ define i64 @cmp_s_i16i64(i16 %v, i64 %lhs) minsize { ; CHECK-NEXT: cmp x1, w0, sxth ; CHECK-NEXT: b.ge .LBB43_2 ; CHECK-NEXT: // %bb.1: // %then -; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB43_2: // %end ; CHECK-NEXT: mov x0, x1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll index 63f5464371cc62..26a21f9246937c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -1847,11 +1847,12 @@ define amdgpu_ps i65 @s_ashr_i65(i65 inreg %value, i65 inreg %amount) { ; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] ; GCN-NEXT: s_ashr_i32 s7, s5, 31 ; GCN-NEXT: s_ashr_i64 s[4:5], s[4:5], s10 -; GCN-NEXT: s_cmp_lg_u32 s11, 0 +; GCN-NEXT: s_and_b32 s8, s11, 1 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] ; GCN-NEXT: s_cmp_lg_u32 s12, 0 ; GCN-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] -; GCN-NEXT: s_cmp_lg_u32 s11, 0 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-NEXT: s_cselect_b32 s2, s6, s7 ; GCN-NEXT: ; return to shader part epilog ; @@ -1868,13 +1869,14 @@ define amdgpu_ps i65 @s_ashr_i65(i65 inreg %value, i65 inreg %amount) { ; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[4:5], s2 ; GFX10PLUS-NEXT: s_ashr_i64 s[2:3], s[4:5], s3 ; GFX10PLUS-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX10PLUS-NEXT: s_and_b32 s8, s11, 1 ; GFX10PLUS-NEXT: s_ashr_i32 s3, s5, 31 ; GFX10PLUS-NEXT: s_ashr_i64 s[4:5], s[4:5], s10 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10PLUS-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] ; GFX10PLUS-NEXT: s_cmp_lg_u32 s12, 0 ; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10PLUS-NEXT: s_cselect_b32 s2, s2, s3 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr i65 %value, %amount diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll index 132dc876b3b054..f985760627942d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll @@ -453,8 +453,8 @@ define amdgpu_ps i32 @s_bswap_v2i16(<2 x i16> inreg %src) { ; GFX7-NEXT: s_lshl_b32 s2, s1, 8 ; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80008 ; GFX7-NEXT: s_or_b32 s1, s1, s2 -; GFX7-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX7-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX7-NEXT: s_and_b32 s1, s1, 0xffff +; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir index 6a291510fe66c1..3041d79295a657 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir @@ -413,9 +413,9 @@ body: | ; GFX6: liveins: $vgpr0 ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: %argument:_(s32) = COPY $vgpr0 - ; GFX6-NEXT: %truncate:_(s16) = G_TRUNC %argument(s32) ; GFX6-NEXT: %shiftamt:_(s16) = G_CONSTANT i16 16 - ; GFX6-NEXT: %extend:_(s32) = G_ZEXT %truncate(s16) + ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX6-NEXT: %extend:_(s32) = G_AND %argument, [[C]] ; GFX6-NEXT: %shl:_(s32) = G_SHL %extend, %shiftamt(s16) ; GFX6-NEXT: $vgpr0 = 
COPY %shl(s32) ; @@ -423,9 +423,9 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: %argument:_(s32) = COPY $vgpr0 - ; GFX9-NEXT: %truncate:_(s16) = G_TRUNC %argument(s32) ; GFX9-NEXT: %shiftamt:_(s16) = G_CONSTANT i16 16 - ; GFX9-NEXT: %extend:_(s32) = G_ZEXT %truncate(s16) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9-NEXT: %extend:_(s32) = G_AND %argument, [[C]] ; GFX9-NEXT: %shl:_(s32) = G_SHL %extend, %shiftamt(s16) ; GFX9-NEXT: $vgpr0 = COPY %shl(s32) %argument:_(s32) = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir index 6ceb41199af6da..4256bb849664a3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir @@ -285,9 +285,9 @@ body: | ; GFX6: liveins: $vgpr0 ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: %argument:_(s32) = COPY $vgpr0 - ; GFX6-NEXT: %truncate:_(s16) = G_TRUNC %argument(s32) ; GFX6-NEXT: %shiftamt:_(s16) = G_CONSTANT i16 16 - ; GFX6-NEXT: %extend:_(s32) = G_ZEXT %truncate(s16) + ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX6-NEXT: %extend:_(s32) = G_AND %argument, [[C]] ; GFX6-NEXT: %shl:_(s32) = G_SHL %extend, %shiftamt(s16) ; GFX6-NEXT: $vgpr0 = COPY %shl(s32) ; @@ -295,9 +295,9 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: %argument:_(s32) = COPY $vgpr0 - ; GFX9-NEXT: %truncate:_(s16) = G_TRUNC %argument(s32) ; GFX9-NEXT: %shiftamt:_(s16) = G_CONSTANT i16 16 - ; GFX9-NEXT: %extend:_(s32) = G_ZEXT %truncate(s16) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9-NEXT: %extend:_(s32) = G_AND %argument, [[C]] ; GFX9-NEXT: %shl:_(s32) = G_SHL %extend, %shiftamt(s16) ; GFX9-NEXT: $vgpr0 = COPY %shl(s32) %argument:_(s32) = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-zext-trunc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-zext-trunc.mir index 3423af64162e52..cb41cb5aa1ff2d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-zext-trunc.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-zext-trunc.mir @@ -36,8 +36,8 @@ body: | ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: %var:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC %var(s32) - ; GCN-NEXT: %zext:_(s32) = G_ZEXT [[TRUNC]](s16) + ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GCN-NEXT: %zext:_(s32) = G_AND %var, [[C]] ; GCN-NEXT: $vgpr0 = COPY %zext(s32) %var:_(s32) = COPY $vgpr0 %cFFFFF:_(s32) = G_CONSTANT i32 1048575 @@ -136,8 +136,9 @@ body: | ; GCN-NEXT: %c7FFF:_(s32) = G_CONSTANT i32 32767 ; GCN-NEXT: %c:_(<2 x s32>) = G_BUILD_VECTOR %cFFFFF(s32), %c7FFF(s32) ; GCN-NEXT: %low_bits:_(<2 x s32>) = G_AND %var, %c - ; GCN-NEXT: %trunc:_(<2 x s16>) = G_TRUNC %low_bits(<2 x s32>) - ; GCN-NEXT: %zext:_(<2 x s32>) = G_ZEXT %trunc(<2 x s16>) + ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32) + ; GCN-NEXT: %zext:_(<2 x s32>) = G_AND %low_bits, [[BUILD_VECTOR]] ; GCN-NEXT: $vgpr0_vgpr1 = COPY %zext(<2 x s32>) %var:_(<2 x s32>) = COPY $vgpr0_vgpr1 %cFFFFF:_(s32) = G_CONSTANT i32 1048575 @@ -176,7 +177,6 @@ body: | %zext:_(<2 x s32>) = G_ZEXT %trunc(<2 x s16>) $vgpr0_vgpr1 = COPY %zext(<2 x s32>) ... 
- --- name: zext_trunc_v2s32_v2s16_v2s64 tracksRegLiveness: true @@ -204,3 +204,85 @@ body: | %zext:_(<2 x s64>) = G_ZEXT %trunc(<2 x s16>) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %zext(<2 x s64>) ... +--- +name: zext_trunc_s32_s16_s64_2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: zext_trunc_s32_s16_s64_2 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: %var:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %trunc:_(s16) = G_TRUNC %var(s32) + ; GCN-NEXT: %zext:_(s64) = G_ZEXT %trunc(s16) + ; GCN-NEXT: $vgpr0_vgpr1 = COPY %zext(s64) + %var:_(s32) = COPY $vgpr0 + %trunc:_(s16) = G_TRUNC %var(s32) + %zext:_(s64) = G_ZEXT %trunc(s16) + $vgpr0_vgpr1 = COPY %zext(s64) +... +--- +name: zext_trunc_s64_s16_s64_2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: zext_trunc_s64_s16_s64_2 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: %var:_(s64) = COPY $vgpr0_vgpr1 + ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 65535 + ; GCN-NEXT: %zext:_(s64) = G_AND %var, [[C]] + ; GCN-NEXT: $vgpr0_vgpr1 = COPY %zext(s64) + %var:_(s64) = COPY $vgpr0_vgpr1 + %trunc:_(s16) = G_TRUNC %var(s64) + %zext:_(s64) = G_ZEXT %trunc(s16) + $vgpr0_vgpr1 = COPY %zext(s64) +... +--- +name: zext_trunc_s64_s16_s32_2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: zext_trunc_s64_s16_s32_2 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: %var:_(s64) = COPY $vgpr0_vgpr1 + ; GCN-NEXT: %trunc:_(s16) = G_TRUNC %var(s64) + ; GCN-NEXT: %zext:_(s32) = G_ZEXT %trunc(s16) + ; GCN-NEXT: $vgpr0 = COPY %zext(s32) + %var:_(s64) = COPY $vgpr0_vgpr1 + %trunc:_(s16) = G_TRUNC %var(s64) + %zext:_(s32) = G_ZEXT %trunc(s16) + $vgpr0 = COPY %zext(s32) +... +--- +name: zext_trunc_s64_s16_s32_2_multi_use +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: zext_trunc_s64_s16_s32_2_multi_use + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: %var:_(s64) = COPY $vgpr0_vgpr1 + ; GCN-NEXT: %trunc:_(s16) = G_TRUNC %var(s64) + ; GCN-NEXT: %zext:_(s32) = G_ZEXT %trunc(s16) + ; GCN-NEXT: %zext2:_(s32) = G_ZEXT %trunc(s16) + ; GCN-NEXT: $vgpr0 = COPY %zext(s32) + ; GCN-NEXT: $vgpr0 = COPY %zext2(s32) + %var:_(s64) = COPY $vgpr0_vgpr1 + %trunc:_(s16) = G_TRUNC %var(s64) + %zext:_(s32) = G_ZEXT %trunc(s16) + %zext2:_(s32) = G_ZEXT %trunc(s16) + $vgpr0 = COPY %zext(s32) + $vgpr0 = COPY %zext2(s32) +... 
+ + diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll index 3bd3486ec261d4..694abfeac32e9e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -3853,8 +3853,8 @@ define amdgpu_ps i32 @s_fshl_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, < ; GFX6-NEXT: s_and_b32 s3, 0xffff, s4 ; GFX6-NEXT: s_lshr_b32 s2, s2, s3 ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -4451,12 +4451,12 @@ define amdgpu_ps i48 @s_fshl_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, < ; GFX6-NEXT: s_bfe_u32 s3, s5, 0xf0001 ; GFX6-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX6-NEXT: s_lshr_b32 s3, s3, s4 -; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, 0xffff, s2 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshl_v3i16: @@ -4805,13 +4805,13 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg % ; GFX6-NEXT: s_bfe_u32 s4, s7, 0xf0001 ; GFX6-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX6-NEXT: s_lshr_b32 s4, s4, s5 -; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_or_b32 s3, s3, s4 -; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, 0xffff, s2 -; GFX6-NEXT: s_and_b32 s2, 0xffff, s3 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: ; return to shader part epilog @@ -5831,7 +5831,8 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg ; GFX6-NEXT: s_lshl_b64 s[12:13], s[0:1], s8 ; GFX6-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s11 -; GFX6-NEXT: s_cmp_lg_u32 s18, 0 +; GFX6-NEXT: s_and_b32 s11, s18, 1 +; GFX6-NEXT: s_cmp_lg_u32 s11, 0 ; GFX6-NEXT: s_cselect_b64 s[12:13], s[12:13], 0 ; GFX6-NEXT: s_cselect_b64 s[0:1], s[14:15], s[0:1] ; GFX6-NEXT: s_cmp_lg_u32 s9, 0 @@ -5854,11 +5855,12 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg ; GFX6-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 ; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] ; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s14 -; GFX6-NEXT: s_cmp_lg_u32 s15, 0 +; GFX6-NEXT: s_and_b32 s10, s15, 1 +; GFX6-NEXT: s_cmp_lg_u32 s10, 0 ; GFX6-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX6-NEXT: s_cmp_lg_u32 s16, 0 ; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX6-NEXT: s_cmp_lg_u32 s15, 0 +; GFX6-NEXT: s_cmp_lg_u32 s10, 0 ; GFX6-NEXT: s_cselect_b64 s[4:5], s[6:7], 0 ; GFX6-NEXT: s_or_b64 s[0:1], s[12:13], s[0:1] ; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] @@ -5878,7 +5880,8 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg ; GFX8-NEXT: s_lshl_b64 s[12:13], s[0:1], s8 ; GFX8-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s11 -; GFX8-NEXT: s_cmp_lg_u32 s18, 0 +; GFX8-NEXT: 
s_and_b32 s11, s18, 1 +; GFX8-NEXT: s_cmp_lg_u32 s11, 0 ; GFX8-NEXT: s_cselect_b64 s[12:13], s[12:13], 0 ; GFX8-NEXT: s_cselect_b64 s[0:1], s[14:15], s[0:1] ; GFX8-NEXT: s_cmp_lg_u32 s9, 0 @@ -5901,11 +5904,12 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg ; GFX8-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 ; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] ; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s14 -; GFX8-NEXT: s_cmp_lg_u32 s15, 0 +; GFX8-NEXT: s_and_b32 s10, s15, 1 +; GFX8-NEXT: s_cmp_lg_u32 s10, 0 ; GFX8-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX8-NEXT: s_cmp_lg_u32 s16, 0 ; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX8-NEXT: s_cmp_lg_u32 s15, 0 +; GFX8-NEXT: s_cmp_lg_u32 s10, 0 ; GFX8-NEXT: s_cselect_b64 s[4:5], s[6:7], 0 ; GFX8-NEXT: s_or_b64 s[0:1], s[12:13], s[0:1] ; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] @@ -5925,7 +5929,8 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg ; GFX9-NEXT: s_lshl_b64 s[12:13], s[0:1], s8 ; GFX9-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s11 -; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_and_b32 s11, s18, 1 +; GFX9-NEXT: s_cmp_lg_u32 s11, 0 ; GFX9-NEXT: s_cselect_b64 s[12:13], s[12:13], 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], s[14:15], s[0:1] ; GFX9-NEXT: s_cmp_lg_u32 s9, 0 @@ -5948,11 +5953,12 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg ; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 ; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] ; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s14 -; GFX9-NEXT: s_cmp_lg_u32 s15, 0 +; GFX9-NEXT: s_and_b32 s10, s15, 1 +; GFX9-NEXT: s_cmp_lg_u32 s10, 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX9-NEXT: s_cmp_lg_u32 s16, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX9-NEXT: s_cmp_lg_u32 s15, 0 +; GFX9-NEXT: s_cmp_lg_u32 s10, 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], s[6:7], 0 ; GFX9-NEXT: s_or_b64 s[0:1], s[12:13], s[0:1] ; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] @@ -5970,6 +5976,7 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg ; GFX10-NEXT: s_cselect_b32 s9, 1, 0 ; GFX10-NEXT: s_lshr_b64 s[12:13], s[0:1], s12 ; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], s8 +; GFX10-NEXT: s_and_b32 s18, s18, 1 ; GFX10-NEXT: s_lshl_b64 s[16:17], s[0:1], s8 ; GFX10-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s11 @@ -5994,12 +6001,13 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg ; GFX10-NEXT: s_lshl_b64 s[8:9], s[4:5], s8 ; GFX10-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 ; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX10-NEXT: s_and_b32 s8, s13, 1 ; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 -; GFX10-NEXT: s_cmp_lg_u32 s13, 0 +; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] ; GFX10-NEXT: s_cmp_lg_u32 s16, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX10-NEXT: s_cmp_lg_u32 s13, 0 +; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: s_cselect_b64 s[4:5], s[10:11], 0 ; GFX10-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1] ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] @@ -6017,6 +6025,7 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg ; GFX11-NEXT: s_cselect_b32 s9, 1, 0 ; GFX11-NEXT: s_lshr_b64 s[12:13], s[0:1], s12 ; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], s8 +; GFX11-NEXT: s_and_b32 s18, s18, 1 ; GFX11-NEXT: s_lshl_b64 s[16:17], s[0:1], s8 ; GFX11-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] ; 
GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s11 @@ -6041,12 +6050,13 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg ; GFX11-NEXT: s_lshl_b64 s[8:9], s[4:5], s8 ; GFX11-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 ; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX11-NEXT: s_and_b32 s8, s13, 1 ; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 -; GFX11-NEXT: s_cmp_lg_u32 s13, 0 +; GFX11-NEXT: s_cmp_lg_u32 s8, 0 ; GFX11-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] ; GFX11-NEXT: s_cmp_lg_u32 s16, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX11-NEXT: s_cmp_lg_u32 s13, 0 +; GFX11-NEXT: s_cmp_lg_u32 s8, 0 ; GFX11-NEXT: s_cselect_b64 s[4:5], s[10:11], 0 ; GFX11-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1] ; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] @@ -6575,7 +6585,8 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX6-NEXT: s_lshl_b64 s[6:7], s[0:1], s4 ; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 -; GFX6-NEXT: s_cmp_lg_u32 s13, 0 +; GFX6-NEXT: s_and_b32 s10, s13, 1 +; GFX6-NEXT: s_cmp_lg_u32 s10, 0 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 ; GFX6-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] ; GFX6-NEXT: s_cmp_lg_u32 s5, 0 @@ -6627,7 +6638,8 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX8-NEXT: s_lshl_b64 s[6:7], s[0:1], s4 ; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 -; GFX8-NEXT: s_cmp_lg_u32 s13, 0 +; GFX8-NEXT: s_and_b32 s10, s13, 1 +; GFX8-NEXT: s_cmp_lg_u32 s10, 0 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 ; GFX8-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] ; GFX8-NEXT: s_cmp_lg_u32 s5, 0 @@ -6679,7 +6691,8 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX9-NEXT: s_lshl_b64 s[6:7], s[0:1], s4 ; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 -; GFX9-NEXT: s_cmp_lg_u32 s13, 0 +; GFX9-NEXT: s_and_b32 s10, s13, 1 +; GFX9-NEXT: s_cmp_lg_u32 s10, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] @@ -6730,9 +6743,10 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 ; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], s4 ; GFX10-NEXT: s_lshl_b64 s[10:11], s[0:1], s4 -; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 -; GFX10-NEXT: s_cmp_lg_u32 s13, 0 +; GFX10-NEXT: s_and_b32 s12, s13, 1 +; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX10-NEXT: s_cmp_lg_u32 s12, 0 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] ; GFX10-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] @@ -6781,9 +6795,10 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 ; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], s4 ; GFX11-NEXT: s_lshl_b64 s[10:11], s[0:1], s4 -; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 -; GFX11-NEXT: s_cmp_lg_u32 s13, 0 +; GFX11-NEXT: s_and_b32 s12, s13, 1 +; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX11-NEXT: s_cmp_lg_u32 s12, 0 ; GFX11-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] ; GFX11-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] @@ -6867,11 +6882,12 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, 
i128 i ; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 ; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 -; GFX6-NEXT: s_cmp_lg_u32 s11, 0 +; GFX6-NEXT: s_and_b32 s8, s11, 1 +; GFX6-NEXT: s_cmp_lg_u32 s8, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] ; GFX6-NEXT: s_cmp_lg_u32 s12, 0 ; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] -; GFX6-NEXT: s_cmp_lg_u32 s11, 0 +; GFX6-NEXT: s_cmp_lg_u32 s8, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc ; GFX6-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 @@ -6922,11 +6938,12 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX8-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 ; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 -; GFX8-NEXT: s_cmp_lg_u32 s11, 0 +; GFX8-NEXT: s_and_b32 s8, s11, 1 +; GFX8-NEXT: s_cmp_lg_u32 s8, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] ; GFX8-NEXT: s_cmp_lg_u32 s12, 0 ; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] -; GFX8-NEXT: s_cmp_lg_u32 s11, 0 +; GFX8-NEXT: s_cmp_lg_u32 s8, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc ; GFX8-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 @@ -6977,11 +6994,12 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 ; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 -; GFX9-NEXT: s_cmp_lg_u32 s11, 0 +; GFX9-NEXT: s_and_b32 s8, s11, 1 +; GFX9-NEXT: s_cmp_lg_u32 s8, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] ; GFX9-NEXT: s_cmp_lg_u32 s12, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u32 s11, 0 +; GFX9-NEXT: s_cmp_lg_u32 s8, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc ; GFX9-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 @@ -7031,14 +7049,15 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX10-NEXT: s_lshl_b64 s[6:7], s[2:3], s7 ; GFX10-NEXT: s_lshr_b64 s[8:9], s[2:3], s8 ; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX10-NEXT: s_and_b32 s6, s11, 1 ; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 -; GFX10-NEXT: s_cmp_lg_u32 s11, 0 +; GFX10-NEXT: s_cmp_lg_u32 s6, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo ; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GFX10-NEXT: s_cmp_lg_u32 s12, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] -; GFX10-NEXT: s_cmp_lg_u32 s11, 0 +; GFX10-NEXT: s_cmp_lg_u32 s6, 0 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v6 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[8:9], 0 ; GFX10-NEXT: v_or_b32_e32 v1, s1, v7 @@ -7085,13 +7104,14 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX11-NEXT: s_lshl_b64 s[6:7], s[2:3], s7 ; GFX11-NEXT: s_lshr_b64 s[8:9], s[2:3], s8 ; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX11-NEXT: s_and_b32 s6, s11, 1 ; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 -; GFX11-NEXT: s_cmp_lg_u32 s11, 0 +; GFX11-NEXT: s_cmp_lg_u32 s6, 0 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v0, v2 :: v_dual_cndmask_b32 v3, v1, v3 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GFX11-NEXT: s_cmp_lg_u32 s12, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] -; GFX11-NEXT: s_cmp_lg_u32 s11, 0 +; GFX11-NEXT: s_cmp_lg_u32 s6, 0 ; GFX11-NEXT: v_or_b32_e32 v0, s0, v6 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[8:9], 0 ; GFX11-NEXT: v_or_b32_e32 v1, s1, v7 @@ 
-7238,7 +7258,8 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX6-NEXT: s_lshl_b64 s[22:23], s[0:1], s16 ; GFX6-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s19 -; GFX6-NEXT: s_cmp_lg_u32 s28, 0 +; GFX6-NEXT: s_and_b32 s19, s28, 1 +; GFX6-NEXT: s_cmp_lg_u32 s19, 0 ; GFX6-NEXT: s_cselect_b64 s[22:23], s[22:23], 0 ; GFX6-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] ; GFX6-NEXT: s_cmp_lg_u32 s17, 0 @@ -7261,11 +7282,12 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX6-NEXT: s_lshl_b64 s[24:25], s[8:9], s21 ; GFX6-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25] ; GFX6-NEXT: s_lshr_b64 s[8:9], s[8:9], s19 -; GFX6-NEXT: s_cmp_lg_u32 s26, 0 +; GFX6-NEXT: s_and_b32 s19, s26, 1 +; GFX6-NEXT: s_cmp_lg_u32 s19, 0 ; GFX6-NEXT: s_cselect_b64 s[8:9], s[16:17], s[8:9] ; GFX6-NEXT: s_cmp_lg_u32 s27, 0 ; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] -; GFX6-NEXT: s_cmp_lg_u32 s26, 0 +; GFX6-NEXT: s_cmp_lg_u32 s19, 0 ; GFX6-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 ; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] ; GFX6-NEXT: s_and_b32 s8, s20, 0x7f @@ -7281,7 +7303,8 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX6-NEXT: s_lshl_b64 s[8:9], s[4:5], s20 ; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] ; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s19 -; GFX6-NEXT: s_cmp_lg_u32 s21, 0 +; GFX6-NEXT: s_and_b32 s16, s21, 1 +; GFX6-NEXT: s_cmp_lg_u32 s16, 0 ; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 ; GFX6-NEXT: s_cselect_b64 s[4:5], s[10:11], s[4:5] ; GFX6-NEXT: s_cmp_lg_u32 s22, 0 @@ -7303,11 +7326,12 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX6-NEXT: s_lshl_b64 s[16:17], s[10:11], s16 ; GFX6-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] ; GFX6-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 -; GFX6-NEXT: s_cmp_lg_u32 s19, 0 +; GFX6-NEXT: s_and_b32 s16, s19, 1 +; GFX6-NEXT: s_cmp_lg_u32 s16, 0 ; GFX6-NEXT: s_cselect_b64 s[10:11], s[14:15], s[10:11] ; GFX6-NEXT: s_cmp_lg_u32 s20, 0 ; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[10:11] -; GFX6-NEXT: s_cmp_lg_u32 s19, 0 +; GFX6-NEXT: s_cmp_lg_u32 s16, 0 ; GFX6-NEXT: s_cselect_b64 s[10:11], s[12:13], 0 ; GFX6-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] ; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] @@ -7327,7 +7351,8 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX8-NEXT: s_lshl_b64 s[22:23], s[0:1], s16 ; GFX8-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s19 -; GFX8-NEXT: s_cmp_lg_u32 s28, 0 +; GFX8-NEXT: s_and_b32 s19, s28, 1 +; GFX8-NEXT: s_cmp_lg_u32 s19, 0 ; GFX8-NEXT: s_cselect_b64 s[22:23], s[22:23], 0 ; GFX8-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] ; GFX8-NEXT: s_cmp_lg_u32 s17, 0 @@ -7350,11 +7375,12 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX8-NEXT: s_lshl_b64 s[24:25], s[8:9], s21 ; GFX8-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25] ; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s19 -; GFX8-NEXT: s_cmp_lg_u32 s26, 0 +; GFX8-NEXT: s_and_b32 s19, s26, 1 +; GFX8-NEXT: s_cmp_lg_u32 s19, 0 ; GFX8-NEXT: s_cselect_b64 s[8:9], s[16:17], s[8:9] ; GFX8-NEXT: s_cmp_lg_u32 s27, 0 ; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] -; GFX8-NEXT: s_cmp_lg_u32 s26, 0 +; GFX8-NEXT: s_cmp_lg_u32 s19, 0 ; GFX8-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 ; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] ; GFX8-NEXT: s_and_b32 s8, s20, 0x7f @@ -7370,7 +7396,8 @@ define amdgpu_ps <2 x 
i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX8-NEXT: s_lshl_b64 s[8:9], s[4:5], s20 ; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] ; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], s19 -; GFX8-NEXT: s_cmp_lg_u32 s21, 0 +; GFX8-NEXT: s_and_b32 s16, s21, 1 +; GFX8-NEXT: s_cmp_lg_u32 s16, 0 ; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 ; GFX8-NEXT: s_cselect_b64 s[4:5], s[10:11], s[4:5] ; GFX8-NEXT: s_cmp_lg_u32 s22, 0 @@ -7392,11 +7419,12 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX8-NEXT: s_lshl_b64 s[16:17], s[10:11], s16 ; GFX8-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] ; GFX8-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 -; GFX8-NEXT: s_cmp_lg_u32 s19, 0 +; GFX8-NEXT: s_and_b32 s16, s19, 1 +; GFX8-NEXT: s_cmp_lg_u32 s16, 0 ; GFX8-NEXT: s_cselect_b64 s[10:11], s[14:15], s[10:11] ; GFX8-NEXT: s_cmp_lg_u32 s20, 0 ; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[10:11] -; GFX8-NEXT: s_cmp_lg_u32 s19, 0 +; GFX8-NEXT: s_cmp_lg_u32 s16, 0 ; GFX8-NEXT: s_cselect_b64 s[10:11], s[12:13], 0 ; GFX8-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] ; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] @@ -7416,7 +7444,8 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX9-NEXT: s_lshl_b64 s[22:23], s[0:1], s16 ; GFX9-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s19 -; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_and_b32 s19, s28, 1 +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 ; GFX9-NEXT: s_cselect_b64 s[22:23], s[22:23], 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] ; GFX9-NEXT: s_cmp_lg_u32 s17, 0 @@ -7439,11 +7468,12 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX9-NEXT: s_lshl_b64 s[24:25], s[8:9], s21 ; GFX9-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25] ; GFX9-NEXT: s_lshr_b64 s[8:9], s[8:9], s19 -; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_and_b32 s19, s26, 1 +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 ; GFX9-NEXT: s_cselect_b64 s[8:9], s[16:17], s[8:9] ; GFX9-NEXT: s_cmp_lg_u32 s27, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] -; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 ; GFX9-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 ; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] ; GFX9-NEXT: s_and_b32 s8, s20, 0x7f @@ -7459,7 +7489,8 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX9-NEXT: s_lshl_b64 s[8:9], s[4:5], s20 ; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] ; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], s19 -; GFX9-NEXT: s_cmp_lg_u32 s21, 0 +; GFX9-NEXT: s_and_b32 s16, s21, 1 +; GFX9-NEXT: s_cmp_lg_u32 s16, 0 ; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], s[10:11], s[4:5] ; GFX9-NEXT: s_cmp_lg_u32 s22, 0 @@ -7481,11 +7512,12 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX9-NEXT: s_lshl_b64 s[16:17], s[10:11], s16 ; GFX9-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] ; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 -; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_and_b32 s16, s19, 1 +; GFX9-NEXT: s_cmp_lg_u32 s16, 0 ; GFX9-NEXT: s_cselect_b64 s[10:11], s[14:15], s[10:11] ; GFX9-NEXT: s_cmp_lg_u32 s20, 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[10:11] -; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cmp_lg_u32 s16, 0 ; GFX9-NEXT: s_cselect_b64 s[10:11], s[12:13], 0 ; GFX9-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] ; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] @@ -7503,10 +7535,11 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 
x i128> inreg %lhs, <2 x i128> inr ; GFX10-NEXT: s_cselect_b32 s17, 1, 0 ; GFX10-NEXT: s_lshr_b64 s[22:23], s[0:1], s21 ; GFX10-NEXT: s_lshl_b64 s[24:25], s[2:3], s16 +; GFX10-NEXT: s_and_b32 s21, s28, 1 ; GFX10-NEXT: s_lshl_b64 s[26:27], s[0:1], s16 ; GFX10-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25] ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s19 -; GFX10-NEXT: s_cmp_lg_u32 s28, 0 +; GFX10-NEXT: s_cmp_lg_u32 s21, 0 ; GFX10-NEXT: s_cselect_b64 s[24:25], s[26:27], 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1] ; GFX10-NEXT: s_cmp_lg_u32 s17, 0 @@ -7527,12 +7560,13 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX10-NEXT: s_lshl_b64 s[16:17], s[8:9], s16 ; GFX10-NEXT: s_lshr_b64 s[22:23], s[8:9], s19 ; GFX10-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX10-NEXT: s_and_b32 s16, s26, 1 ; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s21 -; GFX10-NEXT: s_cmp_lg_u32 s26, 0 +; GFX10-NEXT: s_cmp_lg_u32 s16, 0 ; GFX10-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] ; GFX10-NEXT: s_cmp_lg_u32 s27, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] -; GFX10-NEXT: s_cmp_lg_u32 s26, 0 +; GFX10-NEXT: s_cmp_lg_u32 s16, 0 ; GFX10-NEXT: s_cselect_b64 s[8:9], s[22:23], 0 ; GFX10-NEXT: s_and_b32 s10, s20, 0x7f ; GFX10-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] @@ -7545,6 +7579,7 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX10-NEXT: s_cselect_b32 s22, 1, 0 ; GFX10-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 ; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], s20 +; GFX10-NEXT: s_and_b32 s21, s21, 1 ; GFX10-NEXT: s_lshl_b64 s[16:17], s[4:5], s20 ; GFX10-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] ; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], s19 @@ -7569,12 +7604,13 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX10-NEXT: s_lshl_b64 s[14:15], s[8:9], s14 ; GFX10-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 ; GFX10-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GFX10-NEXT: s_and_b32 s14, s19, 1 ; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s18 -; GFX10-NEXT: s_cmp_lg_u32 s19, 0 +; GFX10-NEXT: s_cmp_lg_u32 s14, 0 ; GFX10-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9] ; GFX10-NEXT: s_cmp_lg_u32 s20, 0 ; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] -; GFX10-NEXT: s_cmp_lg_u32 s19, 0 +; GFX10-NEXT: s_cmp_lg_u32 s14, 0 ; GFX10-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 ; GFX10-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] ; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] @@ -7592,10 +7628,11 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX11-NEXT: s_cselect_b32 s17, 1, 0 ; GFX11-NEXT: s_lshr_b64 s[22:23], s[0:1], s21 ; GFX11-NEXT: s_lshl_b64 s[24:25], s[2:3], s16 +; GFX11-NEXT: s_and_b32 s21, s28, 1 ; GFX11-NEXT: s_lshl_b64 s[26:27], s[0:1], s16 ; GFX11-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25] ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s19 -; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 ; GFX11-NEXT: s_cselect_b64 s[24:25], s[26:27], 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1] ; GFX11-NEXT: s_cmp_lg_u32 s17, 0 @@ -7616,12 +7653,13 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX11-NEXT: s_lshl_b64 s[16:17], s[8:9], s16 ; GFX11-NEXT: s_lshr_b64 s[22:23], s[8:9], s19 ; GFX11-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX11-NEXT: s_and_b32 s16, s26, 1 ; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s21 -; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 ; GFX11-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] ; 
GFX11-NEXT: s_cmp_lg_u32 s27, 0
 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9]
-; GFX11-NEXT: s_cmp_lg_u32 s26, 0
+; GFX11-NEXT: s_cmp_lg_u32 s16, 0
 ; GFX11-NEXT: s_cselect_b64 s[8:9], s[22:23], 0
 ; GFX11-NEXT: s_and_b32 s10, s20, 0x7f
 ; GFX11-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1]
@@ -7634,6 +7672,7 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX11-NEXT: s_cselect_b32 s22, 1, 0
 ; GFX11-NEXT: s_lshr_b64 s[8:9], s[4:5], s8
 ; GFX11-NEXT: s_lshl_b64 s[10:11], s[6:7], s20
+; GFX11-NEXT: s_and_b32 s21, s21, 1
 ; GFX11-NEXT: s_lshl_b64 s[16:17], s[4:5], s20
 ; GFX11-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
 ; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s19
@@ -7658,12 +7697,13 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX11-NEXT: s_lshl_b64 s[14:15], s[8:9], s14
 ; GFX11-NEXT: s_lshr_b64 s[16:17], s[8:9], s16
 ; GFX11-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
+; GFX11-NEXT: s_and_b32 s14, s19, 1
 ; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s18
-; GFX11-NEXT: s_cmp_lg_u32 s19, 0
+; GFX11-NEXT: s_cmp_lg_u32 s14, 0
 ; GFX11-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9]
 ; GFX11-NEXT: s_cmp_lg_u32 s20, 0
 ; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9]
-; GFX11-NEXT: s_cmp_lg_u32 s19, 0
+; GFX11-NEXT: s_cmp_lg_u32 s14, 0
 ; GFX11-NEXT: s_cselect_b64 s[8:9], s[16:17], 0
 ; GFX11-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5]
 ; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index 58304d2072d7f6..24ca00da0aee9f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -3610,8 +3610,8 @@ define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
 ; GFX6-NEXT: s_and_b32 s3, 0xffff, s4
 ; GFX6-NEXT: s_lshr_b32 s2, s2, s3
 ; GFX6-NEXT: s_or_b32 s1, s1, s2
-; GFX6-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX6-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT: s_or_b32 s0, s0, s1
 ; GFX6-NEXT: ; return to shader part epilog
@@ -4331,12 +4331,12 @@ define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <
 ; GFX6-NEXT: s_and_b32 s4, 0xffff, s4
 ; GFX6-NEXT: s_lshl_b32 s2, s2, s5
 ; GFX6-NEXT: s_lshr_b32 s3, s3, s4
-; GFX6-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
 ; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_and_b32 s1, 0xffff, s2
+; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
 ; GFX6-NEXT: ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_fshr_v3i16:
@@ -4347,8 +4347,9 @@ define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <
 ; GFX8-NEXT: s_lshl_b32 s0, s0, 1
 ; GFX8-NEXT: s_lshr_b32 s8, s8, 15
 ; GFX8-NEXT: s_or_b32 s0, s0, s8
+; GFX8-NEXT: s_and_b32 s8, 0xffff, s7
 ; GFX8-NEXT: s_lshl_b32 s6, s6, 1
-; GFX8-NEXT: s_lshr_b32 s8, s7, 15
+; GFX8-NEXT: s_lshr_b32 s8, s8, 15
 ; GFX8-NEXT: s_lshl_b32 s2, s2, 1
 ; GFX8-NEXT: s_xor_b32 s4, s4, -1
 ; GFX8-NEXT: s_or_b32 s6, s6, s8
@@ -4737,8 +4738,8 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX6-NEXT: s_and_b32 s5, 0xffff, s8
 ; GFX6-NEXT: s_lshr_b32 s4, s4, s5
 ; GFX6-NEXT: s_or_b32 s1, s1, s4
-; GFX6-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX6-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT: s_or_b32 s0, s0, s1
 ; GFX6-NEXT: s_lshl_b32 s1, s2, 1
@@ -4769,8 +4770,8 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX6-NEXT: s_and_b32 s4, 0xffff, s5
 ; GFX6-NEXT: s_lshr_b32 s3, s3, s4
 ; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX6-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX6-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16
 ; GFX6-NEXT: s_or_b32 s1, s1, s2
 ; GFX6-NEXT: ; return to shader part epilog
@@ -5857,7 +5858,8 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[10:11], s9
 ; GFX6-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
 ; GFX6-NEXT: s_lshl_b64 s[10:11], s[10:11], s16
-; GFX6-NEXT: s_cmp_lg_u32 s17, 0
+; GFX6-NEXT: s_and_b32 s9, s17, 1
+; GFX6-NEXT: s_cmp_lg_u32 s9, 0
 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
 ; GFX6-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
 ; GFX6-NEXT: s_cmp_lg_u32 s18, 0
@@ -5874,11 +5876,12 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX6-NEXT: s_lshl_b64 s[12:13], s[6:7], s12
 ; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
 ; GFX6-NEXT: s_lshr_b64 s[6:7], s[6:7], s14
-; GFX6-NEXT: s_cmp_lg_u32 s15, 0
+; GFX6-NEXT: s_and_b32 s12, s15, 1
+; GFX6-NEXT: s_cmp_lg_u32 s12, 0
 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7]
 ; GFX6-NEXT: s_cmp_lg_u32 s16, 0
 ; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
-; GFX6-NEXT: s_cmp_lg_u32 s15, 0
+; GFX6-NEXT: s_cmp_lg_u32 s12, 0
 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[0:1], 0
 ; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
 ; GFX6-NEXT: s_or_b64 s[2:3], s[10:11], s[6:7]
@@ -5904,7 +5907,8 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[10:11], s9
 ; GFX8-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
 ; GFX8-NEXT: s_lshl_b64 s[10:11], s[10:11], s16
-; GFX8-NEXT: s_cmp_lg_u32 s17, 0
+; GFX8-NEXT: s_and_b32 s9, s17, 1
+; GFX8-NEXT: s_cmp_lg_u32 s9, 0
 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
 ; GFX8-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
 ; GFX8-NEXT: s_cmp_lg_u32 s18, 0
@@ -5921,11 +5925,12 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX8-NEXT: s_lshl_b64 s[12:13], s[6:7], s12
 ; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
 ; GFX8-NEXT: s_lshr_b64 s[6:7], s[6:7], s14
-; GFX8-NEXT: s_cmp_lg_u32 s15, 0
+; GFX8-NEXT: s_and_b32 s12, s15, 1
+; GFX8-NEXT: s_cmp_lg_u32 s12, 0
 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7]
 ; GFX8-NEXT: s_cmp_lg_u32 s16, 0
 ; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u32 s15, 0
+; GFX8-NEXT: s_cmp_lg_u32 s12, 0
 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[0:1], 0
 ; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
 ; GFX8-NEXT: s_or_b64 s[2:3], s[10:11], s[6:7]
@@ -5951,7 +5956,8 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[10:11], s9
 ; GFX9-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
 ; GFX9-NEXT: s_lshl_b64 s[10:11], s[10:11], s16
-; GFX9-NEXT: s_cmp_lg_u32 s17, 0
+; GFX9-NEXT: s_and_b32 s9, s17, 1
+; GFX9-NEXT: s_cmp_lg_u32 s9, 0
 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
 ; GFX9-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
 ; GFX9-NEXT: s_cmp_lg_u32 s18, 0
@@ -5968,11 +5974,12 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX9-NEXT: s_lshl_b64 s[12:13], s[6:7], s12
 ; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
 ; GFX9-NEXT: s_lshr_b64 s[6:7], s[6:7], s14
-; GFX9-NEXT: s_cmp_lg_u32 s15, 0
+; GFX9-NEXT: s_and_b32 s12, s15, 1
+; GFX9-NEXT: s_cmp_lg_u32 s12, 0
 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7]
 ; GFX9-NEXT: s_cmp_lg_u32 s16, 0
 ; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u32 s15, 0
+; GFX9-NEXT: s_cmp_lg_u32 s12, 0
 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[0:1], 0
 ; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
 ; GFX9-NEXT: s_or_b64 s[2:3], s[10:11], s[6:7]
@@ -5995,6 +6002,7 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX10-NEXT: s_cselect_b32 s9, 1, 0
 ; GFX10-NEXT: s_lshr_b64 s[10:11], s[0:1], s10
 ; GFX10-NEXT: s_lshl_b64 s[12:13], s[2:3], s14
+; GFX10-NEXT: s_and_b32 s17, s17, 1
 ; GFX10-NEXT: s_lshl_b64 s[14:15], s[0:1], s14
 ; GFX10-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13]
 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s16
@@ -6014,12 +6022,13 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], s9
 ; GFX10-NEXT: s_lshr_b64 s[8:9], s[6:7], s8
 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11]
+; GFX10-NEXT: s_and_b32 s10, s15, 1
 ; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], s14
-; GFX10-NEXT: s_cmp_lg_u32 s15, 0
+; GFX10-NEXT: s_cmp_lg_u32 s10, 0
 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7]
 ; GFX10-NEXT: s_cmp_lg_u32 s16, 0
 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1]
-; GFX10-NEXT: s_cmp_lg_u32 s15, 0
+; GFX10-NEXT: s_cmp_lg_u32 s10, 0
 ; GFX10-NEXT: s_cselect_b64 s[4:5], s[8:9], 0
 ; GFX10-NEXT: s_or_b64 s[0:1], s[12:13], s[0:1]
 ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
@@ -6042,6 +6051,7 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX11-NEXT: s_cselect_b32 s9, 1, 0
 ; GFX11-NEXT: s_lshr_b64 s[10:11], s[0:1], s10
 ; GFX11-NEXT: s_lshl_b64 s[12:13], s[2:3], s14
+; GFX11-NEXT: s_and_b32 s17, s17, 1
 ; GFX11-NEXT: s_lshl_b64 s[14:15], s[0:1], s14
 ; GFX11-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13]
 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s16
@@ -6062,12 +6072,13 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX11-NEXT: s_lshl_b64 s[10:11], s[6:7], s9
 ; GFX11-NEXT: s_lshr_b64 s[8:9], s[6:7], s8
 ; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11]
+; GFX11-NEXT: s_and_b32 s10, s15, 1
 ; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s14
-; GFX11-NEXT: s_cmp_lg_u32 s15, 0
+; GFX11-NEXT: s_cmp_lg_u32 s10, 0
 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7]
 ; GFX11-NEXT: s_cmp_lg_u32 s16, 0
 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1]
-; GFX11-NEXT: s_cmp_lg_u32 s15, 0
+; GFX11-NEXT: s_cmp_lg_u32 s10, 0
 ; GFX11-NEXT: s_cselect_b64 s[4:5], s[8:9], 0
 ; GFX11-NEXT: s_or_b64 s[0:1], s[12:13], s[0:1]
 ; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
@@ -6606,7 +6617,8 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[6:7], s5
 ; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
 ; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], s12
-; GFX6-NEXT: s_cmp_lg_u32 s13, 0
+; GFX6-NEXT: s_and_b32 s5, s13, 1
+; GFX6-NEXT: s_cmp_lg_u32 s5, 0
 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7]
 ; GFX6-NEXT: s_cmp_lg_u32 s14, 0
@@ -6660,7 +6672,8 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s5
 ; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
 ; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], s12
-; GFX8-NEXT: s_cmp_lg_u32 s13, 0
+; GFX8-NEXT: s_and_b32 s5, s13, 1
+; GFX8-NEXT: s_cmp_lg_u32 s5, 0
 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7]
 ; GFX8-NEXT: s_cmp_lg_u32 s14, 0
@@ -6714,7 +6727,8 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[6:7], s5
 ; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
 ; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], s12
-; GFX9-NEXT: s_cmp_lg_u32 s13, 0
+; GFX9-NEXT: s_and_b32 s5, s13, 1
+; GFX9-NEXT: s_cmp_lg_u32 s5, 0
 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7]
 ; GFX9-NEXT: s_cmp_lg_u32 s14, 0
@@ -6765,6 +6779,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX10-NEXT: s_cselect_b32 s5, 1, 0
 ; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s6
 ; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], s10
+; GFX10-NEXT: s_and_b32 s13, s13, 1
 ; GFX10-NEXT: s_lshl_b64 s[10:11], s[0:1], s10
 ; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s12
@@ -6819,6 +6834,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX11-NEXT: s_cselect_b32 s5, 1, 0
 ; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s6
 ; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], s10
+; GFX11-NEXT: s_and_b32 s13, s13, 1
 ; GFX11-NEXT: s_lshl_b64 s[10:11], s[0:1], s10
 ; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s12
@@ -6903,11 +6919,12 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s8
 ; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
-; GFX6-NEXT: s_cmp_lg_u32 s11, 0
+; GFX6-NEXT: s_and_b32 s8, s11, 1
+; GFX6-NEXT: s_cmp_lg_u32 s8, 0
 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3]
 ; GFX6-NEXT: s_cmp_lg_u32 s12, 0
 ; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX6-NEXT: s_cmp_lg_u32 s11, 0
+; GFX6-NEXT: s_cmp_lg_u32 s8, 0
 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
 ; GFX6-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[6:7], 0
@@ -6956,11 +6973,12 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX8-NEXT: s_lshl_b64 s[8:9], s[2:3], s8
 ; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
-; GFX8-NEXT: s_cmp_lg_u32 s11, 0
+; GFX8-NEXT: s_and_b32 s8, s11, 1
+; GFX8-NEXT: s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3]
 ; GFX8-NEXT: s_cmp_lg_u32 s12, 0
 ; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX8-NEXT: s_cmp_lg_u32 s11, 0
+; GFX8-NEXT: s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[6:7], 0
@@ -7009,11 +7027,12 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s8
 ; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
-; GFX9-NEXT: s_cmp_lg_u32 s11, 0
+; GFX9-NEXT: s_and_b32 s8, s11, 1
+; GFX9-NEXT: s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3]
 ; GFX9-NEXT: s_cmp_lg_u32 s12, 0
 ; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u32 s11, 0
+; GFX9-NEXT: s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[6:7], 0
@@ -7061,14 +7080,15 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], s8
 ; GFX10-NEXT: s_lshr_b64 s[4:5], s[2:3], s4
 ; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX10-NEXT: s_and_b32 s8, s11, 1
 ; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
-; GFX10-NEXT: s_cmp_lg_u32 s11, 0
+; GFX10-NEXT: s_cmp_lg_u32 s8, 0
 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo
 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3]
 ; GFX10-NEXT: s_cmp_lg_u32 s12, 0
 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo
 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX10-NEXT: s_cmp_lg_u32 s11, 0
+; GFX10-NEXT: s_cmp_lg_u32 s8, 0
 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v6
 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], 0
 ; GFX10-NEXT: v_or_b32_e32 v1, s1, v7
@@ -7113,13 +7133,14 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], s8
 ; GFX11-NEXT: s_lshr_b64 s[4:5], s[2:3], s4
 ; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX11-NEXT: s_and_b32 s8, s11, 1
 ; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
-; GFX11-NEXT: s_cmp_lg_u32 s11, 0
+; GFX11-NEXT: s_cmp_lg_u32 s8, 0
 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v0, v2 :: v_dual_cndmask_b32 v3, v1, v3
 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3]
 ; GFX11-NEXT: s_cmp_lg_u32 s12, 0
 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX11-NEXT: s_cmp_lg_u32 s11, 0
+; GFX11-NEXT: s_cmp_lg_u32 s8, 0
 ; GFX11-NEXT: v_or_b32_e32 v0, s0, v6
 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[4:5], 0
 ; GFX11-NEXT: v_or_b32_e32 v1, s1, v7
@@ -7277,7 +7298,8 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[18:19], s17
 ; GFX6-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27]
 ; GFX6-NEXT: s_lshl_b64 s[18:19], s[18:19], s21
-; GFX6-NEXT: s_cmp_lg_u32 s28, 0
+; GFX6-NEXT: s_and_b32 s17, s28, 1
+; GFX6-NEXT: s_cmp_lg_u32 s17, 0
 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
 ; GFX6-NEXT: s_cselect_b64 s[18:19], s[24:25], s[18:19]
 ; GFX6-NEXT: s_cmp_lg_u32 s29, 0
@@ -7294,11 +7316,12 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX6-NEXT: s_lshl_b64 s[24:25], s[10:11], s22
 ; GFX6-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25]
 ; GFX6-NEXT: s_lshr_b64 s[10:11], s[10:11], s21
-; GFX6-NEXT: s_cmp_lg_u32 s26, 0
+; GFX6-NEXT: s_and_b32 s21, s26, 1
+; GFX6-NEXT: s_cmp_lg_u32 s21, 0
 ; GFX6-NEXT: s_cselect_b64 s[10:11], s[16:17], s[10:11]
 ; GFX6-NEXT: s_cmp_lg_u32 s27, 0
 ; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11]
-; GFX6-NEXT: s_cmp_lg_u32 s26, 0
+; GFX6-NEXT: s_cmp_lg_u32 s21, 0
 ; GFX6-NEXT: s_cselect_b64 s[10:11], s[0:1], 0
 ; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
 ; GFX6-NEXT: s_lshr_b32 s22, s5, 31
@@ -7319,7 +7342,8 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX6-NEXT: s_lshl_b64 s[16:17], s[4:5], s16
 ; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
 ; GFX6-NEXT: s_lshl_b64 s[8:9], s[8:9], s18
-; GFX6-NEXT: s_cmp_lg_u32 s19, 0
+; GFX6-NEXT: s_and_b32 s16, s19, 1
+; GFX6-NEXT: s_cmp_lg_u32 s16, 0
 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], 0
 ; GFX6-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9]
 ; GFX6-NEXT: s_cmp_lg_u32 s21, 0
@@ -7336,11 +7360,12 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX6-NEXT: s_lshr_b64 s[4:5], s[14:15], s20
 ; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
 ; GFX6-NEXT: s_lshr_b64 s[14:15], s[14:15], s18
-; GFX6-NEXT: s_cmp_lg_u32 s19, 0
+; GFX6-NEXT: s_and_b32 s16, s19, 1
+; GFX6-NEXT: s_cmp_lg_u32 s16, 0
 ; GFX6-NEXT: s_cselect_b64 s[10:11], s[10:11], s[14:15]
 ; GFX6-NEXT: s_cmp_lg_u32 s21, 0
 ; GFX6-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
-; GFX6-NEXT: s_cmp_lg_u32 s19, 0
+; GFX6-NEXT: s_cmp_lg_u32 s16, 0
 ; GFX6-NEXT: s_cselect_b64 s[12:13], s[4:5], 0
 ; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[10:11]
 ; GFX6-NEXT: s_or_b64 s[6:7], s[8:9], s[12:13]
@@ -7366,7 +7391,8 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[18:19], s17
 ; GFX8-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27]
 ; GFX8-NEXT: s_lshl_b64 s[18:19], s[18:19], s21
-; GFX8-NEXT: s_cmp_lg_u32 s28, 0
+; GFX8-NEXT: s_and_b32 s17, s28, 1
+; GFX8-NEXT: s_cmp_lg_u32 s17, 0
 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
 ; GFX8-NEXT: s_cselect_b64 s[18:19], s[24:25], s[18:19]
 ; GFX8-NEXT: s_cmp_lg_u32 s29, 0
@@ -7383,11 +7409,12 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX8-NEXT: s_lshl_b64 s[24:25], s[10:11], s22
 ; GFX8-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25]
 ; GFX8-NEXT: s_lshr_b64 s[10:11], s[10:11], s21
-; GFX8-NEXT: s_cmp_lg_u32 s26, 0
+; GFX8-NEXT: s_and_b32 s21, s26, 1
+; GFX8-NEXT: s_cmp_lg_u32 s21, 0
 ; GFX8-NEXT: s_cselect_b64 s[10:11], s[16:17], s[10:11]
 ; GFX8-NEXT: s_cmp_lg_u32 s27, 0
 ; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11]
-; GFX8-NEXT: s_cmp_lg_u32 s26, 0
+; GFX8-NEXT: s_cmp_lg_u32 s21, 0
 ; GFX8-NEXT: s_cselect_b64 s[10:11], s[0:1], 0
 ; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
 ; GFX8-NEXT: s_lshr_b32 s22, s5, 31
@@ -7408,7 +7435,8 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX8-NEXT: s_lshl_b64 s[16:17], s[4:5], s16
 ; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
 ; GFX8-NEXT: s_lshl_b64 s[8:9], s[8:9], s18
-; GFX8-NEXT: s_cmp_lg_u32 s19, 0
+; GFX8-NEXT: s_and_b32 s16, s19, 1
+; GFX8-NEXT: s_cmp_lg_u32 s16, 0
 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], 0
 ; GFX8-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9]
 ; GFX8-NEXT: s_cmp_lg_u32 s21, 0
@@ -7425,11 +7453,12 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX8-NEXT: s_lshr_b64 s[4:5], s[14:15], s20
 ; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
 ; GFX8-NEXT: s_lshr_b64 s[14:15], s[14:15], s18
-; GFX8-NEXT: s_cmp_lg_u32 s19, 0
+; GFX8-NEXT: s_and_b32 s16, s19, 1
+; GFX8-NEXT: s_cmp_lg_u32 s16, 0
 ; GFX8-NEXT: s_cselect_b64 s[10:11], s[10:11], s[14:15]
 ; GFX8-NEXT: s_cmp_lg_u32 s21, 0
 ; GFX8-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
-; GFX8-NEXT: s_cmp_lg_u32 s19, 0
+; GFX8-NEXT: s_cmp_lg_u32 s16, 0
 ; GFX8-NEXT: s_cselect_b64 s[12:13], s[4:5], 0
 ; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[10:11]
 ; GFX8-NEXT: s_or_b64 s[6:7], s[8:9], s[12:13]
@@ -7455,7 +7484,8 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[18:19], s17
 ; GFX9-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27]
 ; GFX9-NEXT: s_lshl_b64 s[18:19], s[18:19], s21
-; GFX9-NEXT: s_cmp_lg_u32 s28, 0
+; GFX9-NEXT: s_and_b32 s17, s28, 1
+; GFX9-NEXT: s_cmp_lg_u32 s17, 0
 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
 ; GFX9-NEXT: s_cselect_b64 s[18:19], s[24:25], s[18:19]
 ; GFX9-NEXT: s_cmp_lg_u32 s29, 0
@@ -7472,11 +7502,12 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX9-NEXT: s_lshl_b64 s[24:25], s[10:11], s22
 ; GFX9-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25]
 ; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s21
-; GFX9-NEXT: s_cmp_lg_u32 s26, 0
+; GFX9-NEXT: s_and_b32 s21, s26, 1
+; GFX9-NEXT: s_cmp_lg_u32 s21, 0
 ; GFX9-NEXT: s_cselect_b64 s[10:11], s[16:17], s[10:11]
 ; GFX9-NEXT: s_cmp_lg_u32 s27, 0
 ; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11]
-; GFX9-NEXT: s_cmp_lg_u32 s26, 0
+; GFX9-NEXT: s_cmp_lg_u32 s21, 0
 ; GFX9-NEXT: s_cselect_b64 s[10:11], s[0:1], 0
 ; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
 ; GFX9-NEXT: s_lshr_b32 s22, s5, 31
@@ -7497,7 +7528,8 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX9-NEXT: s_lshl_b64 s[16:17], s[4:5], s16
 ; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
 ; GFX9-NEXT: s_lshl_b64 s[8:9], s[8:9], s18
-; GFX9-NEXT: s_cmp_lg_u32 s19, 0
+; GFX9-NEXT: s_and_b32 s16, s19, 1
+; GFX9-NEXT: s_cmp_lg_u32 s16, 0
 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], 0
 ; GFX9-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9]
 ; GFX9-NEXT: s_cmp_lg_u32 s21, 0
@@ -7514,11 +7546,12 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[14:15], s20
 ; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
 ; GFX9-NEXT: s_lshr_b64 s[14:15], s[14:15], s18
-; GFX9-NEXT: s_cmp_lg_u32 s19, 0
+; GFX9-NEXT: s_and_b32 s16, s19, 1
+; GFX9-NEXT: s_cmp_lg_u32 s16, 0
 ; GFX9-NEXT: s_cselect_b64 s[10:11], s[10:11], s[14:15]
 ; GFX9-NEXT: s_cmp_lg_u32 s21, 0
 ; GFX9-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
-; GFX9-NEXT: s_cmp_lg_u32 s19, 0
+; GFX9-NEXT: s_cmp_lg_u32 s16, 0
 ; GFX9-NEXT: s_cselect_b64 s[12:13], s[4:5], 0
 ; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[10:11]
 ; GFX9-NEXT: s_or_b64 s[6:7], s[8:9], s[12:13]
@@ -7542,9 +7575,10 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX10-NEXT: s_lshr_b64 s[22:23], s[0:1], s22
 ; GFX10-NEXT: s_lshl_b64 s[24:25], s[2:3], s18
 ; GFX10-NEXT: s_lshl_b64 s[26:27], s[0:1], s18
+; GFX10-NEXT: s_and_b32 s18, s28, 1
 ; GFX10-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25]
 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s21
-; GFX10-NEXT: s_cmp_lg_u32 s28, 0
+; GFX10-NEXT: s_cmp_lg_u32 s18, 0
 ; GFX10-NEXT: s_cselect_b64 s[24:25], s[26:27], 0
 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1]
 ; GFX10-NEXT: s_cmp_lg_u32 s17, 0
@@ -7558,6 +7592,7 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX10-NEXT: s_cselect_b32 s26, 1, 0
 ; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s16
 ; GFX10-NEXT: s_lshl_b64 s[22:23], s[10:11], s17
+; GFX10-NEXT: s_and_b32 s21, s21, 1
 ; GFX10-NEXT: s_lshr_b64 s[16:17], s[10:11], s16
 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23]
 ; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], s18
@@ -7583,6 +7618,7 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX10-NEXT: s_cselect_b32 s21, 1, 0
 ; GFX10-NEXT: s_lshr_b64 s[8:9], s[4:5], s9
 ; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], s16
+; GFX10-NEXT: s_and_b32 s19, s19, 1
 ; GFX10-NEXT: s_lshl_b64 s[16:17], s[4:5], s16
 ; GFX10-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
 ; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], s18
@@ -7600,6 +7636,7 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX10-NEXT: s_cselect_b32 s21, 1, 0
 ; GFX10-NEXT: s_lshr_b64 s[4:5], s[12:13], s20
 ; GFX10-NEXT: s_lshl_b64 s[8:9], s[14:15], s8
+; GFX10-NEXT: s_and_b32 s19, s19, 1
 ; GFX10-NEXT: s_lshr_b64 s[16:17], s[14:15], s20
 ; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX10-NEXT: s_lshr_b64 s[8:9], s[14:15], s18
@@ -7631,9 +7668,10 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX11-NEXT: s_lshr_b64 s[22:23], s[0:1], s22
 ; GFX11-NEXT: s_lshl_b64 s[24:25], s[2:3], s18
 ; GFX11-NEXT: s_lshl_b64 s[26:27], s[0:1], s18
+; GFX11-NEXT: s_and_b32 s18, s28, 1
 ; GFX11-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25]
 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s21
-; GFX11-NEXT: s_cmp_lg_u32 s28, 0
+; GFX11-NEXT: s_cmp_lg_u32 s18, 0
 ; GFX11-NEXT: s_cselect_b64 s[24:25], s[26:27], 0
 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1]
 ; GFX11-NEXT: s_cmp_lg_u32 s17, 0
@@ -7648,6 +7686,7 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX11-NEXT: s_cselect_b32 s26, 1, 0
 ; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], s16
 ; GFX11-NEXT: s_lshl_b64 s[22:23], s[10:11], s17
+; GFX11-NEXT: s_and_b32 s21, s21, 1
 ; GFX11-NEXT: s_lshr_b64 s[16:17], s[10:11], s16
 ; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23]
 ; GFX11-NEXT: s_lshr_b64 s[10:11], s[10:11], s18
@@ -7673,6 +7712,7 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX11-NEXT: s_cselect_b32 s21, 1, 0
 ; GFX11-NEXT: s_lshr_b64 s[8:9], s[4:5], s9
 ; GFX11-NEXT: s_lshl_b64 s[10:11], s[6:7], s16
+; GFX11-NEXT: s_and_b32 s19, s19, 1
 ; GFX11-NEXT: s_lshl_b64 s[16:17], s[4:5], s16
 ; GFX11-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
 ; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s18
@@ -7691,6 +7731,7 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX11-NEXT: s_cselect_b32 s21, 1, 0
 ; GFX11-NEXT: s_lshr_b64 s[4:5], s[12:13], s20
 ; GFX11-NEXT: s_lshl_b64 s[8:9], s[14:15], s8
+; GFX11-NEXT: s_and_b32 s19, s19, 1
 ; GFX11-NEXT: s_lshr_b64 s[16:17], s[14:15], s20
 ; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX11-NEXT: s_lshr_b64 s[8:9], s[14:15], s18
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 5dd4fa0809131f..f78c6a514a8d87 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -1761,11 +1761,12 @@ define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) {
 ; GCN-NEXT: s_lshl_b64 s[8:9], s[4:5], s8
 ; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
 ; GCN-NEXT: s_lshr_b64 s[4:5], s[4:5], s10
-; GCN-NEXT: s_cmp_lg_u32 s11, 0
+; GCN-NEXT: s_and_b32 s7, s11, 1
+; GCN-NEXT: s_cmp_lg_u32 s7, 0
 ; GCN-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
 ; GCN-NEXT: s_cmp_lg_u32 s12, 0
 ; GCN-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GCN-NEXT: s_cmp_lg_u32 s11, 0
+; GCN-NEXT: s_cmp_lg_u32 s7, 0
 ; GCN-NEXT: s_cselect_b32 s2, s6, 0
 ; GCN-NEXT: ; return to shader part epilog
 ;
@@ -1781,13 +1782,14 @@ define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) {
 ; GFX10PLUS-NEXT: s_lshr_b64 s[6:7], s[0:1], s3
 ; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[4:5], s2
 ; GFX10PLUS-NEXT: s_lshr_b64 s[2:3], s[4:5], s3
+; GFX10PLUS-NEXT: s_and_b32 s3, s11, 1
 ; GFX10PLUS-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
 ; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[4:5], s10
-; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s3, 0
 ; GFX10PLUS-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5]
 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s12, 0
 ; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
-; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s3, 0
 ; GFX10PLUS-NEXT: s_cselect_b32 s2, s2, 0
 ; GFX10PLUS-NEXT: ; return to shader part epilog
 %result = lshr i65 %value, %amount
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
index 4cf1c92539c36f..de14c856bfd2ed 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
@@ -1731,7 +1731,8 @@ define amdgpu_ps i65 @s_shl_i65(i65 inreg %value, i65 inreg %amount) {
 ; GCN-NEXT: s_lshl_b64 s[4:5], s[0:1], s3
 ; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
 ; GCN-NEXT: s_lshl_b64 s[8:9], s[0:1], s10
-; GCN-NEXT: s_cmp_lg_u32 s11, 0
+; GCN-NEXT: s_and_b32 s3, s11, 1
+; GCN-NEXT: s_cmp_lg_u32 s3, 0
 ; GCN-NEXT: s_cselect_b64 s[0:1], s[4:5], 0
 ; GCN-NEXT: s_cselect_b32 s3, s6, s8
 ; GCN-NEXT: s_cmp_lg_u32 s12, 0
@@ -1749,9 +1750,10 @@ define amdgpu_ps i65 @s_shl_i65(i65 inreg %value, i65 inreg %amount) {
 ; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[0:1], s4
 ; GFX10PLUS-NEXT: s_lshl_b64 s[6:7], s[2:3], s3
 ; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[0:1], s3
+; GFX10PLUS-NEXT: s_and_b32 s3, s11, 1
 ; GFX10PLUS-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
 ; GFX10PLUS-NEXT: s_lshl_b64 s[6:7], s[0:1], s10
-; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s3, 0
 ; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[8:9], 0
 ; GFX10PLUS-NEXT: s_cselect_b32 s3, s4, s6
 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s12, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
index 788692c94b0cfa..7884f174590eaa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -3093,6 +3093,9 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX6-NEXT: s_addc_u32 s1, s1, s5
 ; GFX6-NEXT: s_addc_u32 s2, s2, s6
 ; GFX6-NEXT: s_addc_u32 s3, s3, s7
+; GFX6-NEXT: s_cselect_b32 s4, 1, 0
+; GFX6-NEXT: s_and_b32 s4, s4, 1
+; GFX6-NEXT: s_cmp_lg_u32 s4, 0
 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX6-NEXT: s_cselect_b64 s[2:3], -1, s[2:3]
 ; GFX6-NEXT: ; return to shader part epilog
@@ -3103,6 +3106,9 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX8-NEXT: s_addc_u32 s1, s1, s5
 ; GFX8-NEXT: s_addc_u32 s2, s2, s6
 ; GFX8-NEXT: s_addc_u32 s3, s3, s7
+; GFX8-NEXT: s_cselect_b32 s4, 1, 0
+; GFX8-NEXT: s_and_b32 s4, s4, 1
+; GFX8-NEXT: s_cmp_lg_u32 s4, 0
 ; GFX8-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX8-NEXT: s_cselect_b64 s[2:3], -1, s[2:3]
 ; GFX8-NEXT: ; return to shader part epilog
@@ -3113,6 +3119,9 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT: s_addc_u32 s1, s1, s5
 ; GFX9-NEXT: s_addc_u32 s2, s2, s6
 ; GFX9-NEXT: s_addc_u32 s3, s3, s7
+; GFX9-NEXT: s_cselect_b32 s4, 1, 0
+; GFX9-NEXT: s_and_b32 s4, s4, 1
+; GFX9-NEXT: s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, s[2:3]
 ; GFX9-NEXT: ; return to shader part epilog
@@ -3123,6 +3132,9 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s5
 ; GFX10PLUS-NEXT: s_addc_u32 s2, s2, s6
 ; GFX10PLUS-NEXT: s_addc_u32 s3, s3, s7
+; GFX10PLUS-NEXT: s_cselect_b32 s4, 1, 0
+; GFX10PLUS-NEXT: s_and_b32 s4, s4, 1
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s4, 0
 ; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], -1, s[2:3]
 ; GFX10PLUS-NEXT: ; return to shader part epilog
@@ -3370,12 +3382,18 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT: s_addc_u32 s1, s1, s9
 ; GFX6-NEXT: s_addc_u32 s2, s2, s10
 ; GFX6-NEXT: s_addc_u32 s3, s3, s11
+; GFX6-NEXT: s_cselect_b32 s8, 1, 0
+; GFX6-NEXT: s_and_b32 s8, s8, 1
+; GFX6-NEXT: s_cmp_lg_u32 s8, 0
 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX6-NEXT: s_cselect_b64 s[2:3], -1, s[2:3]
 ; GFX6-NEXT: s_add_u32 s4, s4, s12
 ; GFX6-NEXT: s_addc_u32 s5, s5, s13
 ; GFX6-NEXT: s_addc_u32 s6, s6, s14
 ; GFX6-NEXT: s_addc_u32 s7, s7, s15
+; GFX6-NEXT: s_cselect_b32 s8, 1, 0
+; GFX6-NEXT: s_and_b32 s8, s8, 1
+; GFX6-NEXT: s_cmp_lg_u32 s8, 0
 ; GFX6-NEXT: s_cselect_b64 s[4:5], -1, s[4:5]
 ; GFX6-NEXT: s_cselect_b64 s[6:7], -1, s[6:7]
 ; GFX6-NEXT: ; return to shader part epilog
@@ -3386,12 +3404,18 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT: s_addc_u32 s1, s1, s9
 ; GFX8-NEXT: s_addc_u32 s2, s2, s10
 ; GFX8-NEXT: s_addc_u32 s3, s3, s11
+; GFX8-NEXT: s_cselect_b32 s8, 1, 0
+; GFX8-NEXT: s_and_b32 s8, s8, 1
+; GFX8-NEXT: s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX8-NEXT: s_cselect_b64 s[2:3], -1, s[2:3]
 ; GFX8-NEXT: s_add_u32 s4, s4, s12
 ; GFX8-NEXT: s_addc_u32 s5, s5, s13
 ; GFX8-NEXT: s_addc_u32 s6, s6, s14
 ; GFX8-NEXT: s_addc_u32 s7, s7, s15
+; GFX8-NEXT: s_cselect_b32 s8, 1, 0
+; GFX8-NEXT: s_and_b32 s8, s8, 1
+; GFX8-NEXT: s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT: s_cselect_b64 s[4:5], -1, s[4:5]
 ; GFX8-NEXT: s_cselect_b64 s[6:7], -1, s[6:7]
 ; GFX8-NEXT: ; return to shader part epilog
@@ -3402,12 +3426,18 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT: s_addc_u32 s1, s1, s9
 ; GFX9-NEXT: s_addc_u32 s2, s2, s10
 ; GFX9-NEXT: s_addc_u32 s3, s3, s11
+; GFX9-NEXT: s_cselect_b32 s8, 1, 0
+; GFX9-NEXT: s_and_b32 s8, s8, 1
+; GFX9-NEXT: s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, s[2:3]
 ; GFX9-NEXT: s_add_u32 s4, s4, s12
 ; GFX9-NEXT: s_addc_u32 s5, s5, s13
 ; GFX9-NEXT: s_addc_u32 s6, s6, s14
 ; GFX9-NEXT: s_addc_u32 s7, s7, s15
+; GFX9-NEXT: s_cselect_b32 s8, 1, 0
+; GFX9-NEXT: s_and_b32 s8, s8, 1
+; GFX9-NEXT: s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT: s_cselect_b64 s[4:5], -1, s[4:5]
 ; GFX9-NEXT: s_cselect_b64 s[6:7], -1, s[6:7]
 ; GFX9-NEXT: ; return to shader part epilog
@@ -3418,12 +3448,18 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s9
 ; GFX10PLUS-NEXT: s_addc_u32 s2, s2, s10
 ; GFX10PLUS-NEXT: s_addc_u32 s3, s3, s11
+; GFX10PLUS-NEXT: s_cselect_b32 s8, 1, 0
+; GFX10PLUS-NEXT: s_and_b32 s8, s8, 1
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s8, 0
 ; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], -1, s[2:3]
 ; GFX10PLUS-NEXT: s_add_u32 s4, s4, s12
 ; GFX10PLUS-NEXT: s_addc_u32 s5, s5, s13
 ; GFX10PLUS-NEXT: s_addc_u32 s6, s6, s14
 ; GFX10PLUS-NEXT: s_addc_u32 s7, s7, s15
+; GFX10PLUS-NEXT: s_cselect_b32 s8, 1, 0
+; GFX10PLUS-NEXT: s_and_b32 s8, s8, 1
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s8, 0
 ; GFX10PLUS-NEXT: s_cselect_b64 s[4:5], -1, s[4:5]
 ; GFX10PLUS-NEXT: s_cselect_b64 s[6:7], -1, s[6:7]
 ; GFX10PLUS-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
index 0042d34e235d17..a8a5b76d93d592 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -2961,6 +2961,9 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX6-NEXT: s_subb_u32 s1, s1, s5
 ; GFX6-NEXT: s_subb_u32 s2, s2, s6
 ; GFX6-NEXT: s_subb_u32 s3, s3, s7
+; GFX6-NEXT: s_cselect_b32 s4, 1, 0
+; GFX6-NEXT: s_and_b32 s4, s4, 1
+; GFX6-NEXT: s_cmp_lg_u32 s4, 0
 ; GFX6-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
 ; GFX6-NEXT: s_cselect_b64 s[2:3], 0, s[2:3]
 ; GFX6-NEXT: ; return to shader part epilog
@@ -2971,6 +2974,9 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX8-NEXT: s_subb_u32 s1, s1, s5
 ; GFX8-NEXT: s_subb_u32 s2, s2, s6
 ; GFX8-NEXT: s_subb_u32 s3, s3, s7
+; GFX8-NEXT: s_cselect_b32 s4, 1, 0
+; GFX8-NEXT: s_and_b32 s4, s4, 1
+; GFX8-NEXT: s_cmp_lg_u32 s4, 0
 ; GFX8-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
 ; GFX8-NEXT: s_cselect_b64 s[2:3], 0, s[2:3]
 ; GFX8-NEXT: ; return to shader part epilog
@@ -2981,6 +2987,9 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT: s_subb_u32 s1, s1, s5
 ; GFX9-NEXT: s_subb_u32 s2, s2, s6
 ; GFX9-NEXT: s_subb_u32 s3, s3, s7
+; GFX9-NEXT: s_cselect_b32 s4, 1, 0
+; GFX9-NEXT: s_and_b32 s4, s4, 1
+; GFX9-NEXT: s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
 ; GFX9-NEXT: s_cselect_b64 s[2:3], 0, s[2:3]
 ; GFX9-NEXT: ; return to shader part epilog
@@ -2991,6 +3000,9 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX10PLUS-NEXT: s_subb_u32 s1, s1, s5
 ; GFX10PLUS-NEXT: s_subb_u32 s2, s2, s6
 ; GFX10PLUS-NEXT: s_subb_u32 s3, s3, s7
+; GFX10PLUS-NEXT: s_cselect_b32 s4, 1, 0
+; GFX10PLUS-NEXT: s_and_b32 s4, s4, 1
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s4, 0
 ; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
 ; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], 0, s[2:3]
 ; GFX10PLUS-NEXT: ; return to shader part epilog
@@ -3238,12 +3250,18 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT: s_subb_u32 s1, s1, s9
 ; GFX6-NEXT: s_subb_u32 s2, s2, s10
 ; GFX6-NEXT: s_subb_u32 s3, s3, s11
+; GFX6-NEXT: s_cselect_b32 s8, 1, 0
+; GFX6-NEXT: s_and_b32 s8, s8, 1
+; GFX6-NEXT: s_cmp_lg_u32 s8, 0
 ; GFX6-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
 ; GFX6-NEXT: s_cselect_b64 s[2:3], 0, s[2:3]
 ; GFX6-NEXT: s_sub_u32 s4, s4, s12
 ; GFX6-NEXT: s_subb_u32 s5, s5, s13
 ; GFX6-NEXT: s_subb_u32 s6, s6, s14
 ; GFX6-NEXT: s_subb_u32 s7, s7, s15
+; GFX6-NEXT: s_cselect_b32 s8, 1, 0
+; GFX6-NEXT: s_and_b32 s8, s8, 1
+; GFX6-NEXT: s_cmp_lg_u32 s8, 0
 ; GFX6-NEXT: s_cselect_b64 s[4:5], 0, s[4:5]
 ; GFX6-NEXT: s_cselect_b64 s[6:7], 0, s[6:7]
 ; GFX6-NEXT: ; return to shader part epilog
@@ -3254,12 +3272,18 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT: s_subb_u32 s1, s1, s9
 ; GFX8-NEXT: s_subb_u32 s2, s2, s10
 ; GFX8-NEXT: s_subb_u32 s3, s3, s11
+; GFX8-NEXT: s_cselect_b32 s8, 1, 0
+; GFX8-NEXT: s_and_b32 s8, s8, 1
+; GFX8-NEXT: s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
 ; GFX8-NEXT: s_cselect_b64 s[2:3], 0, s[2:3]
 ; GFX8-NEXT: s_sub_u32 s4, s4, s12
 ; GFX8-NEXT: s_subb_u32 s5, s5, s13
 ; GFX8-NEXT: s_subb_u32 s6, s6, s14
 ; GFX8-NEXT: s_subb_u32 s7, s7, s15
+; GFX8-NEXT: s_cselect_b32 s8, 1, 0
+; GFX8-NEXT: s_and_b32 s8, s8, 1
+; GFX8-NEXT: s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT: s_cselect_b64 s[4:5], 0, s[4:5]
 ; GFX8-NEXT: s_cselect_b64 s[6:7], 0, s[6:7]
 ; GFX8-NEXT: ; return to shader part epilog
@@ -3270,12 +3294,18 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT: s_subb_u32 s1, s1, s9
 ; GFX9-NEXT: s_subb_u32 s2, s2, s10
 ; GFX9-NEXT: s_subb_u32 s3, s3, s11
+; GFX9-NEXT: s_cselect_b32 s8, 1, 0
+; GFX9-NEXT: s_and_b32 s8, s8, 1
+; GFX9-NEXT: s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
 ; GFX9-NEXT: s_cselect_b64 s[2:3], 0, s[2:3]
 ; GFX9-NEXT: s_sub_u32 s4, s4, s12
 ; GFX9-NEXT: s_subb_u32 s5, s5, s13
 ; GFX9-NEXT: s_subb_u32 s6, s6, s14
 ; GFX9-NEXT: s_subb_u32 s7, s7, s15
+; GFX9-NEXT: s_cselect_b32 s8, 1, 0
+; GFX9-NEXT: s_and_b32 s8, s8, 1
+; GFX9-NEXT: s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT: s_cselect_b64 s[4:5], 0, s[4:5]
 ; GFX9-NEXT: s_cselect_b64 s[6:7], 0, s[6:7]
 ; GFX9-NEXT: ; return to shader part epilog
@@ -3286,12 +3316,18 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10PLUS-NEXT: s_subb_u32 s1, s1, s9
 ; GFX10PLUS-NEXT: s_subb_u32 s2, s2, s10
 ; GFX10PLUS-NEXT: s_subb_u32 s3, s3, s11
+; GFX10PLUS-NEXT: s_cselect_b32 s8, 1, 0
+; GFX10PLUS-NEXT: s_and_b32 s8, s8, 1
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s8, 0
 ; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
 ; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], 0, s[2:3]
 ; GFX10PLUS-NEXT: s_sub_u32 s4, s4, s12
 ; GFX10PLUS-NEXT: s_subb_u32 s5, s5, s13
 ; GFX10PLUS-NEXT: s_subb_u32 s6, s6, s14
 ; GFX10PLUS-NEXT: s_subb_u32 s7, s7, s15
+; GFX10PLUS-NEXT: s_cselect_b32 s8, 1, 0
+; GFX10PLUS-NEXT: s_and_b32 s8, s8, 1
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s8, 0
 ; GFX10PLUS-NEXT: s_cselect_b64 s[4:5], 0, s[4:5]
 ; GFX10PLUS-NEXT: s_cselect_b64 s[6:7], 0, s[6:7]
 ; GFX10PLUS-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
index 6b097bd71c9f14..fbbde156b8c714 100644
--- a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
+++ b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
@@ -3,20 +3,39 @@
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
 
 define amdgpu_cs float @v_s_exp_f32(float inreg %src) {
-; GFX12-LABEL: v_s_exp_f32:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_cmp_lt_f32 s0, 0xc2fc0000
-; GFX12-NEXT: s_cselect_b32 s1, 0x42800000, 0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
-; GFX12-NEXT: s_add_f32 s0, s0, s1
-; GFX12-NEXT: s_cselect_b32 s1, 0x1f800000, 1.0
-; GFX12-NEXT: v_s_exp_f32 s0, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
-; GFX12-NEXT: s_mul_f32 s0, s0, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v0, s0
-; GFX12-NEXT: ; return to shader part epilog
+; GFX12-SDAG-LABEL: v_s_exp_f32:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_cmp_lt_f32 s0, 0xc2fc0000
+; GFX12-SDAG-NEXT: s_cselect_b32 s1, 0x42800000, 0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
+; GFX12-SDAG-NEXT: s_add_f32 s0, s0, s1
+; GFX12-SDAG-NEXT: s_cselect_b32 s1, 0x1f800000, 1.0
+; GFX12-SDAG-NEXT: v_s_exp_f32 s0, s0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
+; GFX12-SDAG-NEXT: s_mul_f32 s0, s0, s1
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: v_s_exp_f32:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_cmp_lt_f32 s0, 0xc2fc0000
+; GFX12-GISEL-NEXT: s_cselect_b32 s1, 1, 0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: s_and_b32 s1, s1, 1
+; GFX12-GISEL-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12-GISEL-NEXT: s_cselect_b32 s2, 0x42800000, 0
+; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x1f800000, 1.0
+; GFX12-GISEL-NEXT: s_add_f32 s0, s0, s2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX12-GISEL-NEXT: v_s_exp_f32 s0, s0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_mul_f32 s0, s0, s1
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT: ; return to shader part epilog
 %result = call float @llvm.exp2.f32(float %src)
 ret float %result
 }
@@ -55,20 +74,39 @@ define amdgpu_cs half @v_s_amdgcn_exp_f16(half inreg %src) {
 }
 
 define amdgpu_cs float @v_s_log_f32(float inreg %src) {
-; GFX12-LABEL: v_s_log_f32:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_cmp_lt_f32 s0, 0x800000
-; GFX12-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
-; GFX12-NEXT: s_mul_f32 s0, s0, s1
-; GFX12-NEXT: s_cselect_b32 s1, 0x42000000, 0
-; GFX12-NEXT: v_s_log_f32 s0, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
-; GFX12-NEXT: s_sub_f32 s0, s0, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v0, s0
-; GFX12-NEXT: ; return to shader part epilog
+; GFX12-SDAG-LABEL: v_s_log_f32:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_cmp_lt_f32 s0, 0x800000
+; GFX12-SDAG-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
+; GFX12-SDAG-NEXT: s_mul_f32 s0, s0, s1
+; GFX12-SDAG-NEXT: s_cselect_b32 s1, 0x42000000, 0
+; GFX12-SDAG-NEXT: v_s_log_f32 s0, s0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
+; GFX12-SDAG-NEXT: s_sub_f32 s0, s0, s1
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: v_s_log_f32:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_cmp_lt_f32 s0, 0x800000
+; GFX12-GISEL-NEXT: s_cselect_b32 s1, 1, 0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: s_and_b32 s1, s1, 1
+; GFX12-GISEL-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12-GISEL-NEXT: s_cselect_b32 s2, 0x4f800000, 1.0
+; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x42000000, 0
+; GFX12-GISEL-NEXT: s_mul_f32 s0, s0, s2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX12-GISEL-NEXT: v_s_log_f32 s0, s0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_sub_f32 s0, s0, s1
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT: ; return to shader part epilog
 %result = call float @llvm.log2.f32(float %src)
 ret float %result
 }
@@ -205,32 +243,35 @@ define amdgpu_cs float @v_s_sqrt_f32(float inreg %src) {
 ; GFX12-GISEL-NEXT: s_cmp_lt_f32 s0, 0xf800000
 ; GFX12-GISEL-NEXT: s_mul_f32 s2, s0, 0x4f800000
 ; GFX12-GISEL-NEXT: s_cselect_b32 s1, 1, 0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: s_and_b32 s1, s1, 1
+; GFX12-GISEL-NEXT: s_cmp_lg_u32 s1, 0
 ; GFX12-GISEL-NEXT: s_cselect_b32 s0, s2, s0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(TRANS32_DEP_1)
 ; GFX12-GISEL-NEXT: v_s_sqrt_f32 s2, s0
 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
 ; GFX12-GISEL-NEXT: s_mov_b32 s4, s0
 ; GFX12-GISEL-NEXT: s_mov_b32 s6, s0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT: s_add_co_i32 s3, s2, -1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT: s_xor_b32 s5, s3, 0x80000000
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT: s_fmac_f32 s4, s5, s2
 ; GFX12-GISEL-NEXT: s_add_co_i32 s5, s2, 1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT: s_xor_b32 s7, s5, 0x80000000
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_2)
 ; GFX12-GISEL-NEXT: s_cmp_le_f32 s4, 0
 ; GFX12-GISEL-NEXT: s_fmac_f32 s6, s7, s2
 ; GFX12-GISEL-NEXT: s_cselect_b32 s2, s3, s2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_3) | instid1(SALU_CYCLE_3)
 ; GFX12-GISEL-NEXT: s_cmp_gt_f32 s6, 0
 ; GFX12-GISEL-NEXT: s_cselect_b32 s2, s5, s2
 ; GFX12-GISEL-NEXT: s_cmp_lg_u32 s1, 0
 ; GFX12-GISEL-NEXT: s_mul_f32 s3, s2, 0x37800000
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GFX12-GISEL-NEXT: s_cselect_b32 s1, s3, s2
 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s1
 ; GFX12-GISEL-NEXT: v_cmp_class_f32_e64 s1, s0, 0x260
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1
 ; GFX12-GISEL-NEXT: ; return to shader part epilog
 %result = call float @llvm.sqrt.f32(float %src)
@@ -271,22 +312,42 @@ define amdgpu_cs half @v_amdgcn_sqrt_f16(half inreg %src) {
 }
 
 define amdgpu_cs float @srcmods_abs_f32(float inreg %src) {
-; GFX12-LABEL: srcmods_abs_f32:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_bitset0_b32 s0, 31
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_cmp_lt_f32 s0, 0x800000
-; GFX12-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0
-; GFX12-NEXT: s_mul_f32 s0, s0, s1
-; GFX12-NEXT: s_cselect_b32 s1, 0x42000000, 0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
-; GFX12-NEXT: v_s_log_f32 s0, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_sub_f32 s0, s0, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
-; GFX12-NEXT: v_mov_b32_e32 v0, s0
-; GFX12-NEXT: ; return to shader part epilog
+; GFX12-SDAG-LABEL: srcmods_abs_f32:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_bitset0_b32 s0, 31
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_cmp_lt_f32 s0, 0x800000
+; GFX12-SDAG-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0
+; GFX12-SDAG-NEXT: s_mul_f32 s0, s0, s1
+; GFX12-SDAG-NEXT: s_cselect_b32 s1, 0x42000000, 0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX12-SDAG-NEXT: v_s_log_f32 s0, s0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_sub_f32 s0, s0, s1
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: srcmods_abs_f32:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_bitset0_b32 s0, 31
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: s_cmp_lt_f32 s0, 0x800000
+; GFX12-GISEL-NEXT: s_cselect_b32 s1, 1, 0
+; GFX12-GISEL-NEXT: s_and_b32 s1, s1, 1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_3)
+; GFX12-GISEL-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12-GISEL-NEXT: s_cselect_b32 s2, 0x4f800000, 1.0
+; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x42000000, 0
+; GFX12-GISEL-NEXT: s_mul_f32 s0, s0, s2
+; GFX12-GISEL-NEXT: v_s_log_f32 s0, s0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
+; GFX12-GISEL-NEXT: s_sub_f32 s0, s0, s1
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT: ; return to shader part epilog
 %abs = call float @llvm.fabs.f32(float %src)
 %result = call float @llvm.log2.f32(float %abs)
 ret float %result
@@ -314,15 +375,18 @@ define amdgpu_cs float @srcmods_neg_f32(float inreg %src) {
 ; GFX12-GISEL-NEXT: s_xor_b32 s0, s0, 0x80000000
 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT: s_cmp_lt_f32 s0, 0x800000
-; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0
-; GFX12-GISEL-NEXT: s_mul_f32 s0, s0, s1
+; GFX12-GISEL-NEXT: s_cselect_b32 s1, 1, 0
+; GFX12-GISEL-NEXT: s_and_b32 s1, s1, 1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_3)
+; GFX12-GISEL-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12-GISEL-NEXT: s_cselect_b32 s2, 0x4f800000, 1.0
 ; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x42000000, 0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX12-GISEL-NEXT: s_mul_f32 s0, s0, s2
 ; GFX12-GISEL-NEXT: v_s_log_f32 s0, s0
 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
 ; GFX12-GISEL-NEXT: s_sub_f32 s0, s0, s1
 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
 ; GFX12-GISEL-NEXT: ; return to shader part epilog
 %neg = fneg float %src
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip-rv64.ll b/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip-rv64.ll
index fd80afce6510e9..df3be2ef1f933f 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip-rv64.ll
@@ -105,7 +105,8 @@ entry:
 define i64 @zext_nneg_i32_i64(i32 %a) {
 ; RV64IM-LABEL: zext_nneg_i32_i64:
 ; RV64IM: # %bb.0: # %entry
-; RV64IM-NEXT: sext.w a0, a0
+; RV64IM-NEXT: slli a0, a0, 32
+; RV64IM-NEXT: srli a0, a0, 32
 ; RV64IM-NEXT: ret
 entry:
 %b = zext nneg i32 %a to i64
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/combine.mir b/llvm/test/CodeGen/RISCV/GlobalISel/combine.mir
index ef3fc4c9d5fae7..2ed0ea949231fd 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/combine.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/combine.mir
@@ -8,9 +8,9 @@ body: |
 ; RV64-LABEL: name: nneg_zext
 ; RV64: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
- ; RV64-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
- ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[TRUNC]](s32)
- ; RV64-NEXT: $x10 = COPY [[SEXT]](s64)
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
+ ; RV64-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]]
+ ; RV64-NEXT: $x10 = COPY [[AND]](s64)
 ; RV64-NEXT: PseudoRET implicit $x10
 %0:_(s64) = COPY $x10
 %2:_(s32) = G_TRUNC %0