7 changes: 4 additions & 3 deletions llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -388,9 +388,6 @@ class CombinerHelper {
/// Transform anyext(trunc(x)) to x.
bool matchCombineAnyExtTrunc(MachineInstr &MI, Register &Reg);

/// Transform zext(trunc(x)) to x.
bool matchCombineZextTrunc(MachineInstr &MI, Register &Reg);

/// Transform trunc (shl x, K) to shl (trunc x), K
/// if K < VT.getScalarSizeInBits().
///
@@ -918,6 +915,10 @@ class CombinerHelper {
bool matchCanonicalizeICmp(const MachineInstr &MI, BuildFnTy &MatchInfo);
bool matchCanonicalizeFCmp(const MachineInstr &MI, BuildFnTy &MatchInfo);

/// Transform zext of truncate to x or and(x, mask).
bool matchCombineZextTrunc(const MachineInstr &ZextMI,
const MachineInstr &TruncMI, BuildFnTy &MatchInfo);

private:
/// Checks for legality of an indexed variant of \p LdSt.
bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
22 changes: 11 additions & 11 deletions llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -758,15 +758,6 @@ def anyext_trunc_fold: GICombineRule <
(apply [{ Helper.replaceSingleDefInstWithReg(*${root}, ${matchinfo}); }])
>;

// Fold (zext (trunc x)) -> x if the source type is same as the destination type
// and truncated bits are known to be zero.
def zext_trunc_fold: GICombineRule <
(defs root:$root, register_matchinfo:$matchinfo),
(match (wip_match_opcode G_ZEXT):$root,
[{ return Helper.matchCombineZextTrunc(*${root}, ${matchinfo}); }]),
(apply [{ Helper.replaceSingleDefInstWithReg(*${root}, ${matchinfo}); }])
>;

def not_cmp_fold_matchinfo : GIDefMatchData<"SmallVector<Register, 4>">;
def not_cmp_fold : GICombineRule<
(defs root:$d, not_cmp_fold_matchinfo:$info),
@@ -1791,6 +1782,15 @@ class integer_of_opcode<Instruction castOpcode> : GICombineRule <

def integer_of_truncate : integer_of_opcode<G_TRUNC>;

/// Transform zext of truncate to x or and(x, mask).
def zext_of_truncate : GICombineRule <
(defs root:$root, build_fn_matchinfo:$matchinfo),
(match (G_TRUNC $trunc, $src):$TruncMI,
(G_ZEXT $root, $trunc):$ZextMI,
[{ return Helper.matchCombineZextTrunc(*${ZextMI}, *${TruncMI}, ${matchinfo}); }]),
(apply [{ Helper.applyBuildFn(*${ZextMI}, ${matchinfo}); }])>;


def cast_combines: GICombineGroup<[
truncate_of_zext,
truncate_of_sext,
@@ -1812,7 +1812,8 @@ def cast_combines: GICombineGroup<[
narrow_binop_and,
narrow_binop_or,
narrow_binop_xor,
integer_of_truncate
integer_of_truncate,
zext_of_truncate
]>;

def canonicalize_icmp : GICombineRule<
@@ -1869,7 +1870,6 @@ def const_combines : GICombineGroup<[constant_fold_fp_ops, const_ptradd_to_i2p,

def known_bits_simplifications : GICombineGroup<[
redundant_and, redundant_sext_inreg, redundant_or, urem_pow2_to_mask,
zext_trunc_fold,
sext_inreg_to_zext_inreg]>;

def width_reduction_combines : GICombineGroup<[reduce_shl_of_extend,
4 changes: 3 additions & 1 deletion llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
@@ -333,8 +333,10 @@ MachineInstrBuilder CSEMIRBuilder::buildConstant(const DstOp &Res,

// For vectors, CSE the element only for now.
LLT Ty = Res.getLLTTy(*getMRI());
if (Ty.isVector())
if (Ty.isFixedVector())
Author:
Crash in combine-with-flags.mir. Blame me.

return buildSplatBuildVector(Res, buildConstant(Ty.getElementType(), Val));
if (Ty.isScalableVector())
return buildSplatVector(Res, buildConstant(Ty.getElementType(), Val));
Comment on lines +336 to +339
Contributor:
This API is a mess. I would expect G_SPLAT_VECTOR to just handle fixed vectors.

Author:
G_BUILD_VECTOR is for fixed-length vectors. G_SPLAT_VECTOR is for scalable vectors. It takes a register and implicitly broadcasts it over the scalable vector.

Contributor:
Yes, but I don't understand why it would be that way (or why this would be an implementation detail users of the MachineIRBuilder would need to concern themselves with).

Author:
The underlying issue is that we never taught buildConstant about scalable vectors/types.
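
For context, a minimal sketch of the dispatch this hunk introduces into CSEMIRBuilder::buildConstant. The method and builder calls are taken from the diff; the surrounding CSE profiling is elided and the comments are mine:

```cpp
MachineInstrBuilder CSEMIRBuilder::buildConstant(const DstOp &Res,
                                                 const ConstantInt &Val) {
  // For vectors, CSE the element only for now.
  LLT Ty = Res.getLLTTy(*getMRI());
  // Fixed-length vector: materialize the scalar constant once, then emit an
  // explicit G_BUILD_VECTOR that repeats it for every element.
  if (Ty.isFixedVector())
    return buildSplatBuildVector(Res, buildConstant(Ty.getElementType(), Val));
  // Scalable vector: the element count is a runtime multiple, so emit a
  // G_SPLAT_VECTOR, which takes one scalar register and broadcasts it.
  if (Ty.isScalableVector())
    return buildSplatVector(Res, buildConstant(Ty.getElementType(), Val));
  // ...scalar path (FoldingSetNodeID profiling etc.) continues below.
}
```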


FoldingSetNodeID ID;
GISelInstProfileBuilder ProfBuilder(ID, *getMRI());
14 changes: 0 additions & 14 deletions llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -2526,20 +2526,6 @@ bool CombinerHelper::matchCombineAnyExtTrunc(MachineInstr &MI, Register &Reg) {
m_GTrunc(m_all_of(m_Reg(Reg), m_SpecificType(DstTy))));
}

bool CombinerHelper::matchCombineZextTrunc(MachineInstr &MI, Register &Reg) {
assert(MI.getOpcode() == TargetOpcode::G_ZEXT && "Expected a G_ZEXT");
Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
LLT DstTy = MRI.getType(DstReg);
if (mi_match(SrcReg, MRI,
m_GTrunc(m_all_of(m_Reg(Reg), m_SpecificType(DstTy))))) {
Author:
Here is the test in episode I of this combine.

unsigned DstSize = DstTy.getScalarSizeInBits();
unsigned SrcSize = MRI.getType(SrcReg).getScalarSizeInBits();
return KB->getKnownBits(Reg).countMinLeadingZeros() >= DstSize - SrcSize;
}
return false;
}

static LLT getMidVTForTruncRightShiftCombine(LLT ShiftTy, LLT TruncTy) {
const unsigned ShiftSize = ShiftTy.getScalarSizeInBits();
const unsigned TruncSize = TruncTy.getScalarSizeInBits();
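As context for the deletion above: the old zext_trunc_fold only fired when KnownBits could prove that the bits dropped by the trunc were already zero in the source. A minimal illustration of that condition, using the real llvm::KnownBits API but with a hand-constructed known-bits value standing in for analysis results:

```cpp
#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;

int main() {
  // Model x:s64 whose top 32 bits are known zero,
  // e.g. x = G_AND y, 0xFFFFFFFF.
  KnownBits Known(64);
  Known.Zero = APInt::getHighBitsSet(/*numBits=*/64, /*hiBitsSet=*/32);

  // zext(s64) of trunc(s32) of x: the trunc drops DstSize - SrcSize = 32
  // high bits, and the zext refills them with zeros.
  unsigned DstSize = 64; // result width of the G_ZEXT
  unsigned SrcSize = 32; // result width of the G_TRUNC
  bool CanFoldToX = Known.countMinLeadingZeros() >= DstSize - SrcSize;
  // CanFoldToX == true: the zeros the zext writes back equal the bits the
  // trunc dropped, so zext(trunc(x)) == x.
  return CanFoldToX ? 0 : 1;
}
```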
91 changes: 91 additions & 0 deletions llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp
@@ -359,3 +359,94 @@ bool CombinerHelper::matchCastOfInteger(const MachineInstr &CastMI,
return false;
}
}

bool CombinerHelper::matchCombineZextTrunc(const MachineInstr &ZextMI,
const MachineInstr &TruncMI,
BuildFnTy &MatchInfo) {
const GZext *Zext = cast<GZext>(&ZextMI);
const GTrunc *Trunc = cast<GTrunc>(&TruncMI);

Register Dst = Zext->getReg(0);
Register Mid = Zext->getSrcReg();
Register Src = Trunc->getSrcReg();

LLT DstTy = MRI.getType(Dst);
LLT SrcTy = MRI.getType(Src);

if (!MRI.hasOneNonDBGUse(Mid))
return false;

unsigned DstSize = DstTy.getScalarSizeInBits();
unsigned MidSize = MRI.getType(Mid).getScalarSizeInBits();
unsigned SrcSize = SrcTy.getScalarSizeInBits();

// Are the truncated bits known to be zero?
if (DstTy == SrcTy &&
(KB->getKnownBits(Src).countMinLeadingZeros() >= DstSize - MidSize)) {
MatchInfo = [=](MachineIRBuilder &B) { B.buildCopy(Dst, Src); };
return true;
}
Comment on lines +385 to +390
Contributor:
It seems a bit arbitrary that you only check for the masked-off bits being zero in the SrcSize == DstSize case, since in all three cases the AND could be avoided if they're known to be zero.

As an alternative, why not remove this code and leave it to a later AND combine to remove the AND if it can prove it is redundant?

Author:
Firstly, this was ported from the original combine. I would assume that it is more powerful than the combines below.

Maybe I misunderstand, but for the and to be redundant either the zext or trunc must be a no-op.

Author:
I believe the mask is never zero.

Author:
For the "SrcSize == DstSize" case, the mask is:
APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize));
I don't believe that it could be zero.

Contributor:
> Maybe I misunderstand, but for the and to be redundant either the zext or trunc must be a no-op.

zext and trunc are never no-ops since they always change the bit width.

> I believe the mask is never zero.

I'm not talking about the mask being zero; I'm talking about being able to prove that the masked-off bits of the other value are zero. I.e. (x AND mask) can be combined to x if you can prove (with KnownBits) that every zero bit in mask is also zero in x. That's what the code above is doing for the DstSize == SrcSize case, but there is almost certainly a separate combine that will run afterwards and do it anyway, and do it for all three cases.

Author:
For modularity, I would prefer to leave it to the other combine.

Author:
As I said, the first combine comes from episode I of this combine. The and comes from InstCombine:

// If the sizes are just right we can convert this into a logical
// 'and', which will be much cheaper than the pair of casts.
// If we're actually extending zero bits, then if
//   SrcSize < DstSize: zext(Src & mask)
//   SrcSize == DstSize: Src & mask
//   SrcSize > DstSize: trunc(Src) & mask

if (DstSize == SrcSize) {
// Src & mask.

if (!isLegalOrBeforeLegalizer({TargetOpcode::G_AND, {DstTy}}) ||
!isConstantLegalOrBeforeLegalizer(DstTy))
return false;

// build mask.
APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize));

MatchInfo = [=](MachineIRBuilder &B) {
auto Mask = B.buildConstant(DstTy, AndValue);
B.buildAnd(Dst, Src, Mask);
};
return true;
}

// if (SrcSize < DstSize) {
// // zext(Src & mask).
//
// if (!isLegalOrBeforeLegalizer({TargetOpcode::G_AND, {SrcTy}}) ||
// !isConstantLegalOrBeforeLegalizer(SrcTy) ||
// !isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {DstTy, SrcTy}}))
// return false;
//
// APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize));
//
// MatchInfo = [=](MachineIRBuilder &B) {
// auto Mask = B.buildConstant(SrcTy, AndValue);
// auto And = B.buildAnd(SrcTy, Src, Mask);
// B.buildZExt(Dst, And);
// };
// return true;
// }

// if (SrcSize > DstSize) {
// // trunc(Src) & mask.
//
// if (!isLegalOrBeforeLegalizer({TargetOpcode::G_AND, {DstTy}}) ||
// !isConstantLegalOrBeforeLegalizer(DstTy) ||
// !isLegalOrBeforeLegalizer({TargetOpcode::G_TRUNC, {DstTy, SrcTy}}))
// return false;
//
// APInt AndValue(APInt::getLowBitsSet(DstSize, MidSize));
//
// MatchInfo = [=](MachineIRBuilder &B) {
// auto Mask = B.buildConstant(DstTy, AndValue);
// auto Trunc = B.buildTrunc(DstTy, Src);
// B.buildAnd(Dst, Trunc, Mask);
// };
// return true;
// }

return false;
}
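
A concrete instance of the SrcSize == DstSize case implemented above, matching the updated MIR tests below; a minimal sketch using the real APInt API, with the main() scaffolding added for illustration:

```cpp
#include "llvm/ADT/APInt.h"
#include <cassert>
using namespace llvm;

int main() {
  // zext(s64 (trunc(s32 x:s64))): SrcSize == DstSize == 64, MidSize == 32.
  unsigned SrcSize = 64, MidSize = 32;

  // The mask keeps exactly the bits that survive the trunc/zext round trip.
  APInt AndValue = APInt::getLowBitsSet(SrcSize, MidSize);

  // 0x00000000FFFFFFFF == 4294967295: the G_CONSTANT the updated
  // combine-with-flags.mir checks expect next to the new G_AND.
  assert(AndValue == APInt(64, 0xFFFFFFFFULL));
  return 0;
}
```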
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -168,6 +168,6 @@ def AMDGPUPostLegalizerCombiner: GICombiner<
def AMDGPURegBankCombiner : GICombiner<
"AMDGPURegBankCombinerImpl",
[unmerge_merge, unmerge_cst, unmerge_undef,
zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
int_minmax_to_med3, ptr_add_immed_chain,
fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp]> {
}
@@ -49,8 +49,8 @@ body: |
; CHECK: liveins: $x0, $x1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: %arg1:_(s64) = COPY $x0
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %arg1(s64)
; CHECK-NEXT: %zext:_(s64) = G_ZEXT [[TRUNC]](s32)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
; CHECK-NEXT: %zext:_(s64) = G_AND %arg1, [[C]]
; CHECK-NEXT: $x0 = COPY %zext(s64)
; CHECK-NEXT: RET_ReallyLR implicit $x0
%arg1:_(s64) = COPY $x0
21 changes: 13 additions & 8 deletions llvm/test/CodeGen/AArch64/GlobalISel/combine-with-flags.mir
@@ -10,7 +10,9 @@ body: |
; CHECK: liveins: $w0, $w1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
; CHECK-NEXT: $x1 = COPY [[COPY]](s64)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]]
; CHECK-NEXT: $x1 = COPY [[AND]](s64)
%0:_(s64) = COPY $x0
%2:_(s32) = nuw G_TRUNC %0
%3:_(s64) = G_ZEXT %2
@@ -25,9 +27,9 @@ body: |
; CHECK: liveins: $w0, $w1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = nsw G_TRUNC [[COPY]](s64)
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s32)
; CHECK-NEXT: $x1 = COPY [[ZEXT]](s64)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]]
; CHECK-NEXT: $x1 = COPY [[AND]](s64)
%0:_(s64) = COPY $x0
%2:_(s32) = nsw G_TRUNC %0
%3:_(s64) = G_ZEXT %2
@@ -42,9 +44,9 @@ body: |
; CHECK: liveins: $w0, $w1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s32)
; CHECK-NEXT: $x1 = COPY [[ZEXT]](s64)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]]
; CHECK-NEXT: $x1 = COPY [[AND]](s64)
%0:_(s64) = COPY $x0
%2:_(s32) = G_TRUNC %0
%3:_(s64) = G_ZEXT %2
@@ -300,7 +302,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
; CHECK-NEXT: %sv0:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR [[COPY]](s64)
; CHECK-NEXT: $z0 = COPY %sv0(<vscale x 2 x s64>)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR [[C]](s64)
; CHECK-NEXT: %z:_(<vscale x 2 x s64>) = G_AND %sv0, [[SPLAT_VECTOR]]
; CHECK-NEXT: $z0 = COPY %z(<vscale x 2 x s64>)
%0:_(s64) = COPY $x0
%1:_(s64) = COPY $x1
%sv0:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR %0:_(s64)
@@ -165,9 +165,9 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
; CHECK-NEXT: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[COPY]](p0) :: (load (s8))
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[SEXTLOAD]](s32)
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s8)
; CHECK-NEXT: $w0 = COPY [[ZEXT]](s32)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SEXTLOAD]], [[C]]
; CHECK-NEXT: $w0 = COPY [[AND]](s32)
; CHECK-NEXT: $w1 = COPY [[SEXTLOAD]](s32)
%0:_(p0) = COPY $x0
%1:_(s8) = G_LOAD %0 :: (load (s8))