resolved comments

jyli0116 · jyli0116 · commit 716eee077c75 · 2025-09-09T09:40:50.000Z
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -81,9 +81,11 @@ struct ShiftOfShiftedLogic {
   uint64_t ValSum;
 };
 
-struct ShiftOfTruncOfShift {
+struct LshrOfTruncOfLshr {
+  bool Mask = false;
+  APInt MaskVal;
   Register Src;
-  uint64_t ShiftAmt;
+  APInt ShiftAmt;
   LLT ShiftAmtTy;
   LLT InnerShiftTy;
 };
@@ -345,11 +347,10 @@ class CombinerHelper {
 
   bool matchCommuteShift(MachineInstr &MI, BuildFnTy &MatchInfo) const;
 
-  /// Fold (shift (trunc (shift x, C1)), C2) -> trunc (shift x, (C1 + C2))
-  bool matchShiftOfTruncOfShift(MachineInstr &MI,
-                                ShiftOfTruncOfShift &MatchInfo) const;
-  void applyShiftOfTruncOfShift(MachineInstr &MI,
-                                ShiftOfTruncOfShift &MatchInfo) const;
+  /// Fold (lshr (trunc (lshr x, C1)), C2) -> trunc (lshr x, (C1 + C2))
+  bool matchLshrOfTruncOfLshr(MachineInstr &MI, LshrOfTruncOfLshr &MatchInfo, MachineInstr &ShiftMI, MachineInstr &TruncMI) const;
+  void applyLshrOfTruncOfLshr(MachineInstr &MI,
+                                LshrOfTruncOfLshr &MatchInfo) const;
 
   /// Transform a multiply by a power-of-2 value to a left shift.
   bool matchCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal) const;
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -396,18 +396,21 @@ def commute_shift : GICombineRule<
          [{ return Helper.matchCommuteShift(*${d}, ${matchinfo}); }]),
   (apply [{ Helper.applyBuildFn(*${d}, ${matchinfo}); }])>;
 
-// Fold (shift (trunc (shift x, C1)), C2) -> trunc (shift x, (C1 + C2))
-def shift_right_op : GICombinePatFrag<
-  (outs root:$dst), (ins),
-  !foreach(op,
-           [G_LSHR, G_ASHR],
-           (pattern (op $dst, $shifted, $amt)))>;
-def shift_of_trunc_of_shift_matchdata : GIDefMatchData<"ShiftOfTruncOfShift">;
-def shift_of_trunc_of_shift : GICombineRule<
-  (defs root:$dst, shift_of_trunc_of_shift_matchdata:$matchinfo),
-  (match (shift_right_op $dst):$root,
-         [{ return Helper.matchShiftOfTruncOfShift(*${root}, ${matchinfo}); }]),
-  (apply [{ Helper.applyShiftOfTruncOfShift(*${root}, ${matchinfo}); }])>;
+// Fold (lshr (trunc (lshr x, C1)), C2) -> trunc (lshr x, (C1 + C2))
+def lshr_of_trunc_of_lshr_matchdata : GIDefMatchData<"LshrOfTruncOfLshr">;
+//def lshr_of_trunc_of_lshr : GICombineRule<
+//  (defs root:$root, lshr_of_trunc_of_lshr_matchdata:$matchinfo),
+//  (match (G_LSHR $dst, $x, $y):$root,
+//         [{ return Helper.matchLshrOfTruncOfLshr(*${root}, ${matchinfo}); }]),
+//  (apply [{ Helper.applyLshrOfTruncOfLshr(*${root}, ${matchinfo}); }])>;
+
+def lshr_of_trunc_of_lshr : GICombineRule<
+  (defs root:$root, lshr_of_trunc_of_lshr_matchdata:$matchinfo),
+  (match (G_LSHR $d1, $x, $y):$Shift,
+         (G_TRUNC $d2, $d1):$Trunc,
+         (G_LSHR $dst, $d2, $z):$root,
+         [{ return Helper.matchLshrOfTruncOfLshr(*${root}, ${matchinfo}, *${Shift}, *${Trunc}); }]),
+  (apply [{ Helper.applyLshrOfTruncOfLshr(*${root}, ${matchinfo}); }])>;
 
 def narrow_binop_feeding_and : GICombineRule<
   (defs root:$root, build_fn_matchinfo:$matchinfo),
@@ -2147,7 +2150,7 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines,
     simplify_neg_minmax, combine_concat_vector,
     sext_trunc, zext_trunc, prefer_sign_combines, shuffle_combines,
     combine_use_vector_truncate, merge_combines, overflow_combines, 
-    truncsat_combines, shift_of_trunc_of_shift]>;
+    truncsat_combines, lshr_of_trunc_of_lshr]>;
 
 // A combine group used to for prelegalizer combiners at -O0. The combines in
 // this group have been selected based on experiments to balance code size and
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -2094,57 +2094,63 @@ bool CombinerHelper::matchCommuteShift(MachineInstr &MI,
   return true;
 }
 
-bool CombinerHelper::matchShiftOfTruncOfShift(
-    MachineInstr &MI, ShiftOfTruncOfShift &MatchInfo) const {
+bool CombinerHelper::matchLshrOfTruncOfLshr(
+    MachineInstr &MI, LshrOfTruncOfLshr &MatchInfo, MachineInstr &ShiftMI, MachineInstr &TruncMI) const {
   unsigned ShiftOpcode = MI.getOpcode();
-  assert(ShiftOpcode == TargetOpcode::G_LSHR ||
-         ShiftOpcode == TargetOpcode::G_ASHR);
+  assert(ShiftOpcode == TargetOpcode::G_LSHR);
 
   Register N0 = MI.getOperand(1).getReg();
   Register N1 = MI.getOperand(2).getReg();
   unsigned OpSizeInBits = MRI.getType(N0).getScalarSizeInBits();
 
-  APInt N1C;
-  Register InnerShift;
-  if (!mi_match(N1, MRI, m_ICstOrSplat(N1C)) ||
-      !mi_match(N0, MRI, m_GTrunc(m_Reg(InnerShift))))
+  APInt N1C, N001C;
+  if (!mi_match(N1, MRI, m_ICstOrSplat(N1C)))
     return false;
-
-  auto *InnerMI = MRI.getVRegDef(InnerShift);
-  if (InnerMI->getOpcode() != ShiftOpcode)
-    return false;
-
-  APInt N001C;
-  auto N001 = InnerMI->getOperand(2).getReg();
+  auto N001 = ShiftMI.getOperand(2).getReg();
   if (!mi_match(N001, MRI, m_ICstOrSplat(N001C)))
     return false;
 
-  uint64_t c1 = N001C.getZExtValue();
-  uint64_t c2 = N1C.getZExtValue();
+  if (N001C.getBitWidth() > N1C.getBitWidth())
+    N1C = N1C.zext(N001C.getBitWidth());
+  else
+    N001C = N001C.zext(N1C.getBitWidth());
+
+  Register InnerShift = ShiftMI.getOperand(0).getReg();
   LLT InnerShiftTy = MRI.getType(InnerShift);
   uint64_t InnerShiftSize = InnerShiftTy.getScalarSizeInBits();
-  if (!(c1 + OpSizeInBits == InnerShiftSize) || !(c1 + c2 < InnerShiftSize))
-    return false;
+  if ((N1C + N001C).ult(InnerShiftSize)) {
+    MatchInfo.Src = ShiftMI.getOperand(1).getReg();
+    MatchInfo.ShiftAmt = N1C + N001C;
+    MatchInfo.ShiftAmtTy = MRI.getType(N001);
+    MatchInfo.InnerShiftTy = InnerShiftTy;    
 
-  MatchInfo.Src = InnerMI->getOperand(1).getReg();
-  MatchInfo.ShiftAmt = c1 + c2;
-  MatchInfo.ShiftAmtTy = MRI.getType(N001);
-  MatchInfo.InnerShiftTy = InnerShiftTy;
-  return true;
+    if ((N001C + OpSizeInBits) == InnerShiftSize)
+      return true;
+    if (MRI.hasOneUse(N0) && MRI.hasOneUse(InnerShift)) {
+      MatchInfo.Mask = true;
+      MatchInfo.MaskVal = APInt(N1C.getBitWidth(), OpSizeInBits) - N1C;
+      return true;
+    }
+  }
+  return false;
 }
 
-void CombinerHelper::applyShiftOfTruncOfShift(
-    MachineInstr &MI, ShiftOfTruncOfShift &MatchInfo) const {
+void CombinerHelper::applyLshrOfTruncOfLshr(
+    MachineInstr &MI, LshrOfTruncOfLshr &MatchInfo) const {
   unsigned ShiftOpcode = MI.getOpcode();
-  assert(ShiftOpcode == TargetOpcode::G_LSHR ||
-         ShiftOpcode == TargetOpcode::G_ASHR);
+  assert(ShiftOpcode == TargetOpcode::G_LSHR);
 
   Register Dst = MI.getOperand(0).getReg();
   auto ShiftAmt =
       Builder.buildConstant(MatchInfo.ShiftAmtTy, MatchInfo.ShiftAmt);
-  auto Shift = Builder.buildInstr(ShiftOpcode, {MatchInfo.InnerShiftTy},
-                                  {MatchInfo.Src, ShiftAmt});
-  Builder.buildTrunc(Dst, Shift);
+  auto Shift = Builder.buildLShr(MatchInfo.InnerShiftTy, MatchInfo.Src, ShiftAmt);
+  if (MatchInfo.Mask == true) {
+    APInt MaskVal = APInt::getLowBitsSet(MatchInfo.InnerShiftTy.getScalarSizeInBits(), MatchInfo.MaskVal.getZExtValue());
+    auto Mask = Builder.buildConstant(MatchInfo.InnerShiftTy, MaskVal); // mask is ANDed with the wide shift result, so it takes that type, not the shift-amount type
+    auto And = Builder.buildAnd(MatchInfo.InnerShiftTy, Shift, Mask);
+    Builder.buildTrunc(Dst, And);
+  } else
+    Builder.buildTrunc(Dst, Shift);
   MI.eraseFromParent();
 }
 
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -369,5 +369,5 @@ def AArch64PostLegalizerCombiner
                         commute_constant_to_rhs, extract_vec_elt_combines,
                         push_freeze_to_prevent_poison_from_propagating,
                         combine_mul_cmlt, combine_use_vector_truncate, 
-                        extmultomull, truncsat_combines, shift_of_trunc_of_shift]> {
+                        extmultomull, truncsat_combines, lshr_of_trunc_of_lshr]> {
 }
diff --git a/llvm/test/CodeGen/AArch64/combine-sdiv.ll b/llvm/test/CodeGen/AArch64/combine-sdiv.ll
@@ -1684,14 +1684,24 @@ define i32 @combine_i32_sdiv_const7(i32 %x) {
 }
 
 define i32 @combine_i32_sdiv_const100(i32 %x) {
-; CHECK-LABEL: combine_i32_sdiv_const100:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #34079 // =0x851f
-; CHECK-NEXT:    movk w8, #20971, lsl #16
-; CHECK-NEXT:    smull x8, w0, w8
-; CHECK-NEXT:    asr x8, x8, #37
-; CHECK-NEXT:    add w0, w8, w8, lsr #31
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: combine_i32_sdiv_const100:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    smull x8, w0, w8
+; CHECK-SD-NEXT:    asr x8, x8, #37
+; CHECK-SD-NEXT:    add w0, w8, w8, lsr #31
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: combine_i32_sdiv_const100:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-GI-NEXT:    movk w8, #20971, lsl #16
+; CHECK-GI-NEXT:    smull x8, w0, w8
+; CHECK-GI-NEXT:    asr x8, x8, #32
+; CHECK-GI-NEXT:    asr w8, w8, #5
+; CHECK-GI-NEXT:    add w0, w8, w8, lsr #31
+; CHECK-GI-NEXT:    ret
   %1 = sdiv i32 %x, 100
   ret i32 %1
 }
diff --git a/llvm/test/CodeGen/AArch64/lshr-trunc-lshr.ll b/llvm/test/CodeGen/AArch64/lshr-trunc-lshr.ll
@@ -0,0 +1,125 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-none-eabi -global-isel=0 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64-none-eabi -global-isel=1 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+define i32 @s32_test1(i64 %a) {
+; CHECK-LABEL: s32_test1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsr x0, x0, #48
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %r = lshr i64 %a, 32
+  %ret = trunc i64 %r to i32
+  %x = lshr i32 %ret, 16
+  ret i32 %x
+}
+
+define i32 @s32_test2(i64 %a) {
+; CHECK-LABEL: s32_test2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ubfx x0, x0, #32, #16
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %r = lshr i64 %a, 16
+  %ret = trunc i64 %r to i32
+  %x = lshr i32 %ret, 16
+  ret i32 %x
+}
+
+define <8 x i8> @v8s8_test1(<8 x i16> %a) {
+; CHECK-LABEL: v8s8_test1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushr v0.8h, v0.8h, #12
+; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    ret
+  %r = lshr <8 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %ret = trunc <8 x i16> %r to <8 x i8>
+  %x = lshr <8 x i8> %ret, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
+  ret <8 x i8> %x
+}
+
+define <8 x i8> @v8s8_test2(<8 x i16> %a) {
+; CHECK-SD-LABEL: v8s8_test2:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ushr v0.8h, v0.8h, #8
+; CHECK-SD-NEXT:    bic v0.8h, #240
+; CHECK-SD-NEXT:    xtn v0.8b, v0.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v8s8_test2:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.8h, #15
+; CHECK-GI-NEXT:    ushr v0.8h, v0.8h, #8
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
+; CHECK-GI-NEXT:    ret
+  %r = lshr <8 x i16> %a, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
+  %ret = trunc <8 x i16> %r to <8 x i8>
+  %x = lshr <8 x i8> %ret, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
+  ret <8 x i8> %x
+}
+
+define <4 x i16> @v4s16_test1(<4 x i32> %a) {
+; CHECK-LABEL: v4s16_test1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushr v0.4s, v0.4s, #24
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+  %r = lshr <4 x i32> %a, <i32 16, i32 16, i32 16, i32 16>
+  %ret = trunc <4 x i32> %r to <4 x i16>
+  %x = lshr <4 x i16> %ret, <i16 8, i16 8, i16 8, i16 8>
+  ret <4 x i16> %x
+}
+
+define <4 x i16> @v4s16_test2(<4 x i32> %a) {
+; CHECK-SD-LABEL: v4s16_test2:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    shrn v0.4h, v0.4s, #16
+; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v4s16_test2:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0x0000ff000000ff
+; CHECK-GI-NEXT:    ushr v0.4s, v0.4s, #16
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
+  %r = lshr <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
+  %ret = trunc <4 x i32> %r to <4 x i16>
+  %x = lshr <4 x i16> %ret, <i16 8, i16 8, i16 8, i16 8>
+  ret <4 x i16> %x
+}
+
+define <2 x i32> @v2s32_test1(<2 x i64> %a) {
+; CHECK-LABEL: v2s32_test1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushr v0.2d, v0.2d, #48
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
+  %r = lshr <2 x i64> %a, <i64 32, i64 32>
+  %ret = trunc <2 x i64> %r to <2 x i32>
+  %x = lshr <2 x i32> %ret, <i32 16, i32 16>
+  ret <2 x i32> %x
+}
+
+define <2 x i32> @v2s32_test2(<2 x i64> %a) {
+; CHECK-SD-LABEL: v2s32_test2:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi d1, #0x00ffff0000ffff
+; CHECK-SD-NEXT:    shrn v0.2s, v0.2d, #32
+; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v2s32_test2:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0x0000000000ffff
+; CHECK-GI-NEXT:    ushr v0.2d, v0.2d, #32
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    xtn v0.2s, v0.2d
+; CHECK-GI-NEXT:    ret
+  %r = lshr <2 x i64> %a, <i64 16, i64 16>
+  %ret = trunc <2 x i64> %r to <2 x i32>
+  %x = lshr <2 x i32> %ret, <i32 16, i32 16>
+  ret <2 x i32> %x
+}
diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll

Original file line number	Diff line number	Diff line change
`@@ -369,5 +369,5 @@ def AArch64PostLegalizerCombiner`
`369`	`369`	`commute_constant_to_rhs, extract_vec_elt_combines,`
`370`	`370`	`push_freeze_to_prevent_poison_from_propagating,`
`371`	`371`	`combine_mul_cmlt, combine_use_vector_truncate,`
`372`		`- extmultomull, truncsat_combines, shift_of_trunc_of_shift]> {`
	`372`	`+ extmultomull, truncsat_combines, lshr_of_trunc_of_lshr]> {`
`373`	`373`	`}`