[Hexagon] Implement isMaskAndCmp0FoldingBeneficial (llvm#166891)

svs-quic · web-flow · commit 4d88bb6c6303 · 2025-11-10T09:48:56.000+05:30
Sink `and` mask to `cmp` use block if it is masking a single bit since
this will fold the `and/cmp/br` into a single `tstbit` instruction.
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -3948,3 +3948,13 @@ HexagonTargetLowering::shouldExpandAtomicCmpXchgInIR(
     AtomicCmpXchgInst *AI) const {
   return AtomicExpansionKind::LLSC;
 }
+
+bool HexagonTargetLowering::isMaskAndCmp0FoldingBeneficial(
+    const Instruction &AndI) const {
+  // Only sink 'and' mask to cmp use block if it is masking a single bit since
+  // this will fold the and/cmp/br into a single tstbit instruction.
+  ConstantInt *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
+  if (!Mask)
+    return false;
+  return Mask->getValue().isPowerOf2();
+}
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h
@@ -160,6 +160,8 @@ class HexagonTargetLowering : public TargetLowering {
 
   bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;
 
+  bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
+
   /// Return true if an FMA operation is faster than a pair of mul and add
   /// instructions. fmuladd intrinsics will be expanded to FMAs when this
   /// method returns true (and FMAs are legal), otherwise fmuladd is
diff --git a/llvm/test/CodeGen/Hexagon/and_mask_cmp0_sink.ll b/llvm/test/CodeGen/Hexagon/and_mask_cmp0_sink.ll
@@ -0,0 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; Test that 'and' mask is sunk to the cmp use block only if it is masking a single bit
+; RUN: llc -march=hexagon --verify-machineinstrs < %s | FileCheck %s
+
+@A = global i32 zeroinitializer
+
+define i32 @and_sink1(i32 %a) {
+; CHECK-LABEL: and_sink1:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    {
+; CHECK-NEXT:     p0 = !tstbit(r0,#11)
+; CHECK-NEXT:     r0 = ##A
+; CHECK-NEXT:    }
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  .LBB0_1: // %bb0
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    {
+; CHECK-NEXT:     if (p0) jump:nt .LBB0_1
+; CHECK-NEXT:     memw(r0+#0) = #0
+; CHECK-NEXT:    }
+; CHECK-NEXT:  // %bb.2: // %bb2
+; CHECK-NEXT:    {
+; CHECK-NEXT:     r0 = #0
+; CHECK-NEXT:     jumpr r31
+; CHECK-NEXT:    }
+  %and = and i32 %a, 2048
+  br label %bb0
+bb0:
+  %cmp = icmp eq i32 %and, 0
+  store i32 0, i32* @A
+  br i1 %cmp, label %bb0, label %bb2
+bb2:
+  ret i32 0
+}
+
+define i32 @and_sink2(i32 %a) {
+; CHECK-LABEL: and_sink2:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    {
+; CHECK-NEXT:     r1 = and(r0,##2049)
+; CHECK-NEXT:     r0 = ##A
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     p0 = cmp.eq(r1,#0)
+; CHECK-NEXT:    }
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  .LBB1_1: // %bb0
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    {
+; CHECK-NEXT:     if (p0) jump:nt .LBB1_1
+; CHECK-NEXT:     memw(r0+#0) = #0
+; CHECK-NEXT:    }
+; CHECK-NEXT:  // %bb.2: // %bb2
+; CHECK-NEXT:    {
+; CHECK-NEXT:     r0 = #0
+; CHECK-NEXT:     jumpr r31
+; CHECK-NEXT:    }
+  %and = and i32 %a, 2049
+  br label %bb0
+bb0:
+  %cmp = icmp eq i32 %and, 0
+  store i32 0, i32* @A
+  br i1 %cmp, label %bb0, label %bb2
+bb2:
+  ret i32 0
+}