llvm · abhishek-kaushik22 · Jan 2, 2025 · Jan 2, 2025 · Jan 2, 2025 · Jan 2, 2025
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -60832,3 +60832,24 @@ Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
     return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
   return TargetLowering::getPrefLoopAlignment();
 }
+
+bool X86TargetLowering::shouldSimplifyDemandedVectorElts(
+    SDValue Op, const TargetLoweringOpt &TLO) const {
+  if (Op.getOpcode() == ISD::VECTOR_SHUFFLE) {
+    SDValue V0 = peekThroughBitcasts(Op.getOperand(0));
+    SDValue V1 = peekThroughBitcasts(Op.getOperand(1));
+
+    if (V0.getOpcode() == ISD::MUL || V1.getOpcode() == ISD::MUL) {
+      SDNode *Mul = V0.getOpcode() == ISD::MUL ? V0.getNode() : V1.getNode();
+      SelectionDAG &DAG = TLO.DAG;
+      const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
+      const SDLoc DL(Mul);
+
+      if (SDValue V = combineMulToPMULDQ(Mul, DL, DAG, Subtarget)) {
+        DAG.ReplaceAllUsesWith(Mul, V.getNode());
+        return false;
+      }
+    }
+  }
+  return true;
+}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1207,6 +1207,9 @@ namespace llvm {
 
     bool hasBitTest(SDValue X, SDValue Y) const override;
 
+    bool shouldSimplifyDemandedVectorElts(
+        SDValue Op, const TargetLoweringOpt &TLO) const override;
+
     bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
         SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
         unsigned OldShiftOpcode, unsigned NewShiftOpcode,

diff --git a/llvm/test/CodeGen/X86/pr121456.ll b/llvm/test/CodeGen/X86/pr121456.ll
@@ -0,0 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512dq -O3 | FileCheck %s
+
+define <8 x i64> @pr121456(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: pr121456:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpmuludq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    vpsllq $32, %zmm0, %zmm0
+; CHECK-NEXT:    retq
+entry:
+  %0 = and <8 x i64> %a, splat (i64 4294967295)
+  %1 = and <8 x i64> %b, splat (i64 4294967295)
+  %2 = mul nuw <8 x i64> %1, %0
+  %3 = bitcast <8 x i64> %2 to <16 x i32>
+  %4 = shufflevector <16 x i32> <i32 0, i32 poison, i32 0, i32 poison, i32 0, i32 poison, i32 0, i32 poison, i32 0, i32 poison, i32 0, i32 poison, i32 0, i32 poison, i32 0, i32 poison>, <16 x i32> %3, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+  %5 = bitcast <16 x i32> %4 to <8 x i64>
+  ret <8 x i64> %5
+}