
Commit 0dddfab

[SLP]Recalculate deps if the original instruction is scheduled after being copyable
If the original instruction is going to be scheduled after the same instruction has already been scheduled as a copyable element, the dependencies need to be recalculated. Otherwise, they may be computed incorrectly.
1 parent 675b01a commit 0dddfab
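
To make the guard added in the SLPVectorizer.cpp hunk below easier to follow, here is a minimal, self-contained C++ sketch of the decision it encodes. This is not the actual BoUpSLP scheduling API: the struct, its fields, and mustRecalculateDeps are hypothetical stand-ins for the real queries (hasValidDependencies(), isCopyableElement(), getScheduleCopyableData(), and the user tree-entry checks).

// Hypothetical paraphrase of the added guard; the real pass queries scheduling
// state on BoUpSLP, while this sketch reduces each query to a plain flag.
#include <cstdio>

struct SchedNodeState {
  bool HasValidDependencies;    // dependencies were already computed for this node
  bool CopyableInCurrentBundle; // the current bundle treats the instruction as copyable
  bool HasCopyableSchedData;    // the instruction was already scheduled as copyable elsewhere
  bool UserTreatsAsCopyable;    // the user tree entry also models it as copyable
};

// True when previously computed direct dependencies may be stale and must be
// dropped (clearDirectDependencies() in the patch) so they get recalculated.
static bool mustRecalculateDeps(const SchedNodeState &S) {
  return S.HasValidDependencies && !S.CopyableInCurrentBundle &&
         S.HasCopyableSchedData && !S.UserTreatsAsCopyable;
}

int main() {
  // The scenario from the commit message: deps were computed while the same
  // instruction was scheduled as copyable, and now it is scheduled as itself.
  SchedNodeState S{true, false, true, false};
  std::printf("recalculate deps: %s\n", mustRecalculateDeps(S) ? "yes" : "no");
  return 0;
}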

2 files changed: +97 -0 lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 8 additions & 0 deletions
@@ -20788,6 +20788,14 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
       continue;
     }
     auto *SD = cast<ScheduleData>(SE);
+    if (SD->hasValidDependencies() &&
+        (!S.areInstructionsWithCopyableElements() ||
+         !S.isCopyableElement(SD->getInst())) &&
+        !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE &&
+        EI.UserTE->hasState() &&
+        (!EI.UserTE->hasCopyableElements() ||
+         !EI.UserTE->isCopyableElement(SD->getInst())))
+      SD->clearDirectDependencies();
     for (const Use &U : SD->getInst()->operands()) {
       unsigned &NumOps =
           UserOpToNumOps
Lines changed: 89 additions & 0 deletions
@@ -0,0 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-10 < %s | FileCheck %s
+
+define void @test(ptr %0, i32 %1, i32 %2) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 48
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP0]], i64 56
+; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP2]], [[TMP1]]
+; CHECK-NEXT: [[ADD_NARROWED_I_I:%.*]] = shl i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP10:%.*]] = lshr i32 [[TMP7]], 1
+; CHECK-NEXT: [[TMP18:%.*]] = zext i32 [[ADD_NARROWED_I_I]] to i64
+; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[TMP18]], -1
+; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP19]] to i32
+; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x i32> poison, i32 [[TMP21]], i32 0
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP28]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = and <2 x i32> [[TMP11]], splat (i32 -2)
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> <i32 poison, i32 -2>, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP14:%.*]] = or <2 x i32> [[TMP13]], [[TMP12]]
+; CHECK-NEXT: [[TMP15:%.*]] = xor <2 x i32> [[TMP13]], [[TMP12]]
+; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP17:%.*]] = load <2 x i32>, ptr [[TMP5]], align 8
+; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x i32> <i32 1, i32 poison>, i32 [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP33:%.*]] = and <2 x i32> [[TMP17]], [[TMP32]]
+; CHECK-NEXT: call void @llvm.stackrestore.p0(ptr null)
+; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x i32> [[TMP33]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT: [[TMP34:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP10]], i32 0
+; CHECK-NEXT: [[TMP22:%.*]] = zext <2 x i32> [[TMP34]] to <2 x i64>
+; CHECK-NEXT: [[TMP23:%.*]] = zext <2 x i32> [[TMP33]] to <2 x i64>
+; CHECK-NEXT: [[TMP35:%.*]] = shl <2 x i64> [[TMP23]], splat (i64 1)
+; CHECK-NEXT: [[TMP25:%.*]] = or <2 x i64> [[TMP35]], [[TMP22]]
+; CHECK-NEXT: [[TMP26:%.*]] = trunc <2 x i64> [[TMP25]] to <2 x i32>
+; CHECK-NEXT: [[TMP27:%.*]] = trunc <2 x i64> [[TMP25]] to <2 x i32>
+; CHECK-NEXT: [[TMP24:%.*]] = tail call i32 asm sideeffect "", "=r,0,~{dirflag},~{fpsr},~{flags}"(i32 0)
+; CHECK-NEXT: store <2 x i32> [[TMP16]], ptr [[TMP3]], align 16
+; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <2 x i32> [[TMP32]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP30:%.*]] = and <2 x i32> [[TMP29]], [[TMP26]]
+; CHECK-NEXT: [[TMP31:%.*]] = or <2 x i32> [[TMP30]], [[TMP27]]
+; CHECK-NEXT: store <2 x i32> [[TMP31]], ptr [[TMP5]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+  %3 = getelementptr i8, ptr %0, i64 48
+  %4 = getelementptr i8, ptr %0, i64 52
+  %5 = getelementptr i8, ptr %0, i64 56
+  %6 = getelementptr i8, ptr %0, i64 60
+  %.pre21.i = load i32, ptr %5, align 8
+  %.pre23.i = load i32, ptr %6, align 4
+  %7 = and i32 %2, %1
+  %8 = and i32 %.pre21.i, 1
+  %9 = and i32 %1, %.pre23.i
+  call void @llvm.stackrestore.p0(ptr null)
+  %add.narrowed.i.i = shl i32 %1, 1
+  %10 = lshr i32 %7, 1
+  %11 = zext i32 %10 to i64
+  %12 = zext i32 %8 to i64
+  %reass.add1.i = shl i64 %12, 1
+  %13 = or i64 %reass.add1.i, %11
+  %14 = trunc i64 %13 to i32
+  %15 = zext i32 %9 to i64
+  %reass.add2.i = shl i64 %15, 1
+  %16 = or i64 %reass.add2.i, %12
+  %17 = trunc i64 %16 to i32
+  %18 = zext i32 %add.narrowed.i.i to i64
+  %19 = add i64 %18, -1
+  %20 = trunc i64 %19 to i32
+  %21 = trunc i64 %19 to i32
+  %22 = trunc i64 %13 to i32
+  %23 = trunc i64 %16 to i32
+  %24 = tail call i32 asm sideeffect "", "=r,0,~{dirflag},~{fpsr},~{flags}"(i32 0)
+  %25 = and i32 %20, -2
+  %26 = or i32 %1, %25
+  store i32 %26, ptr %3, align 16
+  %27 = and i32 %21, -2
+  %28 = xor i32 %27, -2
+  store i32 %28, ptr %4, align 4
+  %29 = and i32 %1, %14
+  %30 = or i32 %29, %22
+  store i32 %30, ptr %5, align 8
+  %31 = and i32 %1, %17
+  %32 = or i32 %31, %23
+  store i32 %32, ptr %6, align 4
+  ret void
+}
+
+declare void @llvm.stackrestore.p0(ptr) #0
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn }
