Skip to content

Conversation

@ostannard
Copy link
Collaborator

There were two bugs in the implementation of the MVE vsbciq (subtract with carry across vector, with initial carry value) intrinsics:

  • The VSBCI instruction behaves as if the carry-in is always set, but we were selecting it when the carry-in was clear.
  • The vsbciq intrinsics should generate IR with the carry-in set, but they were leaving it clear.

These two bugs almost cancelled each other out, but they resulted in incorrect code when the vsbcq intrinsics (with a carry-in) were used and the carry-in was a compile-time constant.

The MVE VSBCI instruction should be selected when the carry-in is set,
unlike the VADCI instruction which should be selected when the carry-in
is zero. This was already implemented in the code, but the function was
always called with Add=1, even for the subtract instructions.
The VSBCI instruction behaves as if the carry flag input is set, unlike
the VADCI instruction which behaves as if it is clear. The IR intrinsics
take and return a carry flag in the format expected in FPSCR, with the
carry flag in bit 29, so we need to match that in clang.
@graphite-app
Copy link

graphite-app bot commented Dec 2, 2024

Your org has enabled the Graphite merge queue for merging into main

Add the label “FP Bundles” to the PR and Graphite will automatically add it to the merge queue when it’s ready to merge.

You must have a Graphite account and log in to Graphite in order to use the merge queue. Sign up using this link.

@llvmbot llvmbot added clang Clang issues not falling into any other category clang:frontend Language frontend issues, e.g. anything involving "Sema" labels Dec 2, 2024
@llvmbot
Copy link
Member

llvmbot commented Dec 2, 2024

@llvm/pr-subscribers-clang

@llvm/pr-subscribers-backend-arm

Author: Oliver Stannard (ostannard)

Changes

There were two bugs in the implementation of the MVE vsbciq (subtract with carry across vector, with initial carry value) intrinsics:

  • The VSBCI instruction behaves as if the carry-in is always set, but we were selecting it when the carry-in was clear.
  • The vsbciq intrinsics should generate IR with the carry-in set, but they were leaving it clear.

These two bugs almost cancelled each other out, but they resulted in incorrect code when the vsbcq intrinsics (with a carry-in) were used and the carry-in was a compile-time constant.


Full diff: https://github.com/llvm/llvm-project/pull/118284.diff

5 Files Affected:

  • (modified) clang/include/clang/Basic/arm_mve.td (+5-5)
  • (modified) clang/test/CodeGen/arm-mve-intrinsics/vadc.c (+4-4)
  • (modified) llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp (+1-1)
  • (modified) llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc.ll (+90-2)
  • (modified) llvm/test/CodeGen/Thumb2/mve-vadc-vsbc-spill.ll (+1-1)
diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td
index 93abbc47c54dd5..6dd8c52ddfd772 100644
--- a/clang/include/clang/Basic/arm_mve.td
+++ b/clang/include/clang/Basic/arm_mve.td
@@ -1270,13 +1270,13 @@ defm sqrshr: ScalarSaturatingShiftReg<s32, s64>;
 def lsll: LongScalarShift<u64, (args s32:$sh), (IRInt<"lsll"> $lo, $hi, $sh)>;
 def asrl: LongScalarShift<s64, (args s32:$sh), (IRInt<"asrl"> $lo, $hi, $sh)>;
 
-multiclass vadcsbc {
+multiclass vadcsbc<dag initial_carry_in> {
   def q: Intrinsic<Vector, (args Vector:$a, Vector:$b, Ptr<uint>:$carry),
       (seq (IRInt<NAME, [Vector]> $a, $b, (shl (load $carry), 29)):$pair,
            (store (and 1, (lshr (xval $pair, 1), 29)), $carry),
            (xval $pair, 0))>;
   def iq: Intrinsic<Vector, (args Vector:$a, Vector:$b, Ptr<uint>:$carry),
-      (seq (IRInt<NAME, [Vector]> $a, $b, 0):$pair,
+      (seq (IRInt<NAME, [Vector]> $a, $b, initial_carry_in):$pair,
            (store (and 1, (lshr (xval $pair, 1), 29)), $carry),
            (xval $pair, 0))>;
   def q_m: Intrinsic<Vector, (args Vector:$inactive, Vector:$a, Vector:$b,
@@ -1288,13 +1288,13 @@ multiclass vadcsbc {
   def iq_m: Intrinsic<Vector, (args Vector:$inactive, Vector:$a, Vector:$b,
                                     Ptr<uint>:$carry, Predicate:$pred),
       (seq (IRInt<NAME # "_predicated", [Vector, Predicate]> $inactive, $a, $b,
-               0, $pred):$pair,
+               initial_carry_in, $pred):$pair,
            (store (and 1, (lshr (xval $pair, 1), 29)), $carry),
            (xval $pair, 0))>;
 }
 let params = T.Int32 in {
-  defm vadc: vadcsbc;
-  defm vsbc: vadcsbc;
+  defm vadc: vadcsbc<(u32 0)>;
+  defm vsbc: vadcsbc<(shl 1, 29)>;
 }
 
 let params = T.Int in {
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vadc.c b/clang/test/CodeGen/arm-mve-intrinsics/vadc.c
index 21087b83300c88..29e067d65c6c89 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vadc.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vadc.c
@@ -92,7 +92,7 @@ int32x4_t test_vadcq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, unsigne
 
 // CHECK-LABEL: @test_vsbciq_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0)
+// CHECK-NEXT:    [[TMP0:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 536870912)
 // CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP0]], 1
 // CHECK-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP1]], 29
 // CHECK-NEXT:    [[TMP3:%.*]] = and i32 1, [[TMP2]]
@@ -110,7 +110,7 @@ int32x4_t test_vsbciq_s32(int32x4_t a, int32x4_t b, unsigned *carry_out) {
 
 // CHECK-LABEL: @test_vsbciq_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0)
+// CHECK-NEXT:    [[TMP0:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 536870912)
 // CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP0]], 1
 // CHECK-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP1]], 29
 // CHECK-NEXT:    [[TMP3:%.*]] = and i32 1, [[TMP2]]
@@ -170,7 +170,7 @@ uint32x4_t test_vsbcq_u32(uint32x4_t a, uint32x4_t b, unsigned *carry) {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 536870912, <4 x i1> [[TMP1]])
 // CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP2]], 1
 // CHECK-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 29
 // CHECK-NEXT:    [[TMP5:%.*]] = and i32 1, [[TMP4]]
@@ -190,7 +190,7 @@ int32x4_t test_vsbciq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, unsign
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 536870912, <4 x i1> [[TMP1]])
 // CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP2]], 1
 // CHECK-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 29
 // CHECK-NEXT:    [[TMP5:%.*]] = and i32 1, [[TMP4]]
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 73ee8cf81adcd6..9714370ec7cc8e 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -5229,7 +5229,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
       return;
     case Intrinsic::arm_mve_vsbc:
     case Intrinsic::arm_mve_vsbc_predicated:
-      SelectMVE_VADCSBC(N, ARM::MVE_VSBC, ARM::MVE_VSBCI, true,
+      SelectMVE_VADCSBC(N, ARM::MVE_VSBC, ARM::MVE_VSBCI, false,
                         IntNo == Intrinsic::arm_mve_vsbc_predicated);
       return;
     case Intrinsic::arm_mve_vshlc:
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc.ll
index 967e8a94900e16..2b4c478dfe3e6a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc.ll
@@ -108,7 +108,7 @@ define arm_aapcs_vfpcc <4 x i32> @test_vsbciq_s32(<4 x i32> %a, <4 x i32> %b, pt
 ; CHECK-NEXT:    str r1, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a, <4 x i32> %b, i32 0)
+  %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a, <4 x i32> %b, i32 536870912)
   %1 = extractvalue { <4 x i32>, i32 } %0, 1
   %2 = lshr i32 %1, 29
   %3 = and i32 %2, 1
@@ -125,6 +125,46 @@ define arm_aapcs_vfpcc <4 x i32> @test_vsbciq_u32(<4 x i32> %a, <4 x i32> %b, pt
 ; CHECK-NEXT:    ubfx r1, r1, #29, #1
 ; CHECK-NEXT:    str r1, [r0]
 ; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a, <4 x i32> %b, i32 536870912)
+  %1 = extractvalue { <4 x i32>, i32 } %0, 1
+  %2 = lshr i32 %1, 29
+  %3 = and i32 %2, 1
+  store i32 %3, ptr %carry_out, align 4
+  %4 = extractvalue { <4 x i32>, i32 } %0, 0
+  ret <4 x i32> %4
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vsbcq_s32_carry_in_zero(<4 x i32> %a, <4 x i32> %b, ptr nocapture %carry_out) {
+; CHECK-LABEL: test_vsbcq_s32_carry_in_zero:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    movs r1, #0
+; CHECK-NEXT:    vmsr fpscr_nzcvqc, r1
+; CHECK-NEXT:    vsbc.i32 q0, q0, q1
+; CHECK-NEXT:    vmrs r1, fpscr_nzcvqc
+; CHECK-NEXT:    ubfx r1, r1, #29, #1
+; CHECK-NEXT:    str r1, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a, <4 x i32> %b, i32 0)
+  %1 = extractvalue { <4 x i32>, i32 } %0, 1
+  %2 = lshr i32 %1, 29
+  %3 = and i32 %2, 1
+  store i32 %3, ptr %carry_out, align 4
+  %4 = extractvalue { <4 x i32>, i32 } %0, 0
+  ret <4 x i32> %4
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vsbcq_u32_carry_in_zero(<4 x i32> %a, <4 x i32> %b, ptr nocapture %carry_out) {
+; CHECK-LABEL: test_vsbcq_u32_carry_in_zero:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    movs r1, #0
+; CHECK-NEXT:    vmsr fpscr_nzcvqc, r1
+; CHECK-NEXT:    vsbc.i32 q0, q0, q1
+; CHECK-NEXT:    vmrs r1, fpscr_nzcvqc
+; CHECK-NEXT:    ubfx r1, r1, #29, #1
+; CHECK-NEXT:    str r1, [r0]
+; CHECK-NEXT:    bx lr
 entry:
   %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a, <4 x i32> %b, i32 0)
   %1 = extractvalue { <4 x i32>, i32 } %0, 1
@@ -196,7 +236,7 @@ define arm_aapcs_vfpcc <4 x i32> @test_vsbciq_m_s32(<4 x i32> %inactive, <4 x i3
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
-  %2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1)
+  %2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 536870912, <4 x i1> %1)
   %3 = extractvalue { <4 x i32>, i32 } %2, 1
   %4 = lshr i32 %3, 29
   %5 = and i32 %4, 1
@@ -215,6 +255,54 @@ define arm_aapcs_vfpcc <4 x i32> @test_vsbciq_m_u32(<4 x i32> %inactive, <4 x i3
 ; CHECK-NEXT:    ubfx r1, r1, #29, #1
 ; CHECK-NEXT:    str r1, [r0]
 ; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 536870912, <4 x i1> %1)
+  %3 = extractvalue { <4 x i32>, i32 } %2, 1
+  %4 = lshr i32 %3, 29
+  %5 = and i32 %4, 1
+  store i32 %5, ptr %carry_out, align 4
+  %6 = extractvalue { <4 x i32>, i32 } %2, 0
+  ret <4 x i32> %6
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vsbcq_m_s32_carry_in_zero(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, ptr nocapture %carry_out, i16 zeroext %p) {
+; CHECK-LABEL: test_vsbcq_m_s32_carry_in_zero:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    movs r2, #0
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vmsr fpscr_nzcvqc, r2
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vsbct.i32 q0, q1, q2
+; CHECK-NEXT:    vmrs r1, fpscr_nzcvqc
+; CHECK-NEXT:    ubfx r1, r1, #29, #1
+; CHECK-NEXT:    str r1, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1)
+  %3 = extractvalue { <4 x i32>, i32 } %2, 1
+  %4 = lshr i32 %3, 29
+  %5 = and i32 %4, 1
+  store i32 %5, ptr %carry_out, align 4
+  %6 = extractvalue { <4 x i32>, i32 } %2, 0
+  ret <4 x i32> %6
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vsbcq_m_u32_carry_in_zero(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, ptr nocapture %carry_out, i16 zeroext %p) {
+; CHECK-LABEL: test_vsbcq_m_u32_carry_in_zero:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    movs r2, #0
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vmsr fpscr_nzcvqc, r2
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vsbct.i32 q0, q1, q2
+; CHECK-NEXT:    vmrs r1, fpscr_nzcvqc
+; CHECK-NEXT:    ubfx r1, r1, #29, #1
+; CHECK-NEXT:    str r1, [r0]
+; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
diff --git a/llvm/test/CodeGen/Thumb2/mve-vadc-vsbc-spill.ll b/llvm/test/CodeGen/Thumb2/mve-vadc-vsbc-spill.ll
index 7fe8e94589a0c5..8a0134a4c5ff29 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vadc-vsbc-spill.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vadc-vsbc-spill.ll
@@ -59,7 +59,7 @@ define void @sub_256(<4 x i32> %a_low, <4 x i32> %a_high, <4 x i32> %b_low, <4 x
 ; CHECK-NEXT:    pop.w {r7, lr}
 ; CHECK-NEXT:    b use_int32x4_t
 entry:
-  %adc_low = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a_low, <4 x i32> %b_low, i32 0)
+  %adc_low = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a_low, <4 x i32> %b_low, i32 536870912)
   %carry = extractvalue { <4 x i32>, i32 } %adc_low, 1
   %result_low = extractvalue { <4 x i32>, i32 } %adc_low, 0
   tail call void @use_int32x4_t(<4 x i32> %result_low)

@ostannard ostannard merged commit f893b47 into llvm:main Dec 6, 2024
13 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

backend:ARM clang:frontend Language frontend issues, e.g. anything involving "Sema" clang Clang issues not falling into any other category miscompilation

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants