[ARM] Fix instruction selection for MVE vsbciq intrinsic #118284
The MVE VSBCI instruction should be selected when the carry-in is set, unlike the VADCI instruction, which should be selected when the carry-in is zero. This logic was already present in SelectMVE_VADCSBC, but the function was always called with Add=1, even for the subtract intrinsics.
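As a rough sketch of that selection rule (the helper below is illustrative only, not the actual body of SelectMVE_VADCSBC; only the Add-flag condition is the point):

```cpp
#include <cstdint>

// Illustrative sketch, not the real LLVM code. The "I" forms
// (VADCI/VSBCI) ignore the incoming carry: VADCI behaves as if the
// carry-in is clear, VSBCI as if it is set. So the I form is only a
// valid selection when the constant carry-in operand already has that
// value in bit 29 (the FPSCR carry position).
static unsigned chooseOpcode(bool Add, bool CarryInIsConstant,
                             uint32_t CarryIn, unsigned OpcodeWithCarry,
                             unsigned OpcodeWithNoCarry) {
  const uint32_t CarryBit = UINT32_C(1) << 29; // FPSCR carry flag
  if (CarryInIsConstant) {
    bool CarrySet = (CarryIn & CarryBit) != 0;
    // Add wants the carry clear; subtract wants it set. Before this
    // fix, Add was passed as true for the subtracts too, inverting
    // this test.
    if (Add ? !CarrySet : CarrySet)
      return OpcodeWithNoCarry; // VADCI / VSBCI
  }
  return OpcodeWithCarry; // VADC / VSBC: carry comes from FPSCR
}
```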
The VSBCI instruction behaves as if the carry flag input is set, unlike the VADCI instruction, which behaves as if it is clear. The IR intrinsics take and return the carry flag in the format used in FPSCR, with the carry in bit 29, so clang needs to pass the initial carry-in in that format too.
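At the source level the carry is exchanged as 0 or 1 through the pointer argument, and clang performs the bit-29 conversion. A small usage sketch (the function and array layout are hypothetical, but the intrinsics are the real ACLE ones, mirroring the sub_256 test in the diff):

```cpp
#include <arm_mve.h> // requires an MVE-enabled Arm target

// Hypothetical 256-bit subtract built from two 128-bit limbs.
// vsbciq starts the borrow chain with the carry-in behaving as set
// (i.e. "no borrow yet"); vsbcq then reads the carry produced by the
// previous step from *carry and updates it.
void sub256(const uint32x4_t a[2], const uint32x4_t b[2], uint32x4_t r[2]) {
  unsigned carry;                        // holds 0 or 1, not the bit-29 form
  r[0] = vsbciq_u32(a[0], b[0], &carry); // initial carry fixed by VSBCI
  r[1] = vsbcq_u32(a[1], b[1], &carry);  // carry-in from previous limb
}
```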
@llvm/pr-subscribers-clang @llvm/pr-subscribers-backend-arm

Author: Oliver Stannard (ostannard)

Changes

There were two bugs in the implementation of the MVE vsbciq (subtract with carry across vector, with initial carry value) intrinsics:

- In instruction selection, the subtract intrinsics were handled with Add=1, so VSBCI was selected when the carry-in was zero rather than when it was set.
- In clang, the initial carry-in for the vsbciq intrinsics was emitted as zero, instead of in the FPSCR format the IR intrinsic expects, with the carry flag in bit 29.

These two bugs almost cancelled each other out, but resulted in incorrect code when the vsbcq intrinsics (with a carry-in) were used and the carry-in was a compile-time constant: a carry-in the compiler could fold to zero made instruction selection pick VSBCI, which behaves as if the carry is set.

Full diff: https://github.com/llvm/llvm-project/pull/118284.diff

5 Files Affected:

- clang/include/clang/Basic/arm_mve.td
- clang/test/CodeGen/arm-mve-intrinsics/vadc.c
- llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
- llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc.ll
- llvm/test/CodeGen/Thumb2/mve-vadc-vsbc-spill.ll
diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td
index 93abbc47c54dd5..6dd8c52ddfd772 100644
--- a/clang/include/clang/Basic/arm_mve.td
+++ b/clang/include/clang/Basic/arm_mve.td
@@ -1270,13 +1270,13 @@ defm sqrshr: ScalarSaturatingShiftReg<s32, s64>;
def lsll: LongScalarShift<u64, (args s32:$sh), (IRInt<"lsll"> $lo, $hi, $sh)>;
def asrl: LongScalarShift<s64, (args s32:$sh), (IRInt<"asrl"> $lo, $hi, $sh)>;
-multiclass vadcsbc {
+multiclass vadcsbc<dag initial_carry_in> {
def q: Intrinsic<Vector, (args Vector:$a, Vector:$b, Ptr<uint>:$carry),
(seq (IRInt<NAME, [Vector]> $a, $b, (shl (load $carry), 29)):$pair,
(store (and 1, (lshr (xval $pair, 1), 29)), $carry),
(xval $pair, 0))>;
def iq: Intrinsic<Vector, (args Vector:$a, Vector:$b, Ptr<uint>:$carry),
- (seq (IRInt<NAME, [Vector]> $a, $b, 0):$pair,
+ (seq (IRInt<NAME, [Vector]> $a, $b, initial_carry_in):$pair,
(store (and 1, (lshr (xval $pair, 1), 29)), $carry),
(xval $pair, 0))>;
def q_m: Intrinsic<Vector, (args Vector:$inactive, Vector:$a, Vector:$b,
@@ -1288,13 +1288,13 @@ multiclass vadcsbc {
def iq_m: Intrinsic<Vector, (args Vector:$inactive, Vector:$a, Vector:$b,
Ptr<uint>:$carry, Predicate:$pred),
(seq (IRInt<NAME # "_predicated", [Vector, Predicate]> $inactive, $a, $b,
- 0, $pred):$pair,
+ initial_carry_in, $pred):$pair,
(store (and 1, (lshr (xval $pair, 1), 29)), $carry),
(xval $pair, 0))>;
}
let params = T.Int32 in {
- defm vadc: vadcsbc;
- defm vsbc: vadcsbc;
+ defm vadc: vadcsbc<(u32 0)>;
+ defm vsbc: vadcsbc<(shl 1, 29)>;
}
let params = T.Int in {
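For cross-reference with the updated tests below: the `(shl 1, 29)` initial carry-in for vsbc lowers to the immediate 536870912 seen in the CHECK lines, i.e. the carry flag in its FPSCR bit position:

```cpp
#include <cstdint>

// The FPSCR carry flag lives in bit 29, so the fixed carry-in that the
// vsbciq intrinsics now pass to the IR intrinsic is 1 << 29.
static_assert((UINT32_C(1) << 29) == 536870912u,
              "bit-29 carry-in constant used in the updated tests");
```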
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vadc.c b/clang/test/CodeGen/arm-mve-intrinsics/vadc.c
index 21087b83300c88..29e067d65c6c89 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vadc.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vadc.c
@@ -92,7 +92,7 @@ int32x4_t test_vadcq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, unsigne
// CHECK-LABEL: @test_vsbciq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0)
+// CHECK-NEXT: [[TMP0:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 536870912)
// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP0]], 1
// CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 29
// CHECK-NEXT: [[TMP3:%.*]] = and i32 1, [[TMP2]]
@@ -110,7 +110,7 @@ int32x4_t test_vsbciq_s32(int32x4_t a, int32x4_t b, unsigned *carry_out) {
// CHECK-LABEL: @test_vsbciq_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0)
+// CHECK-NEXT: [[TMP0:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 536870912)
// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP0]], 1
// CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 29
// CHECK-NEXT: [[TMP3:%.*]] = and i32 1, [[TMP2]]
@@ -170,7 +170,7 @@ uint32x4_t test_vsbcq_u32(uint32x4_t a, uint32x4_t b, unsigned *carry) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 536870912, <4 x i1> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP2]], 1
// CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 29
// CHECK-NEXT: [[TMP5:%.*]] = and i32 1, [[TMP4]]
@@ -190,7 +190,7 @@ int32x4_t test_vsbciq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, unsign
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 536870912, <4 x i1> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP2]], 1
// CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 29
// CHECK-NEXT: [[TMP5:%.*]] = and i32 1, [[TMP4]]
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 73ee8cf81adcd6..9714370ec7cc8e 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -5229,7 +5229,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
return;
case Intrinsic::arm_mve_vsbc:
case Intrinsic::arm_mve_vsbc_predicated:
- SelectMVE_VADCSBC(N, ARM::MVE_VSBC, ARM::MVE_VSBCI, true,
+ SelectMVE_VADCSBC(N, ARM::MVE_VSBC, ARM::MVE_VSBCI, false,
IntNo == Intrinsic::arm_mve_vsbc_predicated);
return;
case Intrinsic::arm_mve_vshlc:
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc.ll
index 967e8a94900e16..2b4c478dfe3e6a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc.ll
@@ -108,7 +108,7 @@ define arm_aapcs_vfpcc <4 x i32> @test_vsbciq_s32(<4 x i32> %a, <4 x i32> %b, pt
; CHECK-NEXT: str r1, [r0]
; CHECK-NEXT: bx lr
entry:
- %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a, <4 x i32> %b, i32 0)
+ %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a, <4 x i32> %b, i32 536870912)
%1 = extractvalue { <4 x i32>, i32 } %0, 1
%2 = lshr i32 %1, 29
%3 = and i32 %2, 1
@@ -125,6 +125,46 @@ define arm_aapcs_vfpcc <4 x i32> @test_vsbciq_u32(<4 x i32> %a, <4 x i32> %b, pt
; CHECK-NEXT: ubfx r1, r1, #29, #1
; CHECK-NEXT: str r1, [r0]
; CHECK-NEXT: bx lr
+entry:
+ %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a, <4 x i32> %b, i32 536870912)
+ %1 = extractvalue { <4 x i32>, i32 } %0, 1
+ %2 = lshr i32 %1, 29
+ %3 = and i32 %2, 1
+ store i32 %3, ptr %carry_out, align 4
+ %4 = extractvalue { <4 x i32>, i32 } %0, 0
+ ret <4 x i32> %4
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vsbcq_s32_carry_in_zero(<4 x i32> %a, <4 x i32> %b, ptr nocapture %carry_out) {
+; CHECK-LABEL: test_vsbcq_s32_carry_in_zero:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: vmsr fpscr_nzcvqc, r1
+; CHECK-NEXT: vsbc.i32 q0, q0, q1
+; CHECK-NEXT: vmrs r1, fpscr_nzcvqc
+; CHECK-NEXT: ubfx r1, r1, #29, #1
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a, <4 x i32> %b, i32 0)
+ %1 = extractvalue { <4 x i32>, i32 } %0, 1
+ %2 = lshr i32 %1, 29
+ %3 = and i32 %2, 1
+ store i32 %3, ptr %carry_out, align 4
+ %4 = extractvalue { <4 x i32>, i32 } %0, 0
+ ret <4 x i32> %4
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vsbcq_u32_carry_in_zero(<4 x i32> %a, <4 x i32> %b, ptr nocapture %carry_out) {
+; CHECK-LABEL: test_vsbcq_u32_carry_in_zero:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: vmsr fpscr_nzcvqc, r1
+; CHECK-NEXT: vsbc.i32 q0, q0, q1
+; CHECK-NEXT: vmrs r1, fpscr_nzcvqc
+; CHECK-NEXT: ubfx r1, r1, #29, #1
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: bx lr
entry:
%0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a, <4 x i32> %b, i32 0)
%1 = extractvalue { <4 x i32>, i32 } %0, 1
@@ -196,7 +236,7 @@ define arm_aapcs_vfpcc <4 x i32> @test_vsbciq_m_s32(<4 x i32> %inactive, <4 x i3
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
- %2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1)
+ %2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 536870912, <4 x i1> %1)
%3 = extractvalue { <4 x i32>, i32 } %2, 1
%4 = lshr i32 %3, 29
%5 = and i32 %4, 1
@@ -215,6 +255,54 @@ define arm_aapcs_vfpcc <4 x i32> @test_vsbciq_m_u32(<4 x i32> %inactive, <4 x i3
; CHECK-NEXT: ubfx r1, r1, #29, #1
; CHECK-NEXT: str r1, [r0]
; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 536870912, <4 x i1> %1)
+ %3 = extractvalue { <4 x i32>, i32 } %2, 1
+ %4 = lshr i32 %3, 29
+ %5 = and i32 %4, 1
+ store i32 %5, ptr %carry_out, align 4
+ %6 = extractvalue { <4 x i32>, i32 } %2, 0
+ ret <4 x i32> %6
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vsbcq_m_s32_carry_in_zero(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, ptr nocapture %carry_out, i16 zeroext %p) {
+; CHECK-LABEL: test_vsbcq_m_s32_carry_in_zero:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movs r2, #0
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vmsr fpscr_nzcvqc, r2
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vsbct.i32 q0, q1, q2
+; CHECK-NEXT: vmrs r1, fpscr_nzcvqc
+; CHECK-NEXT: ubfx r1, r1, #29, #1
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1)
+ %3 = extractvalue { <4 x i32>, i32 } %2, 1
+ %4 = lshr i32 %3, 29
+ %5 = and i32 %4, 1
+ store i32 %5, ptr %carry_out, align 4
+ %6 = extractvalue { <4 x i32>, i32 } %2, 0
+ ret <4 x i32> %6
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vsbcq_m_u32_carry_in_zero(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, ptr nocapture %carry_out, i16 zeroext %p) {
+; CHECK-LABEL: test_vsbcq_m_u32_carry_in_zero:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movs r2, #0
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vmsr fpscr_nzcvqc, r2
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vsbct.i32 q0, q1, q2
+; CHECK-NEXT: vmrs r1, fpscr_nzcvqc
+; CHECK-NEXT: ubfx r1, r1, #29, #1
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
diff --git a/llvm/test/CodeGen/Thumb2/mve-vadc-vsbc-spill.ll b/llvm/test/CodeGen/Thumb2/mve-vadc-vsbc-spill.ll
index 7fe8e94589a0c5..8a0134a4c5ff29 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vadc-vsbc-spill.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vadc-vsbc-spill.ll
@@ -59,7 +59,7 @@ define void @sub_256(<4 x i32> %a_low, <4 x i32> %a_high, <4 x i32> %b_low, <4 x
; CHECK-NEXT: pop.w {r7, lr}
; CHECK-NEXT: b use_int32x4_t
entry:
- %adc_low = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a_low, <4 x i32> %b_low, i32 0)
+ %adc_low = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a_low, <4 x i32> %b_low, i32 536870912)
%carry = extractvalue { <4 x i32>, i32 } %adc_low, 1
%result_low = extractvalue { <4 x i32>, i32 } %adc_low, 0
tail call void @use_int32x4_t(<4 x i32> %result_low)