-
Notifications
You must be signed in to change notification settings - Fork 15.4k
[AArch64] Keep floating-point conversion in SIMD #147707
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-backend-aarch64 Author: Guy David (guy-david) ChangesStores can be issued faster if the result is kept in the SIMD/FP registers. Full diff: https://github.com/llvm/llvm-project/pull/147707.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 811877ffacedb..68a1f41535680 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6598,6 +6598,16 @@ def : Pat<(f64 (AArch64frsqrts (f64 FPR64:$Rn), (f64 FPR64:$Rm))),
def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
(FRSQRTSv2f64 FPR128:$Rn, FPR128:$Rm)>;
+let HasOneUse = 1 in {
+def fp_to_uint_oneuse : PatFrag<(ops node:$src0), (fp_to_uint $src0)>;
+def fp_to_sint_oneuse : PatFrag<(ops node:$src0), (fp_to_sint $src0)>;
+}
+
+def ignore_assertzext : PatFrag<
+ (ops node:$src),
+ (assertzext node:$src)
+>;
+
// Some float -> int -> float conversion patterns for which we want to keep the
// int values in FP registers using the corresponding NEON instructions to
// avoid more costly int <-> fp register transfers.
@@ -6632,6 +6642,38 @@ def : Pat<(f64 (sint_to_fp (i64 (vector_extract (v2i64 FPR128:$Rn), (i64 0))))),
def : Pat<(f64 (uint_to_fp (i64 (vector_extract (v2i64 FPR128:$Rn), (i64 0))))),
(UCVTFv1i64 (i64 (EXTRACT_SUBREG (v2i64 FPR128:$Rn), dsub)))>;
+// float -> int conversion followed by a store should use the value in the first
+// lane to avoid expensive fpr -> gpr transfers.
+let AddedComplexity = 19 in {
+// f32 -> i32
+def : Pat<(store (ignore_assertzext (i32 (fp_to_uint_oneuse f32:$src))), GPR64sp:$Rn),
+ (STRSui (FCVTZUv1i32 f32:$src), GPR64sp:$Rn, (i64 0))>;
+def : Pat<(store (ignore_assertzext (i32 (fp_to_sint_oneuse f32:$src))), GPR64sp:$Rn),
+ (STRSui (FCVTZSv1i32 f32:$src), GPR64sp:$Rn, (i64 0))>;
+
+// f64 -> i64
+def : Pat<(store (ignore_assertzext (i64 (fp_to_uint_oneuse f64:$src))), GPR64sp:$Rn),
+ (STRDui (FCVTZUv1i64 f64:$src), GPR64sp:$Rn, (i64 0))>;
+def : Pat<(store (ignore_assertzext (i64 (fp_to_sint_oneuse f64:$src))), GPR64sp:$Rn),
+ (STRDui (FCVTZSv1i64 f64:$src), GPR64sp:$Rn, (i64 0))>;
+
+// f32 -> i8
+def : Pat<(truncstorei8 (ignore_assertzext (i32 (fp_to_uint_oneuse (f32 FPR32:$src)))), GPR64sp:$Rn),
+ (STRBui (aarch64mfp8 (EXTRACT_SUBREG (FCVTZUv1i32 (f32 FPR32:$src)), bsub)),
+ GPR64sp:$Rn, (i64 0))>;
+def : Pat<(truncstorei8 (ignore_assertzext (i32 (fp_to_sint_oneuse (f32 FPR32:$src)))), GPR64sp:$Rn),
+ (STRBui (aarch64mfp8 (EXTRACT_SUBREG (FCVTZSv1i32 (f32 FPR32:$src)), bsub)),
+ GPR64sp:$Rn, (i64 0))>;
+
+// f32 -> i16
+def : Pat<(truncstorei16 (ignore_assertzext (i32 (fp_to_uint_oneuse (f32 FPR32:$src)))), GPR64sp:$Rn),
+ (STRHui (f16 (EXTRACT_SUBREG (FCVTZUv1i32 (f32 FPR32:$src)), hsub)),
+ GPR64sp:$Rn, (i64 0))>;
+def : Pat<(truncstorei16 (ignore_assertzext (i32 (fp_to_sint_oneuse (f32 FPR32:$src)))), GPR64sp:$Rn),
+ (STRHui (f16 (EXTRACT_SUBREG (FCVTZSv1i32 (f32 FPR32:$src)), hsub)),
+ GPR64sp:$Rn, (i64 0))>;
+}
+
// fp16: integer extraction from vector must be at least 32-bits to be legal.
// Actual extraction result is then an in-reg sign-extension of lower 16-bits.
let Predicates = [HasNEONandIsSME2p2StreamingSafe, HasFullFP16] in {
diff --git a/llvm/test/CodeGen/AArch64/store-float-conversion.ll b/llvm/test/CodeGen/AArch64/store-float-conversion.ll
new file mode 100644
index 0000000000000..ca12fcb1dcc1b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/store-float-conversion.ll
@@ -0,0 +1,117 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -verify-machineinstrs -mtriple=aarch64 < %s | FileCheck %s
+
+define void @f32_to_u8(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_u8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu s0, s0
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptoui float %f to i32
+ %trunc = trunc i32 %conv to i8
+ store i8 %trunc, ptr %dst
+ ret void
+}
+
+define void @f32_to_s8(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_s8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi float %f to i32
+ %trunc = trunc i32 %conv to i8
+ store i8 %trunc, ptr %dst
+ ret void
+}
+
+define void @f32_to_u16(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_u16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu s0, s0
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptoui float %f to i32
+ %trunc = trunc i32 %conv to i16
+ store i16 %trunc, ptr %dst
+ ret void
+}
+
+define void @f32_to_s16(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi float %f to i32
+ %trunc = trunc i32 %conv to i16
+ store i16 %trunc, ptr %dst
+ ret void
+}
+
+define void @f32_to_u32(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_u32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu s0, s0
+; CHECK-NEXT: str s0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptoui float %f to i32
+ store i32 %conv, ptr %dst
+ ret void
+}
+
+define void @f32_to_s32(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: str s0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi float %f to i32
+ store i32 %conv, ptr %dst
+ ret void
+}
+
+define void @f64_to_u64(double %d, ptr %dst) {
+; CHECK-LABEL: f64_to_u64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu d0, d0
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptoui double %d to i64
+ store i64 %conv, ptr %dst
+ ret void
+}
+
+define void @f64_to_s64(double %d, ptr %dst) {
+; CHECK-LABEL: f64_to_s64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs d0, d0
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi double %d to i64
+ store i64 %conv, ptr %dst
+ ret void
+}
+
+define i32 @f32_to_i32_multiple_uses(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_i32_multiple_uses:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs w8, s0
+; CHECK-NEXT: mov x9, x0
+; CHECK-NEXT: mov w0, w8
+; CHECK-NEXT: strb w8, [x9]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi float %f to i32
+ %trunc = trunc i32 %conv to i8
+ store i8 %trunc, ptr %dst
+ ret i32 %conv
+}
|
c73288a to
54e0bfd
Compare
david-arm
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The intent of the PR makes sense to me and it's nice to see it using the higher throughput versions of fcvtzs, etc. I just had a few comments about possibly improving the patterns - let me know what you think!
|
Is it possible to write this as a post legalisation DAG combine? to remove the need for dedicated isel patterns. |
Do you mean a pattern that combines stores whenever the source is a |
54e0bfd to
58e48af
Compare
I was thinking the former. There's already a combine to handle the case where the source is |
Sounds feasible, although I don't have too much experience with that. Can we postpone it to a follow-up PR? |
I'd rather not postpone, but am happy to take a look to see what's involved. |
I've hit a few blockers around legalization, see 147d2c6:
|
|
We can avoid legalization issues if we use 2-element vectors instead of single element ones as shown here: 89d044e. The drawback is that now the core has to operate on twice the data because the emitted code has |
|
Thanks for continuing to dig down on this. My immediate thoughts are:
|
58e48af to
9619851
Compare
|
Adding a pattern around insertions to undef and extractions seems to work. |
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
9619851 to
2d33584
Compare
2d33584 to
a789b11
Compare
a53c68e to
9161cc3
Compare
611070b to
4eee9a4
Compare
|
/cherry-pick 58d70dc |
|
/pull-request #151317 |
fhahn
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It looks like the patch is causing a crash for the reproducer below:
bin/llc reduced.ll
LLVM ERROR: Cannot select: t42: i64 = extract_vector_elt t40, Constant:i64<0>
t40: v8i16 = AArch64ISD::NVCAST t39
t39: v2i64 = fp_to_uint t38
t38: v2f64 = scalar_to_vector t4
t4: f64 = fadd t2, ConstantFP:f64<2.000000e+00>
t2: f64,ch = CopyFromReg t0, Register:f64 %3
In function: test
target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "arm64-apple-macosx15.0.0"
declare i1 @cond()
define void @test(ptr %dst, double %0) {
entry:
br label %loop
loop:
%ptr.iv = phi ptr [ %dst, %entry ], [ %ptr.iv.next, %loop ]
%1 = fadd double %0, 2.000000e+00
%conv137.us.us.us = fptoui double %1 to i64
%conv142.us.us.us = trunc i64 %conv137.us.us.us to i16
%ptr.iv.next = getelementptr i8, ptr %ptr.iv, i64 2
store i16 %conv142.us.us.us, ptr %ptr.iv, align 2
%c = call i1 @cond()
br i1 %c, label %loop, label %exit
exit:
ret void
}
…oreValueFPToInt. This reverts a small part of #147707 that triggers an isel failure because we cannot extract an >i32 element into an i64 result.
|
For now I've pushed a commit to remove the problematic code, I forgot the extracts are not as flexible as you'd hope. The majority of the PR's functionality is maintained and we can circle back if supporting the post inc store case proves valuable. |
…n combineStoreValueFPToInt. This reverts a small part of llvm/llvm-project#147707 that triggers an isel failure because we cannot extract an >i32 element into an i64 result.
|
@fhahn Thank you very much for coming up with a reproducer, I guess my tests were not comprehensive enough. |
The commit at #147707 introduced a bug because of missing patterns for post-inc stores where the input is a vector_extract with i64 types. Additionally, remove the early pre-legalization early-exit as it can miss its opportunity to apply the optimization.
|
Hi. Sorry for the late reply but I saw this was requested as a backport. Am I right that this takes a fptosi/fptoui and converts it to a vector operation, so that we can then convert it back to a scalar operation? That feels a bit of a messy way of handling it (*). There are some cases in https://godbolt.org/z/dhTh8Pb98 (although godbolt doesn't seem to have caught up so far), of cases that should probably be preferring existing lowering as fptosi(fmul(..)) and fptosi(round/ceil/etc) can be converted to single instructions. Saving the cross-register-bank copy is a great, but I'm not sure it is better than folding in another instruction like fmul. (* I'm really just not a fan of creating messy dags - it often has a hidden cost that only comes up in complex cases. The code looks sensible otherwise. It sounds like what this really needs, fp16 aside, is to convert the dag to |
|
PreprocessISelDAG sounds useful, otherwise as you say there wasn't much option. Presumably in the short term we can restore cases like you mention by simply checking the source of the |
|
Thanks for the suggestions, I'll try the bitcast route and see where it leads. |
|
PreprocessISelDAG is a new one that we don't use very much, I just happened to run into it the other day on another target. You might find that because the patterns are greedy, it ends up producing the same thing anyway. (i.e. it picks bitcast(fptosi)+round over fmov+fptosi_round. It hopefully allows adding patterns for bitcast(fptosi(round all at once though. |
Stores can be issued faster if the result is kept in the SIMD/FP registers. The `HasOneUse` guards against creating two floating point conversions, if for example there's some arithmetic done on the converted value as well. Another approach would be to inspect the user instructions during lowering, but I don't see that type of check in the lowering too often.
…oreValueFPToInt. This reverts a small part of llvm#147707 that triggers an isel failure because we cannot extract an >i32 element into an i64 result.
Stores can be issued faster if the result is kept in the SIMD/FP registers.
The
HasOneUseguards against creating two floating point conversions, if for example there's some arithmetic done on the converted value as well. Another approach would be to inspect the user instructions during lowering, but I don't see that type of check in the lowering too often.