-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[AArch64] Generate zeroing forms of certain SVE2.2 instructions (1/11) #116259
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AArch64] Generate zeroing forms of certain SVE2.2 instructions (1/11) #116259
Conversation
|
@llvm/pr-subscribers-backend-aarch64 Author: Momchil Velikov (momchil-velikov) ChangesSVE2.2 introduces instructions with predicated forms with zeroing of the inactive lanes. This allows in some cases to save a This patch adds support for emitting the zeroing forms of Patch is 35.20 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/116259.diff 4 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index e4ad27d4bcfc00..4b9d34be02b2ef 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -381,6 +381,9 @@ def NoUseScalarIncVL : Predicate<"!Subtarget->useScalarIncVL()">;
def UseSVEFPLD1R : Predicate<"!Subtarget->noSVEFPLD1R()">;
+def UseUnaryUndefPseudos
+ : Predicate<"!(Subtarget->hasSVE2() || (Subtarget->isStreaming() && Subtarget->hasSME2()))">;
+
def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER",
SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
SDTCisInt<1>]>>;
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 4f146b3ee59e9a..0ac009e38eed8b 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -664,6 +664,14 @@ let Predicates = [HasSVEorSME] in {
defm FABS_ZPmZ : sve_int_un_pred_arit_bitwise_fp<0b100, "fabs", AArch64fabs_mt>;
defm FNEG_ZPmZ : sve_int_un_pred_arit_bitwise_fp<0b101, "fneg", AArch64fneg_mt>;
+ let Predicates = [HasSVEorSME, UseUnaryUndefPseudos] in {
+ defm FABS_ZPmZ : sve_int_un_pred_arit_hsd<AArch64fabs_mt>;
+ defm FNEG_ZPmZ : sve_int_un_pred_arit_hsd<AArch64fneg_mt>;
+
+ defm ABS_ZPmZ : sve_int_un_pred_arit_bhsd<AArch64abs_mt>;
+ defm NEG_ZPmZ : sve_int_un_pred_arit_bhsd<AArch64neg_mt>;
+ }
+
foreach VT = [nxv2bf16, nxv4bf16, nxv8bf16] in {
// No dedicated instruction, so just clear the sign bit.
def : Pat<(VT (fabs VT:$op)),
@@ -4310,16 +4318,16 @@ let Predicates = [HasSVE2p2orSME2p2] in {
defm NOT_ZPzZ : sve_int_un_pred_arit_bitwise_z<0b110, "not">;
// floating point
- defm FABS_ZPzZ : sve_int_un_pred_arit_bitwise_fp_z<0b100, "fabs">;
- defm FNEG_ZPzZ : sve_int_un_pred_arit_bitwise_fp_z<0b101, "fneg">;
+ defm FABS_ZPzZ : sve_int_un_pred_arit_bitwise_fp_z<0b100, "fabs", AArch64fabs_mt>;
+ defm FNEG_ZPzZ : sve_int_un_pred_arit_bitwise_fp_z<0b101, "fneg", AArch64fneg_mt>;
// SVE2p2 integer unary arithmetic, zeroing predicate
defm SXTB_ZPzZ : sve_int_un_pred_arit_h_z<0b000, "sxtb">;
defm UXTB_ZPzZ : sve_int_un_pred_arit_h_z<0b001, "uxtb">;
defm SXTH_ZPzZ : sve_int_un_pred_arit_w_z<0b010, "sxth">;
defm UXTH_ZPzZ : sve_int_un_pred_arit_w_z<0b011, "uxth">;
- defm ABS_ZPzZ : sve_int_un_pred_arit_z< 0b110, "abs">;
- defm NEG_ZPzZ : sve_int_un_pred_arit_z< 0b111, "neg">;
+ defm ABS_ZPzZ : sve_int_un_pred_arit_z< 0b110, "abs", AArch64abs_mt>;
+ defm NEG_ZPzZ : sve_int_un_pred_arit_z< 0b111, "neg", AArch64neg_mt>;
def SXTW_ZPzZ_D : sve_int_un_pred_arit_z<0b11, 0b1000, "sxtw", ZPR64>;
def UXTW_ZPzZ_D : sve_int_un_pred_arit_z<0b11, 0b1010, "uxtw", ZPR64>;
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 6de6aed3b2a816..84bf3585db5a16 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -482,6 +482,8 @@ let Predicates = [HasSVEorSME] in {
//===----------------------------------------------------------------------===//
// SVE pattern match helpers.
//===----------------------------------------------------------------------===//
+def SVEDup0 : ComplexPattern<vAny, 0, "SelectDupZero", []>;
+def SVEDup0Undef : ComplexPattern<vAny, 0, "SelectDupZeroOrUndef", []>;
class SVE_1_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
Instruction inst>
@@ -502,6 +504,11 @@ multiclass SVE_1_Op_PassthruUndef_Pat<ValueType vtd, SDPatternOperator op, Value
(inst $Op3, $Op1, $Op2)>;
}
+class SVE_1_Op_PassthruUndefZero_Pat<ValueType vtd, SDPatternOperator op, ValueType pg,
+ ValueType vts, Instruction inst>
+ : Pat<(vtd (op pg:$Op1, vts:$Op2, (vtd (SVEDup0Undef)))),
+ (inst $Op1, $Op2)>;
+
// Used to match FP_ROUND_MERGE_PASSTHRU, which has an additional flag for the
// type of rounding. This is matched by timm0_1 in pattern below and ignored.
class SVE_1_Op_Passthru_Round_Pat<ValueType vtd, SDPatternOperator op, ValueType pg,
@@ -517,13 +524,12 @@ multiclass SVE_1_Op_PassthruUndef_Round_Pat<ValueType vtd, SDPatternOperator op,
(inst $Op3, $Op1, $Op2)>;
}
-def SVEDup0 : ComplexPattern<vAny, 0, "SelectDupZero", []>;
-
class SVE_1_Op_PassthruZero_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
ValueType vt2, Instruction inst>
: Pat<(vtd (op (vtd (SVEDup0)), vt1:$Op1, vt2:$Op2)),
(inst (IMPLICIT_DEF), $Op1, $Op2)>;
+
class SVE_1_Op_Imm_OptLsl_Pat<ValueType vt, SDPatternOperator op, ZPRRegOp zprty,
ValueType it, ComplexPattern cpx, Instruction inst>
: Pat<(vt (op (vt zprty:$Op1), (vt (splat_vector (it (cpx i32:$imm, i32:$shift)))))),
@@ -606,8 +612,6 @@ class SVE_4_Op_Imm_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
: Pat<(vtd (op vt1:$Op1, vt2:$Op2, vt3:$Op3, (vt4 ImmTy:$Op4))),
(inst $Op1, $Op2, $Op3, ImmTy:$Op4)>;
-def SVEDup0Undef : ComplexPattern<vAny, 0, "SelectDupZeroOrUndef", []>;
-
let AddedComplexity = 1 in {
class SVE_3_Op_Pat_SelZero<ValueType vtd, SDPatternOperator op, ValueType vt1,
ValueType vt2, ValueType vt3, Instruction inst>
@@ -4820,23 +4824,18 @@ multiclass sve_int_un_pred_arit<bits<3> opc, string asm,
def : SVE_1_Op_Passthru_Pat<nxv8i16, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Passthru_Pat<nxv4i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Passthru_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
-
- def _B_UNDEF : PredOneOpPassthruPseudo<NAME # _B, ZPR8>;
- def _H_UNDEF : PredOneOpPassthruPseudo<NAME # _H, ZPR16>;
- def _S_UNDEF : PredOneOpPassthruPseudo<NAME # _S, ZPR32>;
- def _D_UNDEF : PredOneOpPassthruPseudo<NAME # _D, ZPR64>;
-
- defm : SVE_1_Op_PassthruUndef_Pat<nxv16i8, op, nxv16i1, nxv16i8, !cast<Pseudo>(NAME # _B_UNDEF)>;
- defm : SVE_1_Op_PassthruUndef_Pat<nxv8i16, op, nxv8i1, nxv8i16, !cast<Pseudo>(NAME # _H_UNDEF)>;
- defm : SVE_1_Op_PassthruUndef_Pat<nxv4i32, op, nxv4i1, nxv4i32, !cast<Pseudo>(NAME # _S_UNDEF)>;
- defm : SVE_1_Op_PassthruUndef_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Pseudo>(NAME # _D_UNDEF)>;
}
-multiclass sve_int_un_pred_arit_z<bits<3> opc, string asm> {
+multiclass sve_int_un_pred_arit_z<bits<3> opc, string asm, SDPatternOperator op> {
def _B : sve_int_un_pred_arit_z<0b00, { opc, 0b0 }, asm, ZPR8>;
def _H : sve_int_un_pred_arit_z<0b01, { opc, 0b0 }, asm, ZPR16>;
def _S : sve_int_un_pred_arit_z<0b10, { opc, 0b0 }, asm, ZPR32>;
def _D : sve_int_un_pred_arit_z<0b11, { opc, 0b0 }, asm, ZPR64>;
+
+ def : SVE_1_Op_PassthruUndefZero_Pat<nxv16i8, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_1_Op_PassthruUndefZero_Pat<nxv8i16, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_1_Op_PassthruUndefZero_Pat<nxv4i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_1_Op_PassthruUndefZero_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_un_pred_arit_h<bits<3> opc, string asm,
@@ -4950,7 +4949,22 @@ multiclass sve_int_un_pred_arit_bitwise_fp<bits<3> opc, string asm,
def : SVE_1_Op_Passthru_Pat<nxv4f32, op, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Passthru_Pat<nxv2f32, op, nxv2i1, nxv2f32, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Passthru_Pat<nxv2f64, op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
+}
+
+multiclass sve_int_un_pred_arit_bitwise_fp_z<bits<3> opc, string asm, SDPatternOperator op> {
+ def _H : sve_int_un_pred_arit_z<0b01, { opc, 0b1 }, asm, ZPR16>;
+ def _S : sve_int_un_pred_arit_z<0b10, { opc, 0b1 }, asm, ZPR32>;
+ def _D : sve_int_un_pred_arit_z<0b11, { opc, 0b1 }, asm, ZPR64>;
+ def : SVE_1_Op_PassthruUndefZero_Pat<nxv8f16, op, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_1_Op_PassthruUndefZero_Pat<nxv4f16, op, nxv4i1, nxv4f16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_1_Op_PassthruUndefZero_Pat<nxv2f16, op, nxv2i1, nxv2f16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_1_Op_PassthruUndefZero_Pat<nxv4f32, op, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_1_Op_PassthruUndefZero_Pat<nxv2f32, op, nxv2i1, nxv2f32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_1_Op_PassthruUndefZero_Pat<nxv2f64, op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
+}
+
+multiclass sve_int_un_pred_arit_hsd<SDPatternOperator op> {
def _H_UNDEF : PredOneOpPassthruPseudo<NAME # _H, ZPR16>;
def _S_UNDEF : PredOneOpPassthruPseudo<NAME # _S, ZPR32>;
def _D_UNDEF : PredOneOpPassthruPseudo<NAME # _D, ZPR64>;
@@ -4963,10 +4977,16 @@ multiclass sve_int_un_pred_arit_bitwise_fp<bits<3> opc, string asm,
defm : SVE_1_Op_PassthruUndef_Pat<nxv2f64, op, nxv2i1, nxv2f64, !cast<Pseudo>(NAME # _D_UNDEF)>;
}
-multiclass sve_int_un_pred_arit_bitwise_fp_z<bits<3> opc, string asm> {
- def _H : sve_int_un_pred_arit_z<0b01, { opc, 0b1 }, asm, ZPR16>;
- def _S : sve_int_un_pred_arit_z<0b10, { opc, 0b1 }, asm, ZPR32>;
- def _D : sve_int_un_pred_arit_z<0b11, { opc, 0b1 }, asm, ZPR64>;
+multiclass sve_int_un_pred_arit_bhsd<SDPatternOperator op> {
+ def _B_UNDEF : PredOneOpPassthruPseudo<NAME # _B, ZPR8>;
+ def _H_UNDEF : PredOneOpPassthruPseudo<NAME # _H, ZPR16>;
+ def _S_UNDEF : PredOneOpPassthruPseudo<NAME # _S, ZPR32>;
+ def _D_UNDEF : PredOneOpPassthruPseudo<NAME # _D, ZPR64>;
+
+ defm : SVE_1_Op_PassthruUndef_Pat<nxv16i8, op, nxv16i1, nxv16i8, !cast<Pseudo>(NAME # _B_UNDEF)>;
+ defm : SVE_1_Op_PassthruUndef_Pat<nxv8i16, op, nxv8i1, nxv8i16, !cast<Pseudo>(NAME # _H_UNDEF)>;
+ defm : SVE_1_Op_PassthruUndef_Pat<nxv4i32, op, nxv4i1, nxv4i32, !cast<Pseudo>(NAME # _S_UNDEF)>;
+ defm : SVE_1_Op_PassthruUndef_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Pseudo>(NAME # _D_UNDEF)>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/zeroing-forms-abs-neg.ll b/llvm/test/CodeGen/AArch64/zeroing-forms-abs-neg.ll
new file mode 100644
index 00000000000000..1caee994220f05
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/zeroing-forms-abs-neg.ll
@@ -0,0 +1,666 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mattr=+sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve2p2 < %s | FileCheck %s -check-prefix CHECK-2p2
+
+; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s
+; RUN: llc -mattr=+sme2p2 -force-streaming < %s | FileCheck %s -check-prefix CHECK-2p2
+
+target triple = "aarch64-linux"
+
+define <vscale x 2 x double> @test_svabs_f64_x_1(<vscale x 2 x i1> %pg, <vscale x 2 x double> %x) {
+; CHECK-LABEL: test_svabs_f64_x_1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fabs z0.d, p0/m, z0.d
+; CHECK-NEXT: ret
+;
+; CHECK-2p2-LABEL: test_svabs_f64_x_1:
+; CHECK-2p2: // %bb.0: // %entry
+; CHECK-2p2-NEXT: fabs z0.d, p0/z, z0.d
+; CHECK-2p2-NEXT: ret
+entry:
+ %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fabs.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+ ret <vscale x 2 x double> %0
+}
+
+define <vscale x 2 x double> @test_svabs_f64_x_2(<vscale x 2 x i1> %pg, double %z0, <vscale x 2 x double> %x) {
+; CHECK-LABEL: test_svabs_f64_x_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: movprfx z0, z1
+; CHECK-NEXT: fabs z0.d, p0/m, z1.d
+; CHECK-NEXT: ret
+;
+; CHECK-2p2-LABEL: test_svabs_f64_x_2:
+; CHECK-2p2: // %bb.0: // %entry
+; CHECK-2p2-NEXT: fabs z0.d, p0/z, z1.d
+; CHECK-2p2-NEXT: ret
+entry:
+ %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fabs.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+ ret <vscale x 2 x double> %0
+}
+
+define <vscale x 2 x double> @test_svabs_f64_z(<vscale x 2 x i1> %pg, double %z0, <vscale x 2 x double> %x) {
+; CHECK-LABEL: test_svabs_f64_z:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.d, #0 // =0x0
+; CHECK-NEXT: fabs z0.d, p0/m, z1.d
+; CHECK-NEXT: ret
+;
+; CHECK-2p2-LABEL: test_svabs_f64_z:
+; CHECK-2p2: // %bb.0: // %entry
+; CHECK-2p2-NEXT: fabs z0.d, p0/z, z1.d
+; CHECK-2p2-NEXT: ret
+entry:
+ %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fabs.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+ ret <vscale x 2 x double> %0
+}
+
+define <vscale x 4 x float> @test_svabs_f32_x_1(<vscale x 4 x i1> %pg, <vscale x 4 x float> %x) {
+; CHECK-LABEL: test_svabs_f32_x_1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fabs z0.s, p0/m, z0.s
+; CHECK-NEXT: ret
+;
+; CHECK-2p2-LABEL: test_svabs_f32_x_1:
+; CHECK-2p2: // %bb.0: // %entry
+; CHECK-2p2-NEXT: fabs z0.s, p0/z, z0.s
+; CHECK-2p2-NEXT: ret
+entry:
+ %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fabs.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+ ret <vscale x 4 x float> %0
+}
+
+define <vscale x 4 x float> @test_svabs_f32_x_2(<vscale x 4 x i1> %pg, double %z0, <vscale x 4 x float> %x) {
+; CHECK-LABEL: test_svabs_f32_x_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: movprfx z0, z1
+; CHECK-NEXT: fabs z0.s, p0/m, z1.s
+; CHECK-NEXT: ret
+;
+; CHECK-2p2-LABEL: test_svabs_f32_x_2:
+; CHECK-2p2: // %bb.0: // %entry
+; CHECK-2p2-NEXT: fabs z0.s, p0/z, z1.s
+; CHECK-2p2-NEXT: ret
+entry:
+ %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fabs.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+ ret <vscale x 4 x float> %0
+}
+
+define <vscale x 4 x float> @test_svabs_f32_z(<vscale x 4 x i1> %pg, double %z0, <vscale x 4 x float> %x) {
+; CHECK-LABEL: test_svabs_f32_z:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.s, #0 // =0x0
+; CHECK-NEXT: fabs z0.s, p0/m, z1.s
+; CHECK-NEXT: ret
+;
+; CHECK-2p2-LABEL: test_svabs_f32_z:
+; CHECK-2p2: // %bb.0: // %entry
+; CHECK-2p2-NEXT: fabs z0.s, p0/z, z1.s
+; CHECK-2p2-NEXT: ret
+entry:
+ %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fabs.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+ ret <vscale x 4 x float> %0
+}
+
+define <vscale x 8 x half> @test_svabs_f16_x_1(<vscale x 8 x i1> %pg, <vscale x 8 x half> %x) {
+; CHECK-LABEL: test_svabs_f16_x_1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fabs z0.h, p0/m, z0.h
+; CHECK-NEXT: ret
+;
+; CHECK-2p2-LABEL: test_svabs_f16_x_1:
+; CHECK-2p2: // %bb.0: // %entry
+; CHECK-2p2-NEXT: fabs z0.h, p0/z, z0.h
+; CHECK-2p2-NEXT: ret
+entry:
+ %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fabs.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
+ ret <vscale x 8 x half> %0
+}
+
+define <vscale x 8 x half> @test_svabs_f16_x_2(<vscale x 8 x i1> %pg, double %z0, <vscale x 8 x half> %x) {
+; CHECK-LABEL: test_svabs_f16_x_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: movprfx z0, z1
+; CHECK-NEXT: fabs z0.h, p0/m, z1.h
+; CHECK-NEXT: ret
+;
+; CHECK-2p2-LABEL: test_svabs_f16_x_2:
+; CHECK-2p2: // %bb.0: // %entry
+; CHECK-2p2-NEXT: fabs z0.h, p0/z, z1.h
+; CHECK-2p2-NEXT: ret
+entry:
+ %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fabs.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
+ ret <vscale x 8 x half> %0
+}
+
+define <vscale x 8 x half> @test_svabs_f16_z(<vscale x 8 x i1> %pg, double %z0, <vscale x 8 x half> %x) {
+; CHECK-LABEL: test_svabs_f16_z:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.h, #0 // =0x0
+; CHECK-NEXT: fabs z0.h, p0/m, z1.h
+; CHECK-NEXT: ret
+;
+; CHECK-2p2-LABEL: test_svabs_f16_z:
+; CHECK-2p2: // %bb.0: // %entry
+; CHECK-2p2-NEXT: fabs z0.h, p0/z, z1.h
+; CHECK-2p2-NEXT: ret
+entry:
+ %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fabs.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
+ ret <vscale x 8 x half> %0
+}
+
+define <vscale x 16 x i8> @test_svabs_s8_x_1(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %x) {
+; CHECK-LABEL: test_svabs_s8_x_1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: abs z0.b, p0/m, z0.b
+; CHECK-NEXT: ret
+;
+; CHECK-2p2-LABEL: test_svabs_s8_x_1:
+; CHECK-2p2: // %bb.0: // %entry
+; CHECK-2p2-NEXT: abs z0.b, p0/z, z0.b
+; CHECK-2p2-NEXT: ret
+entry:
+ %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.abs.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x)
+ ret <vscale x 16 x i8> %0
+}
+
+define <vscale x 16 x i8> @test_svabs_s8_x_2(<vscale x 16 x i1> %pg, double %z0, <vscale x 16 x i8> %x) {
+; CHECK-LABEL: test_svabs_s8_x_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: movprfx z0, z1
+; CHECK-NEXT: abs z0.b, p0/m, z1.b
+; CHECK-NEXT: ret
+;
+; CHECK-2p2-LABEL: test_svabs_s8_x_2:
+; CHECK-2p2: // %bb.0: // %entry
+; CHECK-2p2-NEXT: abs z0.b, p0/z, z1.b
+; CHECK-2p2-NEXT: ret
+entry:
+ %1 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.abs.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x)
+ ret <vscale x 16 x i8> %1
+}
+
+define <vscale x 16 x i8> @test_svabs_s8_z(<vscale x 16 x i1> %pg, double %z0, <vscale x 16 x i8> %x) {
+; CHECK-LABEL: test_svabs_s8_z:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.b, #0 // =0x0
+; CHECK-NEXT: abs z0.b, p0/m, z1.b
+; CHECK-NEXT: ret
+;
+; CHECK-2p2-LABEL: test_svabs_s8_z:
+; CHECK-2p2: // %bb.0: // %entry
+; CHECK-2p2-NEXT: abs z0.b, p0/z, z1.b
+; CHECK-2p2-NEXT: ret
+entry:
+ %1 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.abs.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x)
+ ret <vscale x 16 x i8> %1
+}
+
+define <vscale x 8 x i16> @test_svabs_s16_x_1(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %x) {
+; CHECK-LABEL: test_svabs_s16_x_1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: abs z0.h, p0/m, z0.h
+; CHECK-NEXT: ret
+;
+; CHECK-2p2-LABEL: test_svabs_s16_x_1:
+; CHECK-2p2: // %bb.0: // %entry
+; CHECK-2p2-NEXT: abs z0.h, p0/z, z0.h
+; CHECK-2p2-NEXT: ret
+entry:
+ %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.abs.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %x)
+ ret <vscale x 8 x i16> %0
+}
+
+define <vscale x 8 x i16> @test_svabs_s16_x_2(<vscale x 8 x i1> %pg, double %z0, <vscale x 8 x i16> %x) {
+; CHECK-LABEL: test_svabs_s16_x_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: movprfx z0, z1
+; CHECK-NEXT: abs z0.h, p0/m, z1.h
+; CHECK-NEXT: ret
+;
+; CHECK-2p2-LABEL: test_svabs_s16_x_2:
+; CHECK-2p2: // %bb.0: // %entry
+; CHECK-2p2-NEXT: abs z0.h, p0/z, z1.h
+; CHECK-2p2-NEXT: ret
+entry:
+ %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.abs.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %x)
+ ret <vscale x 8 x i16> %0
+}
+
+define <vscale x 8 x i16> @test_svabs_s16_z(<vscale x 8 x i1> %pg, double %z0, <vscale x 8 x i16> %x) {
+; CHECK-LABEL: test_svabs_s16_z:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.h, #0 // =0x0
+; CHECK-NEXT: abs z0.h, p0/m, z1.h
+; CHECK-NEXT: ret
+;
+; CHECK-2p2-LABEL: test_svabs_s16_z:
+; CHECK-2p2: // %bb.0: // %entry
+; CHECK-2p2-NEXT: abs z0.h, p0/z, z1.h
+; CHECK-2p2-NEXT: ret
+entry:
+ %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.abs.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %x)
+ ret <vscale x 8 x i16> %0
+}
+
+define <vscale x 4 x i32> @test_svabs_s32_x_1(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %x) {
+; CHECK-LABEL: test_svabs_s32_x_1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: abs z0.s, p0/m, z0.s
+; CHECK-NEXT: ret
+;
+; CHECK-2p2-LABEL: test_svabs_s32_x_1:
+; CHECK-2p2: // %bb.0: // %entry
+; CHECK-2p2-NEXT: abs z0.s, p0/z, z0.s
+; CHECK-2p2-NEXT: ret
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.abs.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %x)
+ ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_svabs_s32_x_2(<vsca...
[truncated]
|
jthackray
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
67a7378 to
e1f4e15
Compare
SVE2.2 introduces instructions with predicated forms with zeroing of the inactive lanes. This allows in some cases to save a `movprfx` or a `mov` instruction when emitting code for `_x` or `_z` variants of intrinsics. This patch adds support for emitting the zeroing forms of `ABS`, `NEG`, `FABS`, and `FNEG` instructions.
e1f4e15 to
c6d7b10
Compare
… all-true predicate When the predicate of a destructive operation is known to be all-true, for example fabs z0.s, p0/m, z1.s then the entire output register is written and we can use a zeroing (instead of a merging) form of the instruction, for example fabs z0.s, p0/z, z1.s thus eliminate the dependency on the input-output destination register without the need to insert a `movprfx`. This patch complements (and in the case of 2b3266c, fixes a regression) the following: 7f4414b [AArch64] Generate zeroing forms of certain SVE2.2 instructions (4/11) (llvm#116830) 2474cf7 [AArch64] Generate zeroing forms of certain SVE2.2 instructions (3/11) (llvm#116829) 6f285d3 [AArch64] Generate zeroing forms of certain SVE2.2 instructions (2/11) (llvm#116828) 2b3266c [AArch64] Generate zeroing forms of certain SVE2.2 instructions (1/11) (llvm#116259)
… all-true predicate (#120595) When the predicate of a destructive operation is known to be all-true, for example fabs z0.s, p0/m, z1.s then the entire output register is written and we can use a zeroing (instead of a merging) form of the instruction, for example fabs z0.s, p0/z, z1.s thus eliminate the dependency on the input-output destination register without the need to insert a `movprfx`. This patch complements (and in the case of 2b3266c, fixes a regression) the following: 7f4414b [AArch64] Generate zeroing forms of certain SVE2.2 instructions (4/11) (#116830) 2474cf7 [AArch64] Generate zeroing forms of certain SVE2.2 instructions (3/11) (#116829) 6f285d3 [AArch64] Generate zeroing forms of certain SVE2.2 instructions (2/11) (#116828) 2b3266c [AArch64] Generate zeroing forms of certain SVE2.2 instructions (1/11) (#116259)
…ons with an all-true predicate (#120595)
When the predicate of a destructive operation is known to be all-true,
for example
fabs z0.s, p0/m, z1.s
then the entire output register is written and we can use a zeroing
(instead of a merging) form of the instruction, for example
fabs z0.s, p0/z, z1.s
thus eliminate the dependency on the input-output destination register
without the need to insert a `movprfx`.
This patch complements (and in the case of
llvm/llvm-project@2b3266c,
fixes a regression) the following:
llvm/llvm-project@7f4414b
[AArch64] Generate zeroing forms of certain SVE2.2 instructions (4/11)
(llvm/llvm-project#116830)
llvm/llvm-project@2474cf7
[AArch64] Generate zeroing forms of certain SVE2.2 instructions (3/11)
(llvm/llvm-project#116829)
llvm/llvm-project@6f285d3
[AArch64] Generate zeroing forms of certain SVE2.2 instructions (2/11)
(llvm/llvm-project#116828)
llvm/llvm-project@2b3266c
[AArch64] Generate zeroing forms of certain SVE2.2 instructions (1/11)
(llvm/llvm-project#116259)
SVE2.2 introduces instructions with predicated forms with zeroing of the inactive lanes. This allows in some cases to save a
movprfxor amovinstruction when emitting code for_xor_zvariants of intrinsics.This patch adds support for emitting the zeroing forms of
ABS,NEG,FABS, andFNEGinstructions.