[AArch64] Enable using SVE2 bit-sel instructions with Neon types. #146906
Conversation
@llvm/pr-subscribers-backend-aarch64

Author: Ricardo Jesus (rj-jesus)

Changes:
This affects EOR3/BCAX/BSL/NBSL/BSL1N/BSL2N. This had initially been discussed in #138689 (comment).

Patch is 22.04 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/146906.diff

5 Files Affected:
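As a rough illustration of what the patch enables (a sketch mirroring the nbsl_v2i64 test added below, with a made-up function name; not output copied from the compiler), fixed-width IR of this shape can now select a single SVE2 NBSL under -mattr=+sve2:

; Computes ~((x & z) | (y & ~z)) on a Neon-width vector; with +sve2 this
; can lower to one NBSL (plus register copies) instead of BIF + MVN.
define <2 x i64> @nbsl_example(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z) {
  %a    = and <2 x i64> %z, %x                  ; x & z
  %notz = xor <2 x i64> %z, splat (i64 -1)      ; ~z
  %b    = and <2 x i64> %y, %notz               ; y & ~z
  %sel  = or  <2 x i64> %a, %b                  ; bitwise select
  %res  = xor <2 x i64> %sel, splat (i64 -1)    ; final inversion
  ret <2 x i64> %res
}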
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 848b1c30bbeb5..d5c12a9658113 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -5368,6 +5368,19 @@ multiclass sve2_int_bitwise_ternary_op<bits<3> opc, string asm,
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME)>;
+
+ // Allow selecting SVE2 ternary ops with Neon types.
+ foreach VT = [nxv16i8, nxv8i16, nxv4i32, nxv2i64] in {
+ def : Pat<(SVEType<VT>.DSub (op V64:$op1, V64:$op2, V64:$op3)),
+ (EXTRACT_SUBREG (!cast<Instruction>(NAME) (INSERT_SUBREG (IMPLICIT_DEF), $op1, dsub),
+ (INSERT_SUBREG (IMPLICIT_DEF), $op2, dsub),
+ (INSERT_SUBREG (IMPLICIT_DEF), $op3, dsub)), dsub)>;
+
+ def : Pat<(SVEType<VT>.ZSub (op V128:$op1, V128:$op2, V128:$op3)),
+ (EXTRACT_SUBREG (!cast<Instruction>(NAME) (INSERT_SUBREG (IMPLICIT_DEF), $op1, zsub),
+ (INSERT_SUBREG (IMPLICIT_DEF), $op2, zsub),
+ (INSERT_SUBREG (IMPLICIT_DEF), $op3, zsub)), zsub)>;
+ }
}
class sve2_int_rotate_right_imm<bits<4> tsz8_64, string asm,
diff --git a/llvm/test/CodeGen/AArch64/bcax.ll b/llvm/test/CodeGen/AArch64/bcax.ll
index e3c73c36e534b..e4eb608c2545f 100644
--- a/llvm/test/CodeGen/AArch64/bcax.ll
+++ b/llvm/test/CodeGen/AArch64/bcax.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub
; RUN: llc -mtriple=aarch64 -mattr=+sha3 < %s | FileCheck --check-prefix=SHA3 %s
; RUN: llc -mtriple=aarch64 -mattr=-sha3 < %s | FileCheck --check-prefix=NOSHA3 %s
+; RUN: llc -mtriple=aarch64 -mattr=+sve2 < %s | FileCheck --check-prefix=SVE2 %s
define <2 x i64> @bcax_64x2(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2) {
; SHA3-LABEL: bcax_64x2:
@@ -13,6 +14,15 @@ define <2 x i64> @bcax_64x2(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2) {
; NOSHA3-NEXT: bic v0.16b, v0.16b, v1.16b
; NOSHA3-NEXT: eor v0.16b, v0.16b, v2.16b
; NOSHA3-NEXT: ret
+;
+; SVE2-LABEL: bcax_64x2:
+; SVE2: // %bb.0:
+; SVE2-NEXT: // kill: def $q2 killed $q2 def $z2
+; SVE2-NEXT: // kill: def $q1 killed $q1 def $z1
+; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT: bcax z2.d, z2.d, z0.d, z1.d
+; SVE2-NEXT: mov v0.16b, v2.16b
+; SVE2-NEXT: ret
%4 = xor <2 x i64> %1, <i64 -1, i64 -1>
%5 = and <2 x i64> %4, %0
%6 = xor <2 x i64> %5, %2
@@ -30,6 +40,15 @@ define <4 x i32> @bcax_32x4(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2) {
; NOSHA3-NEXT: bic v0.16b, v0.16b, v1.16b
; NOSHA3-NEXT: eor v0.16b, v0.16b, v2.16b
; NOSHA3-NEXT: ret
+;
+; SVE2-LABEL: bcax_32x4:
+; SVE2: // %bb.0:
+; SVE2-NEXT: // kill: def $q2 killed $q2 def $z2
+; SVE2-NEXT: // kill: def $q1 killed $q1 def $z1
+; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT: bcax z2.d, z2.d, z0.d, z1.d
+; SVE2-NEXT: mov v0.16b, v2.16b
+; SVE2-NEXT: ret
%4 = xor <4 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1>
%5 = and <4 x i32> %4, %0
%6 = xor <4 x i32> %5, %2
@@ -47,6 +66,15 @@ define <8 x i16> @bcax_16x8(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2) {
; NOSHA3-NEXT: bic v0.16b, v0.16b, v1.16b
; NOSHA3-NEXT: eor v0.16b, v0.16b, v2.16b
; NOSHA3-NEXT: ret
+;
+; SVE2-LABEL: bcax_16x8:
+; SVE2: // %bb.0:
+; SVE2-NEXT: // kill: def $q2 killed $q2 def $z2
+; SVE2-NEXT: // kill: def $q1 killed $q1 def $z1
+; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT: bcax z2.d, z2.d, z0.d, z1.d
+; SVE2-NEXT: mov v0.16b, v2.16b
+; SVE2-NEXT: ret
%4 = xor <8 x i16> %1, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
%5 = and <8 x i16> %4, %0
%6 = xor <8 x i16> %5, %2
@@ -64,6 +92,15 @@ define <16 x i8> @bcax_8x16(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2) {
; NOSHA3-NEXT: bic v0.16b, v0.16b, v1.16b
; NOSHA3-NEXT: eor v0.16b, v0.16b, v2.16b
; NOSHA3-NEXT: ret
+;
+; SVE2-LABEL: bcax_8x16:
+; SVE2: // %bb.0:
+; SVE2-NEXT: // kill: def $q2 killed $q2 def $z2
+; SVE2-NEXT: // kill: def $q1 killed $q1 def $z1
+; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT: bcax z2.d, z2.d, z0.d, z1.d
+; SVE2-NEXT: mov v0.16b, v2.16b
+; SVE2-NEXT: ret
%4 = xor <16 x i8> %1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
%5 = and <16 x i8> %4, %0
%6 = xor <16 x i8> %5, %2
diff --git a/llvm/test/CodeGen/AArch64/bsl.ll b/llvm/test/CodeGen/AArch64/bsl.ll
new file mode 100644
index 0000000000000..b672a446e579e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/bsl.ll
@@ -0,0 +1,325 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=NEON
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 < %s | FileCheck %s --check-prefix=SVE2
+
+; Test SVE2 BSL/NBSL/BSL1N/BSL2N code generation for:
+; #define BSL(x,y,z) ( ((x) & (z)) | ( (y) & ~(z)))
+; #define NBSL(x,y,z) (~(((x) & (z)) | ( (y) & ~(z))))
+; #define BSL1N(x,y,z) ( (~(x) & (z)) | ( (y) & ~(z)))
+; #define BSL2N(x,y,z) ( ((x) & (z)) | (~(y) & ~(z)))
+;
+; See also llvm/test/CodeGen/AArch64/sve2-bsl.ll.
+
+; Test basic codegen.
+
+define <1 x i64> @bsl_v1i64(<1 x i64> %0, <1 x i64> %1, <1 x i64> %2) {
+; NEON-LABEL: bsl_v1i64:
+; NEON: // %bb.0:
+; NEON-NEXT: bif v0.8b, v1.8b, v2.8b
+; NEON-NEXT: ret
+;
+; SVE2-LABEL: bsl_v1i64:
+; SVE2: // %bb.0:
+; SVE2-NEXT: bif v0.8b, v1.8b, v2.8b
+; SVE2-NEXT: ret
+ %4 = and <1 x i64> %2, %0
+ %5 = xor <1 x i64> %2, splat (i64 -1)
+ %6 = and <1 x i64> %1, %5
+ %7 = or <1 x i64> %4, %6
+ ret <1 x i64> %7
+}
+
+define <1 x i64> @nbsl_v1i64(<1 x i64> %0, <1 x i64> %1, <1 x i64> %2) {
+; NEON-LABEL: nbsl_v1i64:
+; NEON: // %bb.0:
+; NEON-NEXT: bif v0.8b, v1.8b, v2.8b
+; NEON-NEXT: mvn v0.8b, v0.8b
+; NEON-NEXT: ret
+;
+; SVE2-LABEL: nbsl_v1i64:
+; SVE2: // %bb.0:
+; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0
+; SVE2-NEXT: // kill: def $d2 killed $d2 def $z2
+; SVE2-NEXT: // kill: def $d1 killed $d1 def $z1
+; SVE2-NEXT: nbsl z0.d, z0.d, z1.d, z2.d
+; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0
+; SVE2-NEXT: ret
+ %4 = and <1 x i64> %2, %0
+ %5 = xor <1 x i64> %2, splat (i64 -1)
+ %6 = and <1 x i64> %1, %5
+ %7 = or <1 x i64> %4, %6
+ %8 = xor <1 x i64> %7, splat (i64 -1)
+ ret <1 x i64> %8
+}
+
+define <1 x i64> @bsl1n_v1i64(<1 x i64> %0, <1 x i64> %1, <1 x i64> %2) {
+; NEON-LABEL: bsl1n_v1i64:
+; NEON: // %bb.0:
+; NEON-NEXT: mvn v0.8b, v0.8b
+; NEON-NEXT: bif v0.8b, v1.8b, v2.8b
+; NEON-NEXT: ret
+;
+; SVE2-LABEL: bsl1n_v1i64:
+; SVE2: // %bb.0:
+; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0
+; SVE2-NEXT: // kill: def $d2 killed $d2 def $z2
+; SVE2-NEXT: // kill: def $d1 killed $d1 def $z1
+; SVE2-NEXT: bsl1n z0.d, z0.d, z1.d, z2.d
+; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0
+; SVE2-NEXT: ret
+ %4 = xor <1 x i64> %0, splat (i64 -1)
+ %5 = and <1 x i64> %2, %4
+ %6 = xor <1 x i64> %2, splat (i64 -1)
+ %7 = and <1 x i64> %1, %6
+ %8 = or <1 x i64> %5, %7
+ ret <1 x i64> %8
+}
+
+define <1 x i64> @bsl2n_v1i64(<1 x i64> %0, <1 x i64> %1, <1 x i64> %2) {
+; NEON-LABEL: bsl2n_v1i64:
+; NEON: // %bb.0:
+; NEON-NEXT: and v0.8b, v2.8b, v0.8b
+; NEON-NEXT: orr v1.8b, v2.8b, v1.8b
+; NEON-NEXT: orn v0.8b, v0.8b, v1.8b
+; NEON-NEXT: ret
+;
+; SVE2-LABEL: bsl2n_v1i64:
+; SVE2: // %bb.0:
+; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0
+; SVE2-NEXT: // kill: def $d2 killed $d2 def $z2
+; SVE2-NEXT: // kill: def $d1 killed $d1 def $z1
+; SVE2-NEXT: bsl2n z0.d, z0.d, z1.d, z2.d
+; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0
+; SVE2-NEXT: ret
+ %4 = and <1 x i64> %2, %0
+ %5 = or <1 x i64> %2, %1
+ %6 = xor <1 x i64> %5, splat (i64 -1)
+ %7 = or <1 x i64> %4, %6
+ ret <1 x i64> %7
+}
+
+define <2 x i64> @bsl_v2i64(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2) {
+; NEON-LABEL: bsl_v2i64:
+; NEON: // %bb.0:
+; NEON-NEXT: bif v0.16b, v1.16b, v2.16b
+; NEON-NEXT: ret
+;
+; SVE2-LABEL: bsl_v2i64:
+; SVE2: // %bb.0:
+; SVE2-NEXT: bif v0.16b, v1.16b, v2.16b
+; SVE2-NEXT: ret
+ %4 = and <2 x i64> %2, %0
+ %5 = xor <2 x i64> %2, splat (i64 -1)
+ %6 = and <2 x i64> %1, %5
+ %7 = or <2 x i64> %4, %6
+ ret <2 x i64> %7
+}
+
+define <2 x i64> @nbsl_v2i64(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2) {
+; NEON-LABEL: nbsl_v2i64:
+; NEON: // %bb.0:
+; NEON-NEXT: bif v0.16b, v1.16b, v2.16b
+; NEON-NEXT: mvn v0.16b, v0.16b
+; NEON-NEXT: ret
+;
+; SVE2-LABEL: nbsl_v2i64:
+; SVE2: // %bb.0:
+; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT: // kill: def $q2 killed $q2 def $z2
+; SVE2-NEXT: // kill: def $q1 killed $q1 def $z1
+; SVE2-NEXT: nbsl z0.d, z0.d, z1.d, z2.d
+; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0
+; SVE2-NEXT: ret
+ %4 = and <2 x i64> %2, %0
+ %5 = xor <2 x i64> %2, splat (i64 -1)
+ %6 = and <2 x i64> %1, %5
+ %7 = or <2 x i64> %4, %6
+ %8 = xor <2 x i64> %7, splat (i64 -1)
+ ret <2 x i64> %8
+}
+
+define <2 x i64> @bsl1n_v2i64(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2) {
+; NEON-LABEL: bsl1n_v2i64:
+; NEON: // %bb.0:
+; NEON-NEXT: mvn v0.16b, v0.16b
+; NEON-NEXT: bif v0.16b, v1.16b, v2.16b
+; NEON-NEXT: ret
+;
+; SVE2-LABEL: bsl1n_v2i64:
+; SVE2: // %bb.0:
+; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT: // kill: def $q2 killed $q2 def $z2
+; SVE2-NEXT: // kill: def $q1 killed $q1 def $z1
+; SVE2-NEXT: bsl1n z0.d, z0.d, z1.d, z2.d
+; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0
+; SVE2-NEXT: ret
+ %4 = xor <2 x i64> %0, splat (i64 -1)
+ %5 = and <2 x i64> %2, %4
+ %6 = xor <2 x i64> %2, splat (i64 -1)
+ %7 = and <2 x i64> %1, %6
+ %8 = or <2 x i64> %5, %7
+ ret <2 x i64> %8
+}
+
+define <2 x i64> @bsl2n_v2i64(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2) {
+; NEON-LABEL: bsl2n_v2i64:
+; NEON: // %bb.0:
+; NEON-NEXT: and v0.16b, v2.16b, v0.16b
+; NEON-NEXT: orr v1.16b, v2.16b, v1.16b
+; NEON-NEXT: orn v0.16b, v0.16b, v1.16b
+; NEON-NEXT: ret
+;
+; SVE2-LABEL: bsl2n_v2i64:
+; SVE2: // %bb.0:
+; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT: // kill: def $q2 killed $q2 def $z2
+; SVE2-NEXT: // kill: def $q1 killed $q1 def $z1
+; SVE2-NEXT: bsl2n z0.d, z0.d, z1.d, z2.d
+; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0
+; SVE2-NEXT: ret
+ %4 = and <2 x i64> %2, %0
+ %5 = or <2 x i64> %2, %1
+ %6 = xor <2 x i64> %5, splat (i64 -1)
+ %7 = or <2 x i64> %4, %6
+ ret <2 x i64> %7
+}
+
+; Test other element types.
+
+define <8 x i8> @nbsl_v8i8(<8 x i8> %0, <8 x i8> %1, <8 x i8> %2) {
+; NEON-LABEL: nbsl_v8i8:
+; NEON: // %bb.0:
+; NEON-NEXT: bif v0.8b, v1.8b, v2.8b
+; NEON-NEXT: mvn v0.8b, v0.8b
+; NEON-NEXT: ret
+;
+; SVE2-LABEL: nbsl_v8i8:
+; SVE2: // %bb.0:
+; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0
+; SVE2-NEXT: // kill: def $d2 killed $d2 def $z2
+; SVE2-NEXT: // kill: def $d1 killed $d1 def $z1
+; SVE2-NEXT: nbsl z0.d, z0.d, z1.d, z2.d
+; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0
+; SVE2-NEXT: ret
+ %4 = and <8 x i8> %2, %0
+ %5 = xor <8 x i8> %2, splat (i8 -1)
+ %6 = and <8 x i8> %1, %5
+ %7 = or <8 x i8> %4, %6
+ %8 = xor <8 x i8> %7, splat (i8 -1)
+ ret <8 x i8> %8
+}
+
+define <4 x i16> @nbsl_v4i16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2) {
+; NEON-LABEL: nbsl_v4i16:
+; NEON: // %bb.0:
+; NEON-NEXT: bif v0.8b, v1.8b, v2.8b
+; NEON-NEXT: mvn v0.8b, v0.8b
+; NEON-NEXT: ret
+;
+; SVE2-LABEL: nbsl_v4i16:
+; SVE2: // %bb.0:
+; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0
+; SVE2-NEXT: // kill: def $d2 killed $d2 def $z2
+; SVE2-NEXT: // kill: def $d1 killed $d1 def $z1
+; SVE2-NEXT: nbsl z0.d, z0.d, z1.d, z2.d
+; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0
+; SVE2-NEXT: ret
+ %4 = and <4 x i16> %2, %0
+ %5 = xor <4 x i16> %2, splat (i16 -1)
+ %6 = and <4 x i16> %1, %5
+ %7 = or <4 x i16> %4, %6
+ %8 = xor <4 x i16> %7, splat (i16 -1)
+ ret <4 x i16> %8
+}
+
+define <2 x i32> @nbsl_v2i32(<2 x i32> %0, <2 x i32> %1, <2 x i32> %2) {
+; NEON-LABEL: nbsl_v2i32:
+; NEON: // %bb.0:
+; NEON-NEXT: bif v0.8b, v1.8b, v2.8b
+; NEON-NEXT: mvn v0.8b, v0.8b
+; NEON-NEXT: ret
+;
+; SVE2-LABEL: nbsl_v2i32:
+; SVE2: // %bb.0:
+; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0
+; SVE2-NEXT: // kill: def $d2 killed $d2 def $z2
+; SVE2-NEXT: // kill: def $d1 killed $d1 def $z1
+; SVE2-NEXT: nbsl z0.d, z0.d, z1.d, z2.d
+; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0
+; SVE2-NEXT: ret
+ %4 = and <2 x i32> %2, %0
+ %5 = xor <2 x i32> %2, splat (i32 -1)
+ %6 = and <2 x i32> %1, %5
+ %7 = or <2 x i32> %4, %6
+ %8 = xor <2 x i32> %7, splat (i32 -1)
+ ret <2 x i32> %8
+}
+
+define <16 x i8> @nbsl_v16i8(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2) {
+; NEON-LABEL: nbsl_v16i8:
+; NEON: // %bb.0:
+; NEON-NEXT: bif v0.16b, v1.16b, v2.16b
+; NEON-NEXT: mvn v0.16b, v0.16b
+; NEON-NEXT: ret
+;
+; SVE2-LABEL: nbsl_v16i8:
+; SVE2: // %bb.0:
+; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT: // kill: def $q2 killed $q2 def $z2
+; SVE2-NEXT: // kill: def $q1 killed $q1 def $z1
+; SVE2-NEXT: nbsl z0.d, z0.d, z1.d, z2.d
+; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0
+; SVE2-NEXT: ret
+ %4 = and <16 x i8> %2, %0
+ %5 = xor <16 x i8> %2, splat (i8 -1)
+ %6 = and <16 x i8> %1, %5
+ %7 = or <16 x i8> %4, %6
+ %8 = xor <16 x i8> %7, splat (i8 -1)
+ ret <16 x i8> %8
+}
+
+define <8 x i16> @nbsl_v8i16(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2) {
+; NEON-LABEL: nbsl_v8i16:
+; NEON: // %bb.0:
+; NEON-NEXT: bif v0.16b, v1.16b, v2.16b
+; NEON-NEXT: mvn v0.16b, v0.16b
+; NEON-NEXT: ret
+;
+; SVE2-LABEL: nbsl_v8i16:
+; SVE2: // %bb.0:
+; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT: // kill: def $q2 killed $q2 def $z2
+; SVE2-NEXT: // kill: def $q1 killed $q1 def $z1
+; SVE2-NEXT: nbsl z0.d, z0.d, z1.d, z2.d
+; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0
+; SVE2-NEXT: ret
+ %4 = and <8 x i16> %2, %0
+ %5 = xor <8 x i16> %2, splat (i16 -1)
+ %6 = and <8 x i16> %1, %5
+ %7 = or <8 x i16> %4, %6
+ %8 = xor <8 x i16> %7, splat (i16 -1)
+ ret <8 x i16> %8
+}
+
+define <4 x i32> @nbsl_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2) {
+; NEON-LABEL: nbsl_v4i32:
+; NEON: // %bb.0:
+; NEON-NEXT: bif v0.16b, v1.16b, v2.16b
+; NEON-NEXT: mvn v0.16b, v0.16b
+; NEON-NEXT: ret
+;
+; SVE2-LABEL: nbsl_v4i32:
+; SVE2: // %bb.0:
+; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT: // kill: def $q2 killed $q2 def $z2
+; SVE2-NEXT: // kill: def $q1 killed $q1 def $z1
+; SVE2-NEXT: nbsl z0.d, z0.d, z1.d, z2.d
+; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0
+; SVE2-NEXT: ret
+ %4 = and <4 x i32> %2, %0
+ %5 = xor <4 x i32> %2, splat (i32 -1)
+ %6 = and <4 x i32> %1, %5
+ %7 = or <4 x i32> %4, %6
+ %8 = xor <4 x i32> %7, splat (i32 -1)
+ ret <4 x i32> %8
+}
diff --git a/llvm/test/CodeGen/AArch64/eor3.ll b/llvm/test/CodeGen/AArch64/eor3.ll
index 06ae6b09d002e..a83b425251c3e 100644
--- a/llvm/test/CodeGen/AArch64/eor3.ll
+++ b/llvm/test/CodeGen/AArch64/eor3.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub
; RUN: llc -mtriple=aarch64 -mattr=+sha3 < %s | FileCheck --check-prefix=SHA3 %s
; RUN: llc -mtriple=aarch64 -mattr=-sha3 < %s | FileCheck --check-prefix=NOSHA3 %s
+; RUN: llc -mtriple=aarch64 -mattr=+sve2 < %s | FileCheck --check-prefix=SVE2 %s
define <16 x i8> @eor3_16x8_left(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2) {
; SHA3-LABEL: eor3_16x8_left:
@@ -13,6 +14,15 @@ define <16 x i8> @eor3_16x8_left(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2) {
; NOSHA3-NEXT: eor v0.16b, v0.16b, v1.16b
; NOSHA3-NEXT: eor v0.16b, v2.16b, v0.16b
; NOSHA3-NEXT: ret
+;
+; SVE2-LABEL: eor3_16x8_left:
+; SVE2: // %bb.0:
+; SVE2-NEXT: // kill: def $q2 killed $q2 def $z2
+; SVE2-NEXT: // kill: def $q1 killed $q1 def $z1
+; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT: eor3 z2.d, z2.d, z0.d, z1.d
+; SVE2-NEXT: mov v0.16b, v2.16b
+; SVE2-NEXT: ret
%4 = xor <16 x i8> %0, %1
%5 = xor <16 x i8> %2, %4
ret <16 x i8> %5
@@ -29,6 +39,15 @@ define <16 x i8> @eor3_16x8_right(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2) {
; NOSHA3-NEXT: eor v1.16b, v1.16b, v2.16b
; NOSHA3-NEXT: eor v0.16b, v1.16b, v0.16b
; NOSHA3-NEXT: ret
+;
+; SVE2-LABEL: eor3_16x8_right:
+; SVE2: // %bb.0:
+; SVE2-NEXT: // kill: def $q1 killed $q1 def $z1
+; SVE2-NEXT: // kill: def $q2 killed $q2 def $z2
+; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT: eor3 z1.d, z1.d, z2.d, z0.d
+; SVE2-NEXT: mov v0.16b, v1.16b
+; SVE2-NEXT: ret
%4 = xor <16 x i8> %1, %2
%5 = xor <16 x i8> %4, %0
ret <16 x i8> %5
@@ -45,6 +64,15 @@ define <8 x i16> @eor3_8x16_left(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2) {
; NOSHA3-NEXT: eor v0.16b, v0.16b, v1.16b
; NOSHA3-NEXT: eor v0.16b, v2.16b, v0.16b
; NOSHA3-NEXT: ret
+;
+; SVE2-LABEL: eor3_8x16_left:
+; SVE2: // %bb.0:
+; SVE2-NEXT: // kill: def $q2 killed $q2 def $z2
+; SVE2-NEXT: // kill: def $q1 killed $q1 def $z1
+; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT: eor3 z2.d, z2.d, z0.d, z1.d
+; SVE2-NEXT: mov v0.16b, v2.16b
+; SVE2-NEXT: ret
%4 = xor <8 x i16> %0, %1
%5 = xor <8 x i16> %2, %4
ret <8 x i16> %5
@@ -61,6 +89,15 @@ define <8 x i16> @eor3_8x16_right(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2) {
; NOSHA3-NEXT: eor v1.16b, v1.16b, v2.16b
; NOSHA3-NEXT: eor v0.16b, v1.16b, v0.16b
; NOSHA3-NEXT: ret
+;
+; SVE2-LABEL: eor3_8x16_right:
+; SVE2: // %bb.0:
+; SVE2-NEXT: // kill: def $q1 killed $q1 def $z1
+; SVE2-NEXT: // kill: def $q2 killed $q2 def $z2
+; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT: eor3 z1.d, z1.d, z2.d, z0.d
+; SVE2-NEXT: mov v0.16b, v1.16b
+; SVE2-NEXT: ret
%4 = xor <8 x i16> %1, %2
%5 = xor <8 x i16> %4, %0
ret <8 x i16> %5
@@ -77,6 +114,15 @@ define <4 x i32> @eor3_4x32_left(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2) {
; NOSHA3-NEXT: eor v0.16b, v0.16b, v1.16b
; NOSHA3-NEXT: eor v0.16b, v2.16b, v0.16b
; NOSHA3-NEXT: ret
+;
+; SVE2-LABEL: eor3_4x32_left:
+; SVE2: // %bb.0:
+; SVE2-NEXT: // kill: def $q2 killed $q2 def $z2
+; SVE2-NEXT: // kill: def $q1 killed $q1 def $z1
+; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT: eor3 z2.d, z2.d, z0.d, z1.d
+; SVE2-NEXT: mov v0.16b, v2.16b
+; SVE2-NEXT: ret
%4 = xor <4 x i32> %0, %1
%5 = xor <4 x i32> %2, %4
ret <4 x i32> %5
@@ -93,6 +139,15 @@ define <4 x i32> @eor3_4x32_right(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2) {
; NOSHA3-NEXT: eor v1.16b, v1.16b, v2.16b
; NOSHA3-NEXT: eor v0.16b, v1.16b, v0.16b
; NOSHA3-NEXT: ret
+;
+; SVE2-LABEL: eor3_4x32_right:
+; SVE2: // %bb.0:
+; SVE2-NEXT: // kill: def $q1 killed $q1 def $z1
+; SVE2-NEXT: // kill: def $q2 killed $q2 def $z2
+; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT: eor3 z1.d, z1.d, z2.d, z0.d
+; SVE2-NEXT: mov v0.16b, v1.16b
+; SVE2-NEXT: ret
%4 = xor <4 x i32> %1, %2
%5 = xor <4 x i32> %4, %0
ret <4 x i32> %5
@@ -109,6 +164,15 @@ define <2 x i64> @eor3_2x64_left(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2) {
; NOSHA3-NEXT: eor v0.16b, v0.16b, v1.16b
; NOSHA3-NEXT: eor v0.16b, v2.16b, v0.16b
; NOSHA3-NEXT: ret
+;
+; SVE2-LABEL: eor3_2x64_left:
+; SVE2: // %bb.0:
+; SVE2-NEXT: // kill: def $q2 killed $q2 def $z2
+; SVE2-NEXT: // kill: def $q1 killed $q1 def $z1
+; SVE2-NEXT: // kill: def $q0 killed ...
[truncated]
Review thread on llvm/test/CodeGen/AArch64/bsl.ll (outdated):
Is it worth having at least one test in this file for fixed-width vectors that aren't 64 or 128 bits, e.g. <4 x i8> or <2 x i16>? I'd expect in this case they'd just get promoted to <4 x i16> or <2 x i32>.
Sure, I didn't do this initially because, as you say, those types will get promoted to something else, which should be unrelated to the patterns added in this patch. But I'm happy to add the tests if you think they're worth having!
I've added a few tests for <4 x i8>, please let me know if they're what you had in mind. :)
Oh I see. I was expecting that after type promotion the types would be legal (v4i8->v4i16 which is SVEType.DSub for VT=nxv8i16) and therefore start matching your isel patterns too, but perhaps the isel patterns are different due to masking? No worries, I guess the tests don't do any harm and maybe we can support them in future?
Yeah I think it's the masking that gets in the way, specifically after promoting v4i8->v4i16, the splat (i8 -1) is converted to a splat (i16 255), which no longer matches the main patterns defined in AArch64SVEInstrInfo. I don't think this is specific to the patterns defined in this patch, but it's certainly something we can revisit in the future. :)
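A sketch of that effect (illustrative IR with an assumed function name; the promotion itself happens on SelectionDAG nodes rather than in textual IR):

; After v4i8 -> v4i16 promotion, the inverted mask that started life as
; "xor <4 x i8> %z, splat (i8 -1)" effectively carries 255 per lane, which
; the NBSL/BSL1N patterns (matching an all-ones splat) no longer recognise.
define <4 x i16> @promoted_mask(<4 x i16> %z) {
  %notz = xor <4 x i16> %z, splat (i16 255)   ; not all-ones, so no match
  ret <4 x i16> %notz
}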
david-arm left a comment:
LGTM!
david-arm left a comment:
Actually, I just realised the tests are failing so I've temporarily removed my LGTM!
Erm I don't think the failures are related to the patch, let me try a rebase to see if that sorts it! |
This affects EOR3/BCAX/BSL/NBSL/BSL1N/BSL2N. https://godbolt.org/z/q96K5Ee53
(force-pushed from 79cc5fc to 385ab19)
david-arm left a comment:
LGTM! Looks like the linux build is passing now. :)
Review thread on llvm/lib/Target/AArch64/SVEInstrFormats.td:

  def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME)>;

  // Allow selecting SVE2 ternary ops with Neon types.
  foreach VT = [nxv16i8, nxv8i16, nxv4i32, nxv2i64] in {
Do you specifically care about the case when FEAT_SHA3 is not available? I ask because, when available, the NEON variants look to have less restrictive register requirements, especially for BCAX and EOR3, which have a dedicated result register.
If you just want to use the other instructions (bsl1n etc.), then a quick fix would be to pass the VT array into the class and only set the parameter for the instructions that are in addition to those available under FEAT_SHA3.
Thanks - passing VT into the class sounds good to me. I did check previously that the Neon BCAX/EOR3 instructions were selected over the SVE ones when FEAT_SHA3 is available (due to the less restrictive register requirements, as you point out). Since they were, I didn't see a reason not to enable the SVE2 patterns for them too.
Would you rather I do as you suggested and only enable the patterns for BSL1N/BSL2N/NBSL, or perhaps add a RUN line such as
RUN: llc -mtriple=aarch64 -mattr=+sha3,+sve2 < %s | FileCheck --check-prefix=SHA3 %s
in llvm/test/CodeGen/AArch64/bcax.ll and llvm/test/CodeGen/AArch64/eor3.ll to ensure we select the SHA3 patterns even when SVE2 is available?
If this works as is then just adding the RUN lines for verification works for me.
Thanks very much - done. :)
This affects EOR3/BCAX/BSL/NBSL/BSL1N/BSL2N.
This had initially been discussed in #138689 (comment).