[AArch64] Use sve instructions for fixed-width smulh/umulh. #166168
Conversation
Force-pushed a7cb9f5 to 379e408.
@llvm/pr-subscribers-backend-aarch64

Author: David Green (davemgreen)

Changes

Like v2i64 mul and operations like divide, we should be able to use the SVE umulh and smulh instructions with 128-bit vectors, providing that we have SVE/SVE2. There are a number of other instructions that look like they should presumably be treated the same way.

Patch is 22.85 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/166168.diff

3 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a83185d6ade20..f9b86a07fe6e3 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1841,6 +1841,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
setOperationAction(ISD::SDIV, VT, Custom);
setOperationAction(ISD::UDIV, VT, Custom);
+ setOperationAction(ISD::MULHS, VT, Custom);
+ setOperationAction(ISD::MULHU, VT, Custom);
}
// NEON doesn't support 64-bit vector integer muls, but SVE does.
@@ -1877,10 +1879,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
- setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
- setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
- setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
- setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
diff --git a/llvm/test/CodeGen/AArch64/sve-int-mulh-pred.ll b/llvm/test/CodeGen/AArch64/sve-int-mulh-pred.ll
index 146720febf486..3c817e5ddbd82 100644
--- a/llvm/test/CodeGen/AArch64/sve-int-mulh-pred.ll
+++ b/llvm/test/CodeGen/AArch64/sve-int-mulh-pred.ll
@@ -127,9 +127,11 @@ define <vscale x 2 x i64> @umulh_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %
define <16 x i8> @smulh_v16i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: smulh_v16i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: smull2 v2.8h, v0.16b, v1.16b
-; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT: uzp2 v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: ptrue p0.b, vl16
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%1 = sext <16 x i8> %a to <16 x i16>
%2 = sext <16 x i8> %b to <16 x i16>
@@ -142,9 +144,11 @@ define <16 x i8> @smulh_v16i8(<16 x i8> %a, <16 x i8> %b) {
define <8 x i16> @smulh_v8i16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: smulh_v8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: smull2 v2.4s, v0.8h, v1.8h
-; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: ptrue p0.h, vl8
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%1 = sext <8 x i16> %a to <8 x i32>
%2 = sext <8 x i16> %b to <8 x i32>
@@ -157,9 +161,11 @@ define <8 x i16> @smulh_v8i16(<8 x i16> %a, <8 x i16> %b) {
define <4 x i32> @smulh_v4i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: smulh_v4i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s
-; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%1 = sext <4 x i32> %a to <4 x i64>
%2 = sext <4 x i32> %b to <4 x i64>
@@ -172,15 +178,11 @@ define <4 x i32> @smulh_v4i32(<4 x i32> %a, <4 x i32> %b) {
define <2 x i64> @smulh_v2i64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: smulh_v2i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, v0.d[1]
-; CHECK-NEXT: mov x9, v1.d[1]
-; CHECK-NEXT: fmov x10, d0
-; CHECK-NEXT: fmov x11, d1
-; CHECK-NEXT: smulh x10, x10, x11
-; CHECK-NEXT: smulh x8, x8, x9
-; CHECK-NEXT: fmov d0, x10
-; CHECK-NEXT: fmov d1, x8
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%1 = sext <2 x i64> %a to <2 x i128>
%2 = sext <2 x i64> %b to <2 x i128>
@@ -193,9 +195,11 @@ define <2 x i64> @smulh_v2i64(<2 x i64> %a, <2 x i64> %b) {
define <16 x i8> @umulh_v16i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: umulh_v16i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: umull2 v2.8h, v0.16b, v1.16b
-; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT: uzp2 v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: ptrue p0.b, vl16
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%1 = zext <16 x i8> %a to <16 x i16>
%2 = zext <16 x i8> %b to <16 x i16>
@@ -208,9 +212,11 @@ define <16 x i8> @umulh_v16i8(<16 x i8> %a, <16 x i8> %b) {
define <8 x i16> @umulh_v8i16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: umulh_v8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: umull2 v2.4s, v0.8h, v1.8h
-; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: ptrue p0.h, vl8
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%1 = zext <8 x i16> %a to <8 x i32>
%2 = zext <8 x i16> %b to <8 x i32>
@@ -223,9 +229,11 @@ define <8 x i16> @umulh_v8i16(<8 x i16> %a, <8 x i16> %b) {
define <4 x i32> @umulh_v4i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: umulh_v4i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s
-; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%1 = zext <4 x i32> %a to <4 x i64>
%2 = zext <4 x i32> %b to <4 x i64>
@@ -238,15 +246,11 @@ define <4 x i32> @umulh_v4i32(<4 x i32> %a, <4 x i32> %b) {
define <2 x i64> @umulh_v2i64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: umulh_v2i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, v0.d[1]
-; CHECK-NEXT: mov x9, v1.d[1]
-; CHECK-NEXT: fmov x10, d0
-; CHECK-NEXT: fmov x11, d1
-; CHECK-NEXT: umulh x10, x10, x11
-; CHECK-NEXT: umulh x8, x8, x9
-; CHECK-NEXT: fmov d0, x10
-; CHECK-NEXT: fmov d1, x8
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%1 = zext <2 x i64> %a to <2 x i128>
%2 = zext <2 x i64> %b to <2 x i128>
@@ -263,8 +267,11 @@ define <2 x i64> @umulh_v2i64(<2 x i64> %a, <2 x i64> %b) {
define <8 x i8> @smulh_v8i8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: smulh_v8i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT: shrn v0.8b, v0.8h, #8
+; CHECK-NEXT: ptrue p0.b, vl8
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%1 = sext <8 x i8> %a to <8 x i16>
%2 = sext <8 x i8> %b to <8 x i16>
@@ -277,8 +284,11 @@ define <8 x i8> @smulh_v8i8(<8 x i8> %a, <8 x i8> %b) {
define <4 x i16> @smulh_v4i16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: smulh_v4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT: shrn v0.4h, v0.4s, #16
+; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%1 = sext <4 x i16> %a to <4 x i32>
%2 = sext <4 x i16> %b to <4 x i32>
@@ -291,8 +301,11 @@ define <4 x i16> @smulh_v4i16(<4 x i16> %a, <4 x i16> %b) {
define <2 x i32> @smulh_v2i32(<2 x i32> %a, <2 x i32> %b) {
; CHECK-LABEL: smulh_v2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
-; CHECK-NEXT: shrn v0.2s, v0.2d, #32
+; CHECK-NEXT: ptrue p0.s, vl2
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%1 = sext <2 x i32> %a to <2 x i64>
%2 = sext <2 x i32> %b to <2 x i64>
@@ -305,12 +318,11 @@ define <2 x i32> @smulh_v2i32(<2 x i32> %a, <2 x i32> %b) {
define <1 x i64> @smulh_v1i64(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: smulh_v1i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: fmov x9, d1
-; CHECK-NEXT: smulh x8, x8, x9
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%1 = sext <1 x i64> %a to <1 x i128>
%2 = sext <1 x i64> %b to <1 x i128>
@@ -323,8 +335,11 @@ define <1 x i64> @smulh_v1i64(<1 x i64> %a, <1 x i64> %b) {
define <8 x i8> @umulh_v8i8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: umulh_v8i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT: shrn v0.8b, v0.8h, #8
+; CHECK-NEXT: ptrue p0.b, vl8
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%1 = zext <8 x i8> %a to <8 x i16>
%2 = zext <8 x i8> %b to <8 x i16>
@@ -337,8 +352,11 @@ define <8 x i8> @umulh_v8i8(<8 x i8> %a, <8 x i8> %b) {
define <4 x i16> @umulh_v4i16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: umulh_v4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT: shrn v0.4h, v0.4s, #16
+; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%1 = zext <4 x i16> %a to <4 x i32>
%2 = zext <4 x i16> %b to <4 x i32>
@@ -351,8 +369,11 @@ define <4 x i16> @umulh_v4i16(<4 x i16> %a, <4 x i16> %b) {
define <2 x i32> @umulh_v2i32(<2 x i32> %a, <2 x i32> %b) {
; CHECK-LABEL: umulh_v2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
-; CHECK-NEXT: shrn v0.2s, v0.2d, #32
+; CHECK-NEXT: ptrue p0.s, vl2
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%1 = zext <2 x i32> %a to <2 x i64>
%2 = zext <2 x i32> %b to <2 x i64>
@@ -365,12 +386,11 @@ define <2 x i32> @umulh_v2i32(<2 x i32> %a, <2 x i32> %b) {
define <1 x i64> @umulh_v1i64(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: umulh_v1i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: fmov x9, d1
-; CHECK-NEXT: umulh x8, x8, x9
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%1 = zext <1 x i64> %a to <1 x i128>
%2 = zext <1 x i64> %b to <1 x i128>
diff --git a/llvm/test/CodeGen/AArch64/sve2-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve2-int-mulh.ll
index d7534712b53a0..917d8e6ec22ef 100644
--- a/llvm/test/CodeGen/AArch64/sve2-int-mulh.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-int-mulh.ll
@@ -119,9 +119,10 @@ define <vscale x 2 x i64> @umulh_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %
define <16 x i8> @smulh_v16i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: smulh_v16i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: smull2 v2.8h, v0.16b, v1.16b
-; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT: uzp2 v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: smulh z0.b, z0.b, z1.b
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%1 = sext <16 x i8> %a to <16 x i16>
%2 = sext <16 x i8> %b to <16 x i16>
@@ -134,9 +135,10 @@ define <16 x i8> @smulh_v16i8(<16 x i8> %a, <16 x i8> %b) {
define <8 x i16> @smulh_v8i16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: smulh_v8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: smull2 v2.4s, v0.8h, v1.8h
-; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: smulh z0.h, z0.h, z1.h
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%1 = sext <8 x i16> %a to <8 x i32>
%2 = sext <8 x i16> %b to <8 x i32>
@@ -149,9 +151,10 @@ define <8 x i16> @smulh_v8i16(<8 x i16> %a, <8 x i16> %b) {
define <4 x i32> @smulh_v4i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: smulh_v4i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s
-; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: smulh z0.s, z0.s, z1.s
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%1 = sext <4 x i32> %a to <4 x i64>
%2 = sext <4 x i32> %b to <4 x i64>
@@ -164,15 +167,10 @@ define <4 x i32> @smulh_v4i32(<4 x i32> %a, <4 x i32> %b) {
define <2 x i64> @smulh_v2i64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: smulh_v2i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, v0.d[1]
-; CHECK-NEXT: mov x9, v1.d[1]
-; CHECK-NEXT: fmov x10, d0
-; CHECK-NEXT: fmov x11, d1
-; CHECK-NEXT: smulh x10, x10, x11
-; CHECK-NEXT: smulh x8, x8, x9
-; CHECK-NEXT: fmov d0, x10
-; CHECK-NEXT: fmov d1, x8
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: smulh z0.d, z0.d, z1.d
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%1 = sext <2 x i64> %a to <2 x i128>
%2 = sext <2 x i64> %b to <2 x i128>
@@ -185,9 +183,10 @@ define <2 x i64> @smulh_v2i64(<2 x i64> %a, <2 x i64> %b) {
define <16 x i8> @umulh_v16i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: umulh_v16i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: umull2 v2.8h, v0.16b, v1.16b
-; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT: uzp2 v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: umulh z0.b, z0.b, z1.b
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%1 = zext <16 x i8> %a to <16 x i16>
%2 = zext <16 x i8> %b to <16 x i16>
@@ -200,9 +199,10 @@ define <16 x i8> @umulh_v16i8(<16 x i8> %a, <16 x i8> %b) {
define <8 x i16> @umulh_v8i16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: umulh_v8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: umull2 v2.4s, v0.8h, v1.8h
-; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: umulh z0.h, z0.h, z1.h
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%1 = zext <8 x i16> %a to <8 x i32>
%2 = zext <8 x i16> %b to <8 x i32>
@@ -215,9 +215,10 @@ define <8 x i16> @umulh_v8i16(<8 x i16> %a, <8 x i16> %b) {
define <4 x i32> @umulh_v4i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: umulh_v4i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s
-; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: umulh z0.s, z0.s, z1.s
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%1 = zext <4 x i32> %a to <4 x i64>
%2 = zext <4 x i32> %b to <4 x i64>
@@ -230,15 +231,10 @@ define <4 x i32> @umulh_v4i32(<4 x i32> %a, <4 x i32> %b) {
define <2 x i64> @umulh_v2i64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: umulh_v2i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, v0.d[1]
-; CHECK-NEXT: mov x9, v1.d[1]
-; CHECK-NEXT: fmov x10, d0
-; CHECK-NEXT: fmov x11, d1
-; CHECK-NEXT: umulh x10, x10, x11
-; CHECK-NEXT: umulh x8, x8, x9
-; CHECK-NEXT: fmov d0, x10
-; CHECK-NEXT: fmov d1, x8
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: umulh z0.d, z0.d, z1.d
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%1 = zext <2 x i64> %a to <2 x i128>
%2 = zext <2 x i64> %b to <2 x i128>
@@ -255,8 +251,10 @@ define <2 x i64> @umulh_v2i64(<2 x i64> %a, <2 x i64> %b) {
define <8 x i8> @smulh_v8i8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: smulh_v8i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT: shrn v0.8b, v0.8h, #8
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: smulh z0.b, z0.b, z1.b
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%1 = sext <8 x i8> %a to <8 x i16>
%2 = sext <8 x i8> %b to <8 x i16>
@@ -269,8 +267,10 @@ define <8 x i8> @smulh_v8i8(<8 x i8> %a, <8 x i8> %b) {
define <4 x i16> @smulh_v4i16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: smulh_v4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT: shrn v0.4h, v0.4s, #16
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: smulh z0.h, z0.h, z1.h
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%1 = sext <4 x i16> %a to <4 x i32>
%2 = sext <4 x i16> %b to <4 x i32>
@@ -283,8 +283,10 @@ define <4 x i16> @smulh_v4i16(<4 x i16> %a, <4 x i16> %b) {
define <2 x i32> @smulh_v2i32(<2 x i32> %a, <2 x i32> %b) {
; CHECK-LABEL: smulh_v2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
-; CHECK-NEXT: shrn v0.2s, v0.2d, #32
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: smulh z0.s, z0.s, z1.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%1 = sext <2 x i32> %a to <2 x i64>
%2 = sext <2 x i32> %b to <2 x i64>
@@ -297,12 +299,10 @@ define <2 x i32> @smulh_v2i32(<2 x i32> %a, <2 x i32> %b) {
define <1 x i64> @smulh_v1i64(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: smulh_v1i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-;...
[truncated]
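The listing above is truncated; the full diff is available at the .diff link in the comment above. For context, these .ll files are verified by running llc and matching the CHECK lines with FileCheck; the lines below are an assumed sketch of that setup, not necessarily the exact RUN lines in the files:

; llvm/test/CodeGen/AArch64/sve-int-mulh-pred.ll (assumed flags):
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
; llvm/test/CodeGen/AArch64/sve2-int-mulh.ll (assumed flags):
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s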
Along with #170283, this seems to now do OK.
Force-pushed 379e408 to fe8165c.
Like v2i64 mul and operations like divide, we should be able to use the SVE umulh and smulh instructions with 128-bit vectors, providing that we have SVE/SVE2. There are a number of other instructions that look like they should presumably be treated the same way.
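For reference, a minimal sketch of the fixed-width IR pattern these tests exercise (the function name is illustrative; the mul/lshr/trunc tail is implied by the truncated test bodies above). The DAG combiner folds the sext/mul/lshr/trunc sequence into an ISD::MULHS node, which this patch now lowers to the SVE smulh (predicated for SVE, unpredicated for SVE2) instead of scalarising or expanding via NEON widening multiplies:

define <2 x i64> @smulh_v2i64_sketch(<2 x i64> %a, <2 x i64> %b) {
  ; Sign-extend both operands to twice the element width.
  %1 = sext <2 x i64> %a to <2 x i128>
  %2 = sext <2 x i64> %b to <2 x i128>
  ; Full-width multiply, then keep the high 64 bits of each lane.
  %3 = mul <2 x i128> %1, %2
  %4 = lshr <2 x i128> %3, <i128 64, i128 64>
  %5 = trunc <2 x i128> %4 to <2 x i64>
  ret <2 x i64> %5
}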
Force-pushed fe8165c to 600adef.
david-arm left a comment:
LGTM!
Thanks.