[AArch64] Add intrinsics for 9.6 crypto instructions #165241

Lukacma · 2025-10-27T12:06:51Z

This patch add intrinsics for crpyto instructions defined in this ACLE proposal

llvmbot · 2025-10-27T12:07:36Z

@llvm/pr-subscribers-backend-aarch64

Author: None (Lukacma)

Changes

This patch add intrinsics for crpyto instructions defined in this ACLE proposal

Patch is 34.80 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/165241.diff

7 Files Affected:

(modified) clang/include/clang/Basic/arm_sve.td (+17)
(added) clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_crypto.c (+215)
(modified) clang/utils/TableGen/SveEmitter.cpp (+8-1)
(modified) llvm/include/llvm/IR/IntrinsicsAArch64.td (+27)
(modified) llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp (+53-11)
(modified) llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td (+2-2)
(added) llvm/test/CodeGen/AArch64/sve2p1-intrinsics-crypto.ll (+155)

diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index d2b7b78b9970f..605cf93d196ff 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1841,6 +1841,23 @@ def SVPMULLT_PAIR_U64   : SInst<"svpmullt_pair[_{d}]",   "ddd", "Ul", MergeNone,
 def SVPMULLT_PAIR_N_U64 : SInst<"svpmullt_pair[_n_{d}]", "dda", "Ul", MergeNone, "aarch64_sve_pmullt_pair", [VerifyRuntimeMode]>;
 }
 
+let SVETargetGuard = "sve-aes2", SMETargetGuard = "sve-aes2,ssve-aes" in {
+def SVAESD_X2   : SInst<"svaesd_laneq[_{d}_x2]",   "22di", "Uc", MergeNone, "aarch64_sve_aesd_laneq_x2", [IsOverloadNone, VerifyRuntimeMode]>;
+def SVAESIMC_X2 : SInst<"svaesdimc_laneq[_{d}_x2]", "22di",  "Uc", MergeNone, "aarch64_sve_aesdimc_laneq_x2", [IsOverloadNone, VerifyRuntimeMode]>;
+def SVAESE_X2   : SInst<"svaese_laneq[_{d}_x2]",   "22di", "Uc", MergeNone, "aarch64_sve_aese_laneq_x2", [IsOverloadNone, VerifyRuntimeMode]>;
+def SVAESeMC_X2  : SInst<"svaesemc_laneq[_{d}_x2]",  "22di",  "Uc", MergeNone, "aarch64_sve_aesemc_laneq_x2", [IsOverloadNone, VerifyRuntimeMode]>;
+
+def SVAESD_X4   : SInst<"svaesd_laneq[_{d}_x4]",   "44di", "Uc", MergeNone, "aarch64_sve_aesd_laneq_x4", [IsOverloadNone, VerifyRuntimeMode]>;
+def SVAESIMC_X4 : SInst<"svaesdimc_laneq[_{d}_x4]", "44di",  "Uc", MergeNone, "aarch64_sve_aesdimc_laneq_x4", [IsOverloadNone, VerifyRuntimeMode]>;
+def SVAESE_X4   : SInst<"svaese_laneq[_{d}_x4]",   "44di", "Uc", MergeNone, "aarch64_sve_aese_laneq_x4", [IsOverloadNone, VerifyRuntimeMode]>;
+def SVAESeMC_X4  : SInst<"svaesemc_laneq[_{d}_x4]",  "44di",  "Uc", MergeNone, "aarch64_sve_aesemc_laneq_x4", [IsOverloadNone, VerifyRuntimeMode]>;
+
+def SVPMULL_U64   : SInst<"svpmull[_u64]",   "2dd", "Ul", MergeNone, "aarch64_sve_pmull", [IsOverloadNone, VerifyRuntimeMode]>;
+def SVPMULL_N_U64 : SInst<"svpmull[_n_u64]", "2da", "Ul", MergeNone, "aarch64_sve_pmull", [IsOverloadNone, VerifyRuntimeMode]>;
+def SVPMLAL_U64   : SInst<"svpmlal[_u64]",   "22dd", "Ul", MergeNone, "aarch64_sve_pmlal", [IsOverloadNone, VerifyRuntimeMode]>;
+def SVPMLAL_N_U64 : SInst<"svpmlal[_n_u64]", "22da", "Ul", MergeNone, "aarch64_sve_pmlal", [IsOverloadNone, VerifyRuntimeMode]>;
+}
+
 let SVETargetGuard = "sve-sha3", SMETargetGuard = "sme2p1,sve-sha3" in {
 def SVRAX1 : SInst<"svrax1[_{d}]",   "ddd", "lUl", MergeNone, "aarch64_sve_rax1", [IsOverloadNone, VerifyRuntimeMode]>;
 }
diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_crypto.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_crypto.c
new file mode 100644
index 0000000000000..bbe71d9db92cf
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_crypto.c
@@ -0,0 +1,215 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve-aes2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve-aes2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve-aes2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve-aes2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sve-aes2 -target-feature +ssve-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sve-aes2 -target-feature +ssve-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sve-aes2 -target-feature +ssve-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sve-aes2 -target-feature +ssve-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve-aes2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// REQUIRES: aarch64-registered-target
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+#ifdef __ARM_FEATURE_SME
+#define STREAMING __arm_streaming
+#else
+#define STREAMING
+#endif
+
+// CHECK-LABEL: @test_svaesd_u8_x2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aesd.laneq.x2(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svaesd_u8_x211svuint8x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aesd.laneq.x2(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+svuint8x2_t test_svaesd_u8_x2(svuint8x2_t op1, svuint8_t op2) STREAMING
+{
+  return SVE_ACLE_FUNC(svaesd_laneq,_u8_x2,,)(op1, op2, 0);
+}
+
+// CHECK-LABEL: @test_svaesdimc_u8_x2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aesdimc.laneq.x2(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svaesdimc_u8_x211svuint8x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aesdimc.laneq.x2(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+svuint8x2_t test_svaesdimc_u8_x2(svuint8x2_t op1, svuint8_t op2) STREAMING
+{
+  return SVE_ACLE_FUNC(svaesdimc_laneq,_u8_x2,,)(op1, op2, 0);
+}
+
+// CHECK-LABEL: @test_svaese_u8_x2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aese.laneq.x2(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svaese_u8_x211svuint8x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aese.laneq.x2(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+svuint8x2_t test_svaese_u8_x2(svuint8x2_t op1, svuint8_t op2) STREAMING
+{
+  return SVE_ACLE_FUNC(svaese_laneq,_u8_x2,,)(op1, op2, 0);
+}
+
+// CHECK-LABEL: @test_svaesemc_u8_x2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aesemc.laneq.x2(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svaesemc_u8_x211svuint8x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aesemc.laneq.x2(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+svuint8x2_t test_svaesemc_u8_x2(svuint8x2_t op1, svuint8_t op2) STREAMING
+{
+  return SVE_ACLE_FUNC(svaesemc_laneq,_u8_x2,,)(op1, op2, 0);
+}
+
+// CHECK-LABEL: @test_svaesd_u8_x4(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aesd.laneq.x4(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP1_COERCE2:%.*]], <vscale x 16 x i8> [[OP1_COERCE3:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svaesd_u8_x411svuint8x4_tu11__SVUint8_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aesd.laneq.x4(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP1_COERCE2:%.*]], <vscale x 16 x i8> [[OP1_COERCE3:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+svuint8x4_t test_svaesd_u8_x4(svuint8x4_t op1, svuint8_t op2) STREAMING
+{
+  return SVE_ACLE_FUNC(svaesd_laneq,_u8_x4,,)(op1, op2, 0);
+}
+
+// CHECK-LABEL: @test_svaesdimc_u8_x4(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aesdimc.laneq.x4(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP1_COERCE2:%.*]], <vscale x 16 x i8> [[OP1_COERCE3:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svaesdimc_u8_x411svuint8x4_tu11__SVUint8_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aesdimc.laneq.x4(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP1_COERCE2:%.*]], <vscale x 16 x i8> [[OP1_COERCE3:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+svuint8x4_t test_svaesdimc_u8_x4(svuint8x4_t op1, svuint8_t op2) STREAMING
+{
+  return SVE_ACLE_FUNC(svaesdimc_laneq,_u8_x4,,)(op1, op2, 0);
+}
+
+// CHECK-LABEL: @test_svaese_u8_x4(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aese.laneq.x4(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP1_COERCE2:%.*]], <vscale x 16 x i8> [[OP1_COERCE3:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svaese_u8_x411svuint8x4_tu11__SVUint8_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aese.laneq.x4(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP1_COERCE2:%.*]], <vscale x 16 x i8> [[OP1_COERCE3:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+svuint8x4_t test_svaese_u8_x4(svuint8x4_t op1, svuint8_t op2) STREAMING
+{
+  return SVE_ACLE_FUNC(svaese_laneq,_u8_x4,,)(op1, op2, 0);
+}
+
+// CHECK-LABEL: @test_svaesemc_u8_x4(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aesemc.laneq.x4(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP1_COERCE2:%.*]], <vscale x 16 x i8> [[OP1_COERCE3:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svaesemc_u8_x411svuint8x4_tu11__SVUint8_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aesemc.laneq.x4(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP1_COERCE2:%.*]], <vscale x 16 x i8> [[OP1_COERCE3:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+svuint8x4_t test_svaesemc_u8_x4(svuint8x4_t op1, svuint8_t op2) STREAMING
+{
+  return SVE_ACLE_FUNC(svaesemc_laneq,_u8_x4,,)(op1, op2, 0);
+}
+
+// CHECK-LABEL: @test_svpmull_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.pmull(<vscale x 2 x i64> [[OP1:%.*]], <vscale x 2 x i64> [[OP2:%.*]])
+// CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z16test_svpmull_u64u12__SVUint64_tS_(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.pmull(<vscale x 2 x i64> [[OP1:%.*]], <vscale x 2 x i64> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
+//
+svuint64x2_t test_svpmull_u64(svuint64_t op1, svuint64_t op2) STREAMING
+{
+  return SVE_ACLE_FUNC(svpmull,_u64,,)(op1, op2);
+}
+
+// CHECK-LABEL: @test_svpmull_n_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[OP2:%.*]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.pmull(<vscale x 2 x i64> [[OP1:%.*]], <vscale x 2 x i64> [[DOTSPLAT]])
+// CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z18test_svpmull_n_u64u12__SVUint64_tm(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.pmull(<vscale x 2 x i64> [[OP1:%.*]], <vscale x 2 x i64> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
+//
+svuint64x2_t test_svpmull_n_u64(svuint64_t op1, uint64_t op2) STREAMING
+{
+  return SVE_ACLE_FUNC(svpmull,_n_u64,,)(op1, op2);
+}
+
+// CHECK-LABEL: @test_svpmlal_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.pmlal(<vscale x 2 x i64> [[OP1_COERCE0:%.*]], <vscale x 2 x i64> [[OP1_COERCE1:%.*]], <vscale x 2 x i64> [[OP2:%.*]], <vscale x 2 x i64> [[OP3:%.*]])
+// CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z16test_svpmlal_u6412svuint64x2_tu12__SVUint64_tS0_(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.pmlal(<vscale x 2 x i64> [[OP1_COERCE0:%.*]], <vscale x 2 x i64> [[OP1_COERCE1:%.*]], <vscale x 2 x i64> [[OP2:%.*]], <vscale x 2 x i64> [[OP3:%.*]])
+// CPP-CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
+//
+svuint64x2_t test_svpmlal_u64(svuint64x2_t op1, svuint64_t op2, svuint64_t op3) STREAMING
+{
+  return SVE_ACLE_FUNC(svpmlal,_u64,,)(op1, op2, op3);
+}
+
+// CHECK-LABEL: @test_svpmlal_n_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[OP3:%.*]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.pmlal(<vscale x 2 x i64> [[OP1_COERCE0:%.*]], <vscale x 2 x i64> [[OP1_COERCE1:%.*]], <vscale x 2 x i64> [[OP2:%.*]], <vscale x 2 x i64> [[DOTSPLAT]])
+// CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z18test_svpmlal_n_u6412svuint64x2_tu12__SVUint64_tm(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[OP3:%.*]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.pmlal(<vscale x 2 x i64> [[OP1_COERCE0:%.*]], <vscale x 2 x i64> [[OP1_COERCE1:%.*]], <vscale x 2 x i64> [[OP2:%.*]], <vscale x 2 x i64> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
+//
+svuint64x2_t test_svpmlal_n_u64(svuint64x2_t op1, svuint64_t op2, uint64_t op3) STREAMING
+{
+  return SVE_ACLE_FUNC(svpmlal,_n_u64,,)(op1, op2, op3);
+}
diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp
index b1e94e0ce0975..bbeed4402da1b 100644
--- a/clang/utils/TableGen/SveEmitter.cpp
+++ b/clang/utils/TableGen/SveEmitter.cpp
@@ -272,9 +272,16 @@ class Intrinsic {
           Proto[I] == 'R' || Proto[I] == '@' || Proto[I] == '!')
         break;
 
+      if (Proto[I] == '2')
+        Param += 1;
+      if (Proto[I] == '4')
+        Param += 3;
+
       // Multivector modifier can be skipped
-      if (Proto[I] == '.')
+      if (Proto[I] == '.') {
+        Param -= 1; // Adjust for the increment at the top of the loop
         I += 2;
+      }
     }
     assert(I != Proto.size() && "Prototype has no splat operand");
     return Param;
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index b0269eec3347a..3829dbdce4bc7 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -4194,4 +4194,31 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sme_fp8_fvdot_lane_za16_vg1x2  : SME_FP8_ZA_LANE_VGx2_Intrinsic;
   def int_aarch64_sme_fp8_fvdotb_lane_za32_vg1x4 : SME_FP8_ZA_LANE_VGx2_Intrinsic;
   def int_aarch64_sme_fp8_fvdott_lane_za32_vg1x4 : SME_FP8_ZA_LANE_VGx2_Intrinsic;
+  
+  // AES2
+  class SVE2_Crypto_LANE_X2_Intrinsic
+  : DefaultAttrsIntrinsic<[llvm_nxv16i8_ty, llvm_nxv16i8_ty],
+      [llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_i32_ty],
+      [ImmArg<ArgIndex<3>>, IntrNoMem]>;
+  class SVE2_Crypto_LANE_X4_Intrinsic
+  : DefaultAttrsIntrinsic<[llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty],
+      [llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, 
+       llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_i32_ty],
+      [ImmArg<ArgIndex<5>>, IntrNoMem]>;
+
+  def int_aarch64_sve_aesd_laneq_x2 : SVE2_Crypto_LANE_X2_Intrinsic;
+  def int_aarch64_sve_aesdimc_laneq_x2 : SVE2_Crypto_LANE_X2_Intrinsic;
+  def int_aarch64_s...
[truncated]

llvmbot · 2025-10-27T12:07:37Z

@llvm/pr-subscribers-llvm-ir

Author: None (Lukacma)

Changes

This patch add intrinsics for crpyto instructions defined in this ACLE proposal

Patch is 34.80 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/165241.diff

7 Files Affected:

(modified) clang/include/clang/Basic/arm_sve.td (+17)
(added) clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_crypto.c (+215)
(modified) clang/utils/TableGen/SveEmitter.cpp (+8-1)
(modified) llvm/include/llvm/IR/IntrinsicsAArch64.td (+27)
(modified) llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp (+53-11)
(modified) llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td (+2-2)
(added) llvm/test/CodeGen/AArch64/sve2p1-intrinsics-crypto.ll (+155)

diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index d2b7b78b9970f..605cf93d196ff 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1841,6 +1841,23 @@ def SVPMULLT_PAIR_U64   : SInst<"svpmullt_pair[_{d}]",   "ddd", "Ul", MergeNone,
 def SVPMULLT_PAIR_N_U64 : SInst<"svpmullt_pair[_n_{d}]", "dda", "Ul", MergeNone, "aarch64_sve_pmullt_pair", [VerifyRuntimeMode]>;
 }
 
+let SVETargetGuard = "sve-aes2", SMETargetGuard = "sve-aes2,ssve-aes" in {
+def SVAESD_X2   : SInst<"svaesd_laneq[_{d}_x2]",   "22di", "Uc", MergeNone, "aarch64_sve_aesd_laneq_x2", [IsOverloadNone, VerifyRuntimeMode]>;
+def SVAESIMC_X2 : SInst<"svaesdimc_laneq[_{d}_x2]", "22di",  "Uc", MergeNone, "aarch64_sve_aesdimc_laneq_x2", [IsOverloadNone, VerifyRuntimeMode]>;
+def SVAESE_X2   : SInst<"svaese_laneq[_{d}_x2]",   "22di", "Uc", MergeNone, "aarch64_sve_aese_laneq_x2", [IsOverloadNone, VerifyRuntimeMode]>;
+def SVAESeMC_X2  : SInst<"svaesemc_laneq[_{d}_x2]",  "22di",  "Uc", MergeNone, "aarch64_sve_aesemc_laneq_x2", [IsOverloadNone, VerifyRuntimeMode]>;
+
+def SVAESD_X4   : SInst<"svaesd_laneq[_{d}_x4]",   "44di", "Uc", MergeNone, "aarch64_sve_aesd_laneq_x4", [IsOverloadNone, VerifyRuntimeMode]>;
+def SVAESIMC_X4 : SInst<"svaesdimc_laneq[_{d}_x4]", "44di",  "Uc", MergeNone, "aarch64_sve_aesdimc_laneq_x4", [IsOverloadNone, VerifyRuntimeMode]>;
+def SVAESE_X4   : SInst<"svaese_laneq[_{d}_x4]",   "44di", "Uc", MergeNone, "aarch64_sve_aese_laneq_x4", [IsOverloadNone, VerifyRuntimeMode]>;
+def SVAESeMC_X4  : SInst<"svaesemc_laneq[_{d}_x4]",  "44di",  "Uc", MergeNone, "aarch64_sve_aesemc_laneq_x4", [IsOverloadNone, VerifyRuntimeMode]>;
+
+def SVPMULL_U64   : SInst<"svpmull[_u64]",   "2dd", "Ul", MergeNone, "aarch64_sve_pmull", [IsOverloadNone, VerifyRuntimeMode]>;
+def SVPMULL_N_U64 : SInst<"svpmull[_n_u64]", "2da", "Ul", MergeNone, "aarch64_sve_pmull", [IsOverloadNone, VerifyRuntimeMode]>;
+def SVPMLAL_U64   : SInst<"svpmlal[_u64]",   "22dd", "Ul", MergeNone, "aarch64_sve_pmlal", [IsOverloadNone, VerifyRuntimeMode]>;
+def SVPMLAL_N_U64 : SInst<"svpmlal[_n_u64]", "22da", "Ul", MergeNone, "aarch64_sve_pmlal", [IsOverloadNone, VerifyRuntimeMode]>;
+}
+
 let SVETargetGuard = "sve-sha3", SMETargetGuard = "sme2p1,sve-sha3" in {
 def SVRAX1 : SInst<"svrax1[_{d}]",   "ddd", "lUl", MergeNone, "aarch64_sve_rax1", [IsOverloadNone, VerifyRuntimeMode]>;
 }
diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_crypto.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_crypto.c
new file mode 100644
index 0000000000000..bbe71d9db92cf
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_crypto.c
@@ -0,0 +1,215 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve-aes2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve-aes2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve-aes2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve-aes2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sve-aes2 -target-feature +ssve-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sve-aes2 -target-feature +ssve-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sve-aes2 -target-feature +ssve-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sve-aes2 -target-feature +ssve-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve-aes2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// REQUIRES: aarch64-registered-target
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+#ifdef __ARM_FEATURE_SME
+#define STREAMING __arm_streaming
+#else
+#define STREAMING
+#endif
+
+// CHECK-LABEL: @test_svaesd_u8_x2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aesd.laneq.x2(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svaesd_u8_x211svuint8x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aesd.laneq.x2(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+svuint8x2_t test_svaesd_u8_x2(svuint8x2_t op1, svuint8_t op2) STREAMING
+{
+  return SVE_ACLE_FUNC(svaesd_laneq,_u8_x2,,)(op1, op2, 0);
+}
+
+// CHECK-LABEL: @test_svaesdimc_u8_x2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aesdimc.laneq.x2(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svaesdimc_u8_x211svuint8x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aesdimc.laneq.x2(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+svuint8x2_t test_svaesdimc_u8_x2(svuint8x2_t op1, svuint8_t op2) STREAMING
+{
+  return SVE_ACLE_FUNC(svaesdimc_laneq,_u8_x2,,)(op1, op2, 0);
+}
+
+// CHECK-LABEL: @test_svaese_u8_x2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aese.laneq.x2(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svaese_u8_x211svuint8x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aese.laneq.x2(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+svuint8x2_t test_svaese_u8_x2(svuint8x2_t op1, svuint8_t op2) STREAMING
+{
+  return SVE_ACLE_FUNC(svaese_laneq,_u8_x2,,)(op1, op2, 0);
+}
+
+// CHECK-LABEL: @test_svaesemc_u8_x2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aesemc.laneq.x2(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svaesemc_u8_x211svuint8x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aesemc.laneq.x2(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+svuint8x2_t test_svaesemc_u8_x2(svuint8x2_t op1, svuint8_t op2) STREAMING
+{
+  return SVE_ACLE_FUNC(svaesemc_laneq,_u8_x2,,)(op1, op2, 0);
+}
+
+// CHECK-LABEL: @test_svaesd_u8_x4(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aesd.laneq.x4(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP1_COERCE2:%.*]], <vscale x 16 x i8> [[OP1_COERCE3:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svaesd_u8_x411svuint8x4_tu11__SVUint8_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aesd.laneq.x4(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP1_COERCE2:%.*]], <vscale x 16 x i8> [[OP1_COERCE3:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+svuint8x4_t test_svaesd_u8_x4(svuint8x4_t op1, svuint8_t op2) STREAMING
+{
+  return SVE_ACLE_FUNC(svaesd_laneq,_u8_x4,,)(op1, op2, 0);
+}
+
+// CHECK-LABEL: @test_svaesdimc_u8_x4(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aesdimc.laneq.x4(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP1_COERCE2:%.*]], <vscale x 16 x i8> [[OP1_COERCE3:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svaesdimc_u8_x411svuint8x4_tu11__SVUint8_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aesdimc.laneq.x4(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP1_COERCE2:%.*]], <vscale x 16 x i8> [[OP1_COERCE3:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+svuint8x4_t test_svaesdimc_u8_x4(svuint8x4_t op1, svuint8_t op2) STREAMING
+{
+  return SVE_ACLE_FUNC(svaesdimc_laneq,_u8_x4,,)(op1, op2, 0);
+}
+
+// CHECK-LABEL: @test_svaese_u8_x4(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aese.laneq.x4(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP1_COERCE2:%.*]], <vscale x 16 x i8> [[OP1_COERCE3:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svaese_u8_x411svuint8x4_tu11__SVUint8_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aese.laneq.x4(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP1_COERCE2:%.*]], <vscale x 16 x i8> [[OP1_COERCE3:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+svuint8x4_t test_svaese_u8_x4(svuint8x4_t op1, svuint8_t op2) STREAMING
+{
+  return SVE_ACLE_FUNC(svaese_laneq,_u8_x4,,)(op1, op2, 0);
+}
+
+// CHECK-LABEL: @test_svaesemc_u8_x4(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aesemc.laneq.x4(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP1_COERCE2:%.*]], <vscale x 16 x i8> [[OP1_COERCE3:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svaesemc_u8_x411svuint8x4_tu11__SVUint8_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aesemc.laneq.x4(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP1_COERCE2:%.*]], <vscale x 16 x i8> [[OP1_COERCE3:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+svuint8x4_t test_svaesemc_u8_x4(svuint8x4_t op1, svuint8_t op2) STREAMING
+{
+  return SVE_ACLE_FUNC(svaesemc_laneq,_u8_x4,,)(op1, op2, 0);
+}
+
+// CHECK-LABEL: @test_svpmull_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.pmull(<vscale x 2 x i64> [[OP1:%.*]], <vscale x 2 x i64> [[OP2:%.*]])
+// CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z16test_svpmull_u64u12__SVUint64_tS_(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.pmull(<vscale x 2 x i64> [[OP1:%.*]], <vscale x 2 x i64> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
+//
+svuint64x2_t test_svpmull_u64(svuint64_t op1, svuint64_t op2) STREAMING
+{
+  return SVE_ACLE_FUNC(svpmull,_u64,,)(op1, op2);
+}
+
+// CHECK-LABEL: @test_svpmull_n_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[OP2:%.*]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.pmull(<vscale x 2 x i64> [[OP1:%.*]], <vscale x 2 x i64> [[DOTSPLAT]])
+// CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z18test_svpmull_n_u64u12__SVUint64_tm(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.pmull(<vscale x 2 x i64> [[OP1:%.*]], <vscale x 2 x i64> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
+//
+svuint64x2_t test_svpmull_n_u64(svuint64_t op1, uint64_t op2) STREAMING
+{
+  return SVE_ACLE_FUNC(svpmull,_n_u64,,)(op1, op2);
+}
+
+// CHECK-LABEL: @test_svpmlal_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.pmlal(<vscale x 2 x i64> [[OP1_COERCE0:%.*]], <vscale x 2 x i64> [[OP1_COERCE1:%.*]], <vscale x 2 x i64> [[OP2:%.*]], <vscale x 2 x i64> [[OP3:%.*]])
+// CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z16test_svpmlal_u6412svuint64x2_tu12__SVUint64_tS0_(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.pmlal(<vscale x 2 x i64> [[OP1_COERCE0:%.*]], <vscale x 2 x i64> [[OP1_COERCE1:%.*]], <vscale x 2 x i64> [[OP2:%.*]], <vscale x 2 x i64> [[OP3:%.*]])
+// CPP-CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
+//
+svuint64x2_t test_svpmlal_u64(svuint64x2_t op1, svuint64_t op2, svuint64_t op3) STREAMING
+{
+  return SVE_ACLE_FUNC(svpmlal,_u64,,)(op1, op2, op3);
+}
+
+// CHECK-LABEL: @test_svpmlal_n_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[OP3:%.*]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.pmlal(<vscale x 2 x i64> [[OP1_COERCE0:%.*]], <vscale x 2 x i64> [[OP1_COERCE1:%.*]], <vscale x 2 x i64> [[OP2:%.*]], <vscale x 2 x i64> [[DOTSPLAT]])
+// CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z18test_svpmlal_n_u6412svuint64x2_tu12__SVUint64_tm(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[OP3:%.*]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.pmlal(<vscale x 2 x i64> [[OP1_COERCE0:%.*]], <vscale x 2 x i64> [[OP1_COERCE1:%.*]], <vscale x 2 x i64> [[OP2:%.*]], <vscale x 2 x i64> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
+//
+svuint64x2_t test_svpmlal_n_u64(svuint64x2_t op1, svuint64_t op2, uint64_t op3) STREAMING
+{
+  return SVE_ACLE_FUNC(svpmlal,_n_u64,,)(op1, op2, op3);
+}
diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp
index b1e94e0ce0975..bbeed4402da1b 100644
--- a/clang/utils/TableGen/SveEmitter.cpp
+++ b/clang/utils/TableGen/SveEmitter.cpp
@@ -272,9 +272,16 @@ class Intrinsic {
           Proto[I] == 'R' || Proto[I] == '@' || Proto[I] == '!')
         break;
 
+      if (Proto[I] == '2')
+        Param += 1;
+      if (Proto[I] == '4')
+        Param += 3;
+
       // Multivector modifier can be skipped
-      if (Proto[I] == '.')
+      if (Proto[I] == '.') {
+        Param -= 1; // Adjust for the increment at the top of the loop
         I += 2;
+      }
     }
     assert(I != Proto.size() && "Prototype has no splat operand");
     return Param;
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index b0269eec3347a..3829dbdce4bc7 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -4194,4 +4194,31 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sme_fp8_fvdot_lane_za16_vg1x2  : SME_FP8_ZA_LANE_VGx2_Intrinsic;
   def int_aarch64_sme_fp8_fvdotb_lane_za32_vg1x4 : SME_FP8_ZA_LANE_VGx2_Intrinsic;
   def int_aarch64_sme_fp8_fvdott_lane_za32_vg1x4 : SME_FP8_ZA_LANE_VGx2_Intrinsic;
+  
+  // AES2
+  class SVE2_Crypto_LANE_X2_Intrinsic
+  : DefaultAttrsIntrinsic<[llvm_nxv16i8_ty, llvm_nxv16i8_ty],
+      [llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_i32_ty],
+      [ImmArg<ArgIndex<3>>, IntrNoMem]>;
+  class SVE2_Crypto_LANE_X4_Intrinsic
+  : DefaultAttrsIntrinsic<[llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty],
+      [llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, 
+       llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_i32_ty],
+      [ImmArg<ArgIndex<5>>, IntrNoMem]>;
+
+  def int_aarch64_sve_aesd_laneq_x2 : SVE2_Crypto_LANE_X2_Intrinsic;
+  def int_aarch64_sve_aesdimc_laneq_x2 : SVE2_Crypto_LANE_X2_Intrinsic;
+  def int_aarch64_s...
[truncated]

llvmbot · 2025-10-27T12:07:37Z

@llvm/pr-subscribers-clang

Author: None (Lukacma)

Changes

This patch add intrinsics for crpyto instructions defined in this ACLE proposal

Patch is 34.80 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/165241.diff

7 Files Affected:

(modified) clang/include/clang/Basic/arm_sve.td (+17)
(added) clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_crypto.c (+215)
(modified) clang/utils/TableGen/SveEmitter.cpp (+8-1)
(modified) llvm/include/llvm/IR/IntrinsicsAArch64.td (+27)
(modified) llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp (+53-11)
(modified) llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td (+2-2)
(added) llvm/test/CodeGen/AArch64/sve2p1-intrinsics-crypto.ll (+155)

diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index d2b7b78b9970f..605cf93d196ff 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1841,6 +1841,23 @@ def SVPMULLT_PAIR_U64   : SInst<"svpmullt_pair[_{d}]",   "ddd", "Ul", MergeNone,
 def SVPMULLT_PAIR_N_U64 : SInst<"svpmullt_pair[_n_{d}]", "dda", "Ul", MergeNone, "aarch64_sve_pmullt_pair", [VerifyRuntimeMode]>;
 }
 
+let SVETargetGuard = "sve-aes2", SMETargetGuard = "sve-aes2,ssve-aes" in {
+def SVAESD_X2   : SInst<"svaesd_laneq[_{d}_x2]",   "22di", "Uc", MergeNone, "aarch64_sve_aesd_laneq_x2", [IsOverloadNone, VerifyRuntimeMode]>;
+def SVAESIMC_X2 : SInst<"svaesdimc_laneq[_{d}_x2]", "22di",  "Uc", MergeNone, "aarch64_sve_aesdimc_laneq_x2", [IsOverloadNone, VerifyRuntimeMode]>;
+def SVAESE_X2   : SInst<"svaese_laneq[_{d}_x2]",   "22di", "Uc", MergeNone, "aarch64_sve_aese_laneq_x2", [IsOverloadNone, VerifyRuntimeMode]>;
+def SVAESeMC_X2  : SInst<"svaesemc_laneq[_{d}_x2]",  "22di",  "Uc", MergeNone, "aarch64_sve_aesemc_laneq_x2", [IsOverloadNone, VerifyRuntimeMode]>;
+
+def SVAESD_X4   : SInst<"svaesd_laneq[_{d}_x4]",   "44di", "Uc", MergeNone, "aarch64_sve_aesd_laneq_x4", [IsOverloadNone, VerifyRuntimeMode]>;
+def SVAESIMC_X4 : SInst<"svaesdimc_laneq[_{d}_x4]", "44di",  "Uc", MergeNone, "aarch64_sve_aesdimc_laneq_x4", [IsOverloadNone, VerifyRuntimeMode]>;
+def SVAESE_X4   : SInst<"svaese_laneq[_{d}_x4]",   "44di", "Uc", MergeNone, "aarch64_sve_aese_laneq_x4", [IsOverloadNone, VerifyRuntimeMode]>;
+def SVAESeMC_X4  : SInst<"svaesemc_laneq[_{d}_x4]",  "44di",  "Uc", MergeNone, "aarch64_sve_aesemc_laneq_x4", [IsOverloadNone, VerifyRuntimeMode]>;
+
+def SVPMULL_U64   : SInst<"svpmull[_u64]",   "2dd", "Ul", MergeNone, "aarch64_sve_pmull", [IsOverloadNone, VerifyRuntimeMode]>;
+def SVPMULL_N_U64 : SInst<"svpmull[_n_u64]", "2da", "Ul", MergeNone, "aarch64_sve_pmull", [IsOverloadNone, VerifyRuntimeMode]>;
+def SVPMLAL_U64   : SInst<"svpmlal[_u64]",   "22dd", "Ul", MergeNone, "aarch64_sve_pmlal", [IsOverloadNone, VerifyRuntimeMode]>;
+def SVPMLAL_N_U64 : SInst<"svpmlal[_n_u64]", "22da", "Ul", MergeNone, "aarch64_sve_pmlal", [IsOverloadNone, VerifyRuntimeMode]>;
+}
+
 let SVETargetGuard = "sve-sha3", SMETargetGuard = "sme2p1,sve-sha3" in {
 def SVRAX1 : SInst<"svrax1[_{d}]",   "ddd", "lUl", MergeNone, "aarch64_sve_rax1", [IsOverloadNone, VerifyRuntimeMode]>;
 }
diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_crypto.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_crypto.c
new file mode 100644
index 0000000000000..bbe71d9db92cf
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_crypto.c
@@ -0,0 +1,215 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve-aes2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve-aes2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve-aes2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve-aes2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sve-aes2 -target-feature +ssve-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sve-aes2 -target-feature +ssve-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sve-aes2 -target-feature +ssve-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sve-aes2 -target-feature +ssve-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve-aes2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// REQUIRES: aarch64-registered-target
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+#ifdef __ARM_FEATURE_SME
+#define STREAMING __arm_streaming
+#else
+#define STREAMING
+#endif
+
+// CHECK-LABEL: @test_svaesd_u8_x2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aesd.laneq.x2(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svaesd_u8_x211svuint8x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aesd.laneq.x2(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+svuint8x2_t test_svaesd_u8_x2(svuint8x2_t op1, svuint8_t op2) STREAMING
+{
+  return SVE_ACLE_FUNC(svaesd_laneq,_u8_x2,,)(op1, op2, 0);
+}
+
+// CHECK-LABEL: @test_svaesdimc_u8_x2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aesdimc.laneq.x2(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svaesdimc_u8_x211svuint8x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aesdimc.laneq.x2(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+svuint8x2_t test_svaesdimc_u8_x2(svuint8x2_t op1, svuint8_t op2) STREAMING
+{
+  return SVE_ACLE_FUNC(svaesdimc_laneq,_u8_x2,,)(op1, op2, 0);
+}
+
+// CHECK-LABEL: @test_svaese_u8_x2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aese.laneq.x2(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svaese_u8_x211svuint8x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aese.laneq.x2(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+svuint8x2_t test_svaese_u8_x2(svuint8x2_t op1, svuint8_t op2) STREAMING
+{
+  return SVE_ACLE_FUNC(svaese_laneq,_u8_x2,,)(op1, op2, 0);
+}
+
+// CHECK-LABEL: @test_svaesemc_u8_x2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aesemc.laneq.x2(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svaesemc_u8_x211svuint8x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aesemc.laneq.x2(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+svuint8x2_t test_svaesemc_u8_x2(svuint8x2_t op1, svuint8_t op2) STREAMING
+{
+  return SVE_ACLE_FUNC(svaesemc_laneq,_u8_x2,,)(op1, op2, 0);
+}
+
+// CHECK-LABEL: @test_svaesd_u8_x4(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aesd.laneq.x4(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP1_COERCE2:%.*]], <vscale x 16 x i8> [[OP1_COERCE3:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svaesd_u8_x411svuint8x4_tu11__SVUint8_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aesd.laneq.x4(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP1_COERCE2:%.*]], <vscale x 16 x i8> [[OP1_COERCE3:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+svuint8x4_t test_svaesd_u8_x4(svuint8x4_t op1, svuint8_t op2) STREAMING
+{
+  return SVE_ACLE_FUNC(svaesd_laneq,_u8_x4,,)(op1, op2, 0);
+}
+
+// CHECK-LABEL: @test_svaesdimc_u8_x4(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aesdimc.laneq.x4(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP1_COERCE2:%.*]], <vscale x 16 x i8> [[OP1_COERCE3:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svaesdimc_u8_x411svuint8x4_tu11__SVUint8_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aesdimc.laneq.x4(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP1_COERCE2:%.*]], <vscale x 16 x i8> [[OP1_COERCE3:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+svuint8x4_t test_svaesdimc_u8_x4(svuint8x4_t op1, svuint8_t op2) STREAMING
+{
+  return SVE_ACLE_FUNC(svaesdimc_laneq,_u8_x4,,)(op1, op2, 0);
+}
+
+// CHECK-LABEL: @test_svaese_u8_x4(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aese.laneq.x4(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP1_COERCE2:%.*]], <vscale x 16 x i8> [[OP1_COERCE3:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svaese_u8_x411svuint8x4_tu11__SVUint8_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aese.laneq.x4(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP1_COERCE2:%.*]], <vscale x 16 x i8> [[OP1_COERCE3:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+svuint8x4_t test_svaese_u8_x4(svuint8x4_t op1, svuint8_t op2) STREAMING
+{
+  return SVE_ACLE_FUNC(svaese_laneq,_u8_x4,,)(op1, op2, 0);
+}
+
+// CHECK-LABEL: @test_svaesemc_u8_x4(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aesemc.laneq.x4(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP1_COERCE2:%.*]], <vscale x 16 x i8> [[OP1_COERCE3:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svaesemc_u8_x411svuint8x4_tu11__SVUint8_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.aesemc.laneq.x4(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP1_COERCE2:%.*]], <vscale x 16 x i8> [[OP1_COERCE3:%.*]], <vscale x 16 x i8> [[OP2:%.*]], i32 0)
+// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+svuint8x4_t test_svaesemc_u8_x4(svuint8x4_t op1, svuint8_t op2) STREAMING
+{
+  return SVE_ACLE_FUNC(svaesemc_laneq,_u8_x4,,)(op1, op2, 0);
+}
+
+// CHECK-LABEL: @test_svpmull_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.pmull(<vscale x 2 x i64> [[OP1:%.*]], <vscale x 2 x i64> [[OP2:%.*]])
+// CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z16test_svpmull_u64u12__SVUint64_tS_(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.pmull(<vscale x 2 x i64> [[OP1:%.*]], <vscale x 2 x i64> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
+//
+svuint64x2_t test_svpmull_u64(svuint64_t op1, svuint64_t op2) STREAMING
+{
+  return SVE_ACLE_FUNC(svpmull,_u64,,)(op1, op2);
+}
+
+// CHECK-LABEL: @test_svpmull_n_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[OP2:%.*]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.pmull(<vscale x 2 x i64> [[OP1:%.*]], <vscale x 2 x i64> [[DOTSPLAT]])
+// CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z18test_svpmull_n_u64u12__SVUint64_tm(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.pmull(<vscale x 2 x i64> [[OP1:%.*]], <vscale x 2 x i64> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
+//
+svuint64x2_t test_svpmull_n_u64(svuint64_t op1, uint64_t op2) STREAMING
+{
+  return SVE_ACLE_FUNC(svpmull,_n_u64,,)(op1, op2);
+}
+
+// CHECK-LABEL: @test_svpmlal_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.pmlal(<vscale x 2 x i64> [[OP1_COERCE0:%.*]], <vscale x 2 x i64> [[OP1_COERCE1:%.*]], <vscale x 2 x i64> [[OP2:%.*]], <vscale x 2 x i64> [[OP3:%.*]])
+// CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z16test_svpmlal_u6412svuint64x2_tu12__SVUint64_tS0_(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.pmlal(<vscale x 2 x i64> [[OP1_COERCE0:%.*]], <vscale x 2 x i64> [[OP1_COERCE1:%.*]], <vscale x 2 x i64> [[OP2:%.*]], <vscale x 2 x i64> [[OP3:%.*]])
+// CPP-CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
+//
+svuint64x2_t test_svpmlal_u64(svuint64x2_t op1, svuint64_t op2, svuint64_t op3) STREAMING
+{
+  return SVE_ACLE_FUNC(svpmlal,_u64,,)(op1, op2, op3);
+}
+
+// CHECK-LABEL: @test_svpmlal_n_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[OP3:%.*]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.pmlal(<vscale x 2 x i64> [[OP1_COERCE0:%.*]], <vscale x 2 x i64> [[OP1_COERCE1:%.*]], <vscale x 2 x i64> [[OP2:%.*]], <vscale x 2 x i64> [[DOTSPLAT]])
+// CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z18test_svpmlal_n_u6412svuint64x2_tu12__SVUint64_tm(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[OP3:%.*]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.pmlal(<vscale x 2 x i64> [[OP1_COERCE0:%.*]], <vscale x 2 x i64> [[OP1_COERCE1:%.*]], <vscale x 2 x i64> [[OP2:%.*]], <vscale x 2 x i64> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
+//
+svuint64x2_t test_svpmlal_n_u64(svuint64x2_t op1, svuint64_t op2, uint64_t op3) STREAMING
+{
+  return SVE_ACLE_FUNC(svpmlal,_n_u64,,)(op1, op2, op3);
+}
diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp
index b1e94e0ce0975..bbeed4402da1b 100644
--- a/clang/utils/TableGen/SveEmitter.cpp
+++ b/clang/utils/TableGen/SveEmitter.cpp
@@ -272,9 +272,16 @@ class Intrinsic {
           Proto[I] == 'R' || Proto[I] == '@' || Proto[I] == '!')
         break;
 
+      if (Proto[I] == '2')
+        Param += 1;
+      if (Proto[I] == '4')
+        Param += 3;
+
       // Multivector modifier can be skipped
-      if (Proto[I] == '.')
+      if (Proto[I] == '.') {
+        Param -= 1; // Adjust for the increment at the top of the loop
         I += 2;
+      }
     }
     assert(I != Proto.size() && "Prototype has no splat operand");
     return Param;
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index b0269eec3347a..3829dbdce4bc7 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -4194,4 +4194,31 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sme_fp8_fvdot_lane_za16_vg1x2  : SME_FP8_ZA_LANE_VGx2_Intrinsic;
   def int_aarch64_sme_fp8_fvdotb_lane_za32_vg1x4 : SME_FP8_ZA_LANE_VGx2_Intrinsic;
   def int_aarch64_sme_fp8_fvdott_lane_za32_vg1x4 : SME_FP8_ZA_LANE_VGx2_Intrinsic;
+  
+  // AES2
+  class SVE2_Crypto_LANE_X2_Intrinsic
+  : DefaultAttrsIntrinsic<[llvm_nxv16i8_ty, llvm_nxv16i8_ty],
+      [llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_i32_ty],
+      [ImmArg<ArgIndex<3>>, IntrNoMem]>;
+  class SVE2_Crypto_LANE_X4_Intrinsic
+  : DefaultAttrsIntrinsic<[llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty],
+      [llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, 
+       llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_i32_ty],
+      [ImmArg<ArgIndex<5>>, IntrNoMem]>;
+
+  def int_aarch64_sve_aesd_laneq_x2 : SVE2_Crypto_LANE_X2_Intrinsic;
+  def int_aarch64_sve_aesdimc_laneq_x2 : SVE2_Crypto_LANE_X2_Intrinsic;
+  def int_aarch64_s...
[truncated]

This commit optimizes `SpecialCaseList` by using a `RadixTree` to filter glob patterns based on their prefixes. When matching a query, the `RadixTree` quickly identifies all glob patterns whose prefixes match the query's prefix. This significantly reduces the number of glob patterns that need to be fully evaluated, leading to performance improvements, especially when dealing with a large number of patterns. According to SpecialCaseListBM: Lookup benchmarks (significant improvements): ``` OVERALL_GEOMEAN -0.8177 ``` Lookup like `prefix*` benchmarks (huge improvements): ``` OVERALL_GEOMEAN -0.9819 ``` https://gist.github.com/vitalybuka/824884bcbc1713e815068c279159dafe --------- Co-authored-by: Copilot <[email protected]>

…r… (llvm#165038) This reverts commit bd27abc. See llvm#164670 (comment)

Given a GPR32 zeroing instruction, if the target supports zero cycle zeroing for GPR64 but not for GPR32, widen the instruction to 64 bit `$xn = MOVZXi 0, 0` instead of writing to `$wn` to exploit zero cycle zeroing. It also aligns naming in the generic zeroing test.

There are no other target specific passes in Passes.h and these really belong inside x86.h to be consistent with other targets. Reviewers: arsenm, phoebewang, RKSimon, topperc Reviewed By: arsenm Pull Request: llvm#165075

…eCombine (llvm#164628) Since new select/phi instructions may construct loops, the expression tree to be simplified may still be incomplete (i.e., it may contain select with dummy values or phi without incoming values). This patch removes the call to simplifyInstruction for now, as it doesn't break existing tests. Original PR: https://reviews.llvm.org/D36073 Fix the crash reported in llvm#163453 (comment).

…#165037) Use the same format as introduced for wmma by llvm#164920. Also make `blocks` default to 1.

Upstream support calling function that returns ComplexType Issue llvm#141365

Upstream the EHPersonality class for a function as a prerequisite for working with the handlers Issue llvm#154992

After llvm#164340 there is a tsan race on `OutstandingSymbolsCount` when decrementing it in `notifySymbolMetRequiredState` vs reading it in `isComplete()`. Fix this by having `IL_emit` filter out non-completed queries when it has the lock to do so, and that way we avoid needing to call `isComplete()` later.

llvm#165044) Use the same format as introduced for wmma by llvm#164920 and for mfma by llvm#165037.

This is to follow the discussion in llvm#164565 CallBase can cover more call-like instructions which carry caling convention flag. Co-authored-by: Yuanke Luo <[email protected]>

…lvm#165079) This patch moves the init, copyFrom, and grow methods in DenseMap and SmallDenseMap from public to private to hide implementation details. The only problem is that PhysicalRegisterUsageInfo calls DenseMap::grow instead of DenseMap::reserve, which I don't think is intended. This patch updates the call to reserve.

…lvm#165080) DenseMap::destroyAll currently iterates through the entire bucket array to call destructors keys and values. We don't need to do that if we know that both key and value types are trivially destructible, meaning that the destructors are no-ops. This patch introduces "constexpr if" at the beginning of destroyAll to skip the rest of the function if both key and value types are trivially destructible.

llvm#165081) This patch consolidates the two implementations of Recycler::clear with "if constexpr" for simplicity.

Note that "override" makes "virtual" redundant. Identified with modernize-use-override.

If the range size doesn't match the type size, it might read wrong data.

…lvm#163893) Cache extracted elements in lowerShuffleVector(). For example, when lowering ``` %0:_(<2 x s32>) = G_BUILD_VECTOR %0, %1 %2:_(<N x s32>) = G_SHUFFLE_VECTOR %1, shufflemask(0, 0, 0, 0 ... x N ) ``` Currently, we generate `N` `G_EXTRACT_VECTOR_ELT` for each element in shufflemask. This is undesirable and bloats the code, especially for larger vectors. With this change, we only generate one `G_EXTRACT_VECTOR_ELT` from `%0` and reuse it for all four result elements.

…vm#164543) This commit enhances the `SpecialCaseList::GlobMatcher` to filter globs more efficiently by considering both prefixes and suffixes. Previously, the `GlobMatcher` used a `RadixTree` to store globs based on their prefixes. This allowed for quick lookup of potential matches by matching the query string's prefix against the stored prefixes. However, for globs with common prefixes but different suffixes, unnecessary glob matching attempts could still occur. This change introduces a nested `RadixTree` structure: `PrefixSuffixToGlob: RadixTree<Prefix, RadixTree<Suffix, Globs>>`. Now, when a query string is matched, it first finds matching prefixes, and then within those prefix matches, it further filters by matching the reversed suffix of the query string against the reversed suffixes of the globs. This significantly reduces the number of `Glob::match` calls, especially for large special case lists with many globs sharing common prefixes but differing in their suffixes. According to SpecialCaseListBM: Lookup benchmarks (significant improvements): ``` OVERALL_GEOMEAN -0.5815 ``` Lookup `*suffix` and `prefix*suffix` like benchmarks (huge improvements): ``` OVERALL_GEOMEAN -0.9316 ``` https://gist.github.com/vitalybuka/e586751902760ced6beefcdf0d7b26fd

The test added in llvm#161067 writes artifacts to the current dir, i.e. `test.o` / `test.dwo` / `test.dwp`, which might not be writeable. Tests should use `%t` for test artifact location, i.e. `%t.o` / `%t.dwo` / `%t.dwp` However, since `"test.dwo"` is part of the assembly source file used as a test input, and that's not something lit will substitute, that typical approach doesn't work. We can instead ensure the output is in a good location by running `cd %t` (after setting it up).

LLVM prints switch cases indented by 2 additional spaces, as follows: ```LLVM switch i32 %x, label %default [ i32 0, label %phi i32 1, label %phi ] ``` Since this only changes the output IR of update_test_checks.py and does not change the logic of the File Check Pattern, there seems to be no need to update the existing test cases.

@pearu

This ports openxla/stablehlo#2682 implementation by @pearu. Three tests were added to `Integration/Dialect/Complex/CPU/correctness.mlir`. I also verified accuracy using XLA's complex_unary_op_test and its MLIR emitters.

1. createHvxPrefixPred was computing an invalid byte count for small predicate types, leading to a crash during instruction selection. 2. HexagonTargetLowering::SplitHvxMemOp assumed the memory vector type is always simple. This patch adds a guard to avoid processing non-simple vector types, which can lead to failure. Patch By: Fateme Hosseini Co-authored-by: pavani karveti <[email protected]> Co-authored-by: Sergei Larin <[email protected]> Co-authored-by: Pavani Karveti <[email protected]>

) Part of llvm#102817. This patch optimizes `rng::generate_n` for segmented iterators by forwarding the implementation directly to `std::generate_n`. - before ``` rng::generate_n(deque<int>)/32 21.7 ns 22.0 ns 32000000 rng::generate_n(deque<int>)/50 30.8 ns 30.7 ns 22400000 rng::generate_n(deque<int>)/1024 492 ns 488 ns 1120000 rng::generate_n(deque<int>)/8192 3938 ns 3924 ns 179200 ``` - after ``` rng::generate_n(deque<int>)/32 11.0 ns 11.0 ns 64000000 rng::generate_n(deque<int>)/50 16.2 ns 16.1 ns 40727273 rng::generate_n(deque<int>)/1024 292 ns 286 ns 2240000 rng::generate_n(deque<int>)/8192 2291 ns 2302 ns 298667 ```

…r switch lowering (llvm#155910) Currently it is considered suitable to lower to a bit test for a set of switch case clusters when the the number of unique destinations (`NumDests`) and the number of total comparisons (`NumCmps`) satisfy: `(NumDests == 1 && NumCmps >= 3) || (NumDests == 2 && NumCmps >= 5) || (NumDests == 3 && NumCmps >= 6)` However it is found for some cases on powerpc, for example, when NumDests is 3, and the number of comparisons for each destination is all 2, it's not profitable to lower the switch to bit test. This is to add an option to set the minimum of largest number of comparisons to use bit test for switch lowering. --------- Co-authored-by: Shimin Cui <[email protected]>

Lukacma · 2025-10-29T11:19:11Z

Sorry for spam. Messed up my rebase !

Lukacma requested review from CarolineConcatto and kmclaughlin-arm October 27, 2025 12:06

llvmbot added clang Clang issues not falling into any other category backend:AArch64 clang:frontend Language frontend issues, e.g. anything involving "Sema" llvm:ir labels Oct 27, 2025

vitalybuka and others added 21 commits October 29, 2025 11:16

Reapply "Reapply "[clang-format] Annotate ::operator and Foo::operato…

fb0f0d1

…r… (llvm#165038) This reverts commit bd27abc. See llvm#164670 (comment)

[X86] Move x86 specific create*Pass Functions to X86.h

c6b27ad

There are no other target specific passes in Passes.h and these really belong inside x86.h to be consistent with other targets. Reviewers: arsenm, phoebewang, RKSimon, topperc Reviewed By: arsenm Pull Request: llvm#165075

[mlir][amdgpu] Update mfma assembly format with intrinsic shape (llvm…

e02ca45

…#165037) Use the same format as introduced for wmma by llvm#164920. Also make `blocks` default to 1.

[CIR] Upstream CallOp with ComplexType as return type (llvm#164980)

be03e79

Upstream support calling function that returns ComplexType Issue llvm#141365

[CIR][NFC] Upstream EHPersonality for function (llvm#164883)

9ea21f6

Upstream the EHPersonality class for a function as a prerequisite for working with the handlers Issue llvm#154992

[mlir][amdgpu] Update scaled_mfma assembly format with intrinsic shape (

e38de5e

llvm#165044) Use the same format as introduced for wmma by llvm#164920 and for mfma by llvm#165037.

[ISel] Use CallBase instead of CallInst (llvm#164769)

ecab9f4

This is to follow the discussion in llvm#164565 CallBase can cover more call-like instructions which carry caling convention flag. Co-authored-by: Yuanke Luo <[email protected]>

Add Codegen/Hexagon/masked_gather.ll to profcheck-xfail (llvm#165093)

1a2aedf

Add new MemorySanitizer test cases for AArch64 (llvm#165094)

520691e

[Support] Consolidate the two implementations of Recycler::clear (NFC) (

0ead2f3

llvm#165081) This patch consolidates the two implementations of Recycler::clear with "if constexpr" for simplicity.

[Target] Add "override" where appropriate (NFC) (llvm#165083)

b29d4bf

Note that "override" makes "virtual" redundant. Identified with modernize-use-override.

[Attributor] Check range size before constant fold load (llvm#151359)

1fdbeaf

If the range size doesn't match the type size, it might read wrong data.

[clang] Proofread LanguageExtensions.rst (llvm#165082)

66513ef

rupprecht and others added 7 commits October 29, 2025 11:17

[mlir][complex] Fix exp accuracy (llvm#164952)

ef09fcd

This ports openxla/stablehlo#2682 implementation by @pearu. Three tests were added to `Integration/Dialect/Complex/CPU/correctness.mlir`. I also verified accuracy using XLA's complex_unary_op_test and its MLIR emitters.

Remove unnecessary code

c32e80c

Lukacma requested review from a team, DeinAlptraum, Endilll, JDevlieghere, QuietMisdreavus, aaupov, andykaylor, ayermolo, bcardosolopes, cyndyishida, daniel-grumberg, lanza, maksfb, nikic, paschalis-mpeis, rafaelauler, xlauko, yota9 and yozhu as code owners October 29, 2025 11:18

Lukacma closed this Oct 29, 2025

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[AArch64] Add intrinsics for 9.6 crypto instructions #165241

[AArch64] Add intrinsics for 9.6 crypto instructions #165241

Uh oh!

Lukacma commented Oct 27, 2025

Uh oh!

llvmbot commented Oct 27, 2025

Uh oh!

llvmbot commented Oct 27, 2025

Uh oh!

llvmbot commented Oct 27, 2025

Uh oh!

Lukacma commented Oct 29, 2025

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

125 participants

[AArch64] Add intrinsics for 9.6 crypto instructions #165241

[AArch64] Add intrinsics for 9.6 crypto instructions #165241

Uh oh!

Conversation

Lukacma commented Oct 27, 2025

Uh oh!

llvmbot commented Oct 27, 2025

Uh oh!

llvmbot commented Oct 27, 2025

Uh oh!

llvmbot commented Oct 27, 2025

Uh oh!

Lukacma commented Oct 29, 2025

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

125 participants