[AArch64] Add intrinsics support for multi-vector FMUL #163397

Lukacma · 2025-10-14T13:47:07Z

This patch adds intrinsics for multi-vector FMUL instructions introduced in armv9.6, based on this ACLE proposal.

Depends on #163536

llvmbot · 2025-10-14T13:47:42Z

@llvm/pr-subscribers-llvm-ir
@llvm/pr-subscribers-clang

@llvm/pr-subscribers-backend-aarch64

Author: None (Lukacma)

Changes

This patch adds intrinsics for multi-vector FMUL instructions introduced in armv9.6, based on this ACLE proposal.

Depends on #163346

Patch is 70.47 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/163397.diff

9 Files Affected:

(modified) clang/include/clang/Basic/arm_sve.td (+15)
(added) clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_bfmul.c (+76)
(added) clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_bfscale.c (+76)
(added) clang/test/CodeGen/AArch64/sme2p2-intrinsics/acle_sme2p2_fmul.c (+198)
(modified) llvm/include/llvm/IR/IntrinsicsAArch64.td (+2-2)
(modified) llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp (+40)
(added) llvm/test/CodeGen/AArch64/sme2-intrinsics-bfmul.ll (+56)
(added) llvm/test/CodeGen/AArch64/sme2-intrinsics-bfscale.ll (+56)
(added) llvm/test/CodeGen/AArch64/sme2p2-intrinsics-fmul.ll (+164)

diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index d2b7b78b9970f..f9402659b4254 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -2082,6 +2082,13 @@ let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2,sve-b16b16"in {
   defm SVBFMAXNM : BfSingleMultiVector<"maxnm">;
 }
 
+let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2,sve-bfscale" in {
+  // BFMUL
+  defm SVBFMUL : BfSingleMultiVector<"mul">;
+  // BFSCALE
+  defm SVBFSCALE : BfSingleMultiVector<"scale">;
+}
+
 let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2" in {
   // == ADD (vectors) ==
   def SVADD_SINGLE_X2 : SInst<"svadd[_single_{d}_x2]", "22d", "cUcsUsiUilUl", MergeNone, "aarch64_sve_add_single_x2", [IsStreaming], []>;
@@ -2389,3 +2396,11 @@ let SVETargetGuard = "sve2,fp8fma", SMETargetGuard = "ssve-fp8fma" in {
   def SVFMLALLTB_LANE : SInst<"svmlalltb_lane[_f32_mf8]", "dd~~i>", "f", MergeNone, "aarch64_sve_fp8_fmlalltb_lane", [VerifyRuntimeMode], [ImmCheck<3, ImmCheck0_7>]>;
   def SVFMLALLTT_LANE : SInst<"svmlalltt_lane[_f32_mf8]", "dd~~i>", "f", MergeNone, "aarch64_sve_fp8_fmlalltt_lane", [VerifyRuntimeMode], [ImmCheck<3, ImmCheck0_7>]>;
 }
+
+let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2p2" in {
+  def FMUL_SINGLE_X2 : SInst<"svmul[_single_{d}_x2]", "22d", "hfd", MergeNone, "aarch64_sve_fmul_single_x2", [IsStreaming], []>;
+  def FMUL_SINGLE_X4 : SInst<"svmul[_single_{d}_x4]", "44d", "hfd", MergeNone, "aarch64_sve_fmul_single_x4", [IsStreaming], []>;
+
+  def FMUL_X2 : SInst<"svmul[_{d}_x2]", "222", "hfd", MergeNone, "aarch64_sve_fmul_x2", [IsStreaming], []>;
+  def FMUL_X4 : SInst<"svmul[_{d}_x4]", "444", "hfd", MergeNone, "aarch64_sve_fmul_x4", [IsStreaming], []>;
+}
diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_bfmul.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_bfmul.c
new file mode 100644
index 0000000000000..187e9390f742c
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_bfmul.c
@@ -0,0 +1,76 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sve-bfscale -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sve-bfscale -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sve-bfscale -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sve-bfscale -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sve-bfscale -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sme.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED) A1
+#else
+#define SVE_ACLE_FUNC(A1,A2) A1##A2
+#endif
+
+// CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_svmul_single_bf16_x2(
+// CHECK-SAME: <vscale x 8 x bfloat> [[ZDN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fmul.single.x2.nxv8bf16(<vscale x 8 x bfloat> [[ZDN_COERCE0]], <vscale x 8 x bfloat> [[ZDN_COERCE1]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-NEXT:    ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @_Z25test_svmul_single_bf16_x214svbfloat16x2_tu14__SVBfloat16_t(
+// CPP-CHECK-SAME: <vscale x 8 x bfloat> [[ZDN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fmul.single.x2.nxv8bf16(<vscale x 8 x bfloat> [[ZDN_COERCE0]], <vscale x 8 x bfloat> [[ZDN_COERCE1]], <vscale x 8 x bfloat> [[ZM]])
+// CPP-CHECK-NEXT:    ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+svbfloat16x2_t test_svmul_single_bf16_x2(svbfloat16x2_t zdn, svbfloat16_t zm) __arm_streaming{
+  return SVE_ACLE_FUNC(svmul,_single_bf16_x2)(zdn, zm);
+}
+// CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_svmul_single_bf16_x4(
+// CHECK-SAME: <vscale x 8 x bfloat> [[ZDN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE2:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE3:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fmul.single.x4.nxv8bf16(<vscale x 8 x bfloat> [[ZDN_COERCE0]], <vscale x 8 x bfloat> [[ZDN_COERCE1]], <vscale x 8 x bfloat> [[ZDN_COERCE2]], <vscale x 8 x bfloat> [[ZDN_COERCE3]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-NEXT:    ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @_Z25test_svmul_single_bf16_x414svbfloat16x4_tu14__SVBfloat16_t(
+// CPP-CHECK-SAME: <vscale x 8 x bfloat> [[ZDN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE2:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE3:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fmul.single.x4.nxv8bf16(<vscale x 8 x bfloat> [[ZDN_COERCE0]], <vscale x 8 x bfloat> [[ZDN_COERCE1]], <vscale x 8 x bfloat> [[ZDN_COERCE2]], <vscale x 8 x bfloat> [[ZDN_COERCE3]], <vscale x 8 x bfloat> [[ZM]])
+// CPP-CHECK-NEXT:    ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+svbfloat16x4_t test_svmul_single_bf16_x4(svbfloat16x4_t zdn, svbfloat16_t zm) __arm_streaming{
+  return SVE_ACLE_FUNC(svmul,_single_bf16_x4)(zdn, zm);
+}
+// CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_svmul_bf16_x2(
+// CHECK-SAME: <vscale x 8 x bfloat> [[ZDN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fmul.x2.nxv8bf16(<vscale x 8 x bfloat> [[ZDN_COERCE0]], <vscale x 8 x bfloat> [[ZDN_COERCE1]], <vscale x 8 x bfloat> [[ZM_COERCE0]], <vscale x 8 x bfloat> [[ZM_COERCE1]])
+// CHECK-NEXT:    ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @_Z18test_svmul_bf16_x214svbfloat16x2_tS_(
+// CPP-CHECK-SAME: <vscale x 8 x bfloat> [[ZDN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fmul.x2.nxv8bf16(<vscale x 8 x bfloat> [[ZDN_COERCE0]], <vscale x 8 x bfloat> [[ZDN_COERCE1]], <vscale x 8 x bfloat> [[ZM_COERCE0]], <vscale x 8 x bfloat> [[ZM_COERCE1]])
+// CPP-CHECK-NEXT:    ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+svbfloat16x2_t test_svmul_bf16_x2(svbfloat16x2_t zdn, svbfloat16x2_t zm) __arm_streaming{
+  return SVE_ACLE_FUNC(svmul,_bf16_x2)(zdn, zm);
+}
+// CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_svmul_bf16_x4(
+// CHECK-SAME: <vscale x 8 x bfloat> [[ZDN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE2:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE3:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE2:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fmul.x4.nxv8bf16(<vscale x 8 x bfloat> [[ZDN_COERCE0]], <vscale x 8 x bfloat> [[ZDN_COERCE1]], <vscale x 8 x bfloat> [[ZDN_COERCE2]], <vscale x 8 x bfloat> [[ZDN_COERCE3]], <vscale x 8 x bfloat> [[ZM_COERCE0]], <vscale x 8 x bfloat> [[ZM_COERCE1]], <vscale x 8 x bfloat> [[ZM_COERCE2]], <vscale x 8 x bfloat> [[ZM_COERCE3]])
+// CHECK-NEXT:    ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @_Z18test_svmul_bf16_x414svbfloat16x4_tS_(
+// CPP-CHECK-SAME: <vscale x 8 x bfloat> [[ZDN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE2:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE3:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE2:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE3:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fmul.x4.nxv8bf16(<vscale x 8 x bfloat> [[ZDN_COERCE0]], <vscale x 8 x bfloat> [[ZDN_COERCE1]], <vscale x 8 x bfloat> [[ZDN_COERCE2]], <vscale x 8 x bfloat> [[ZDN_COERCE3]], <vscale x 8 x bfloat> [[ZM_COERCE0]], <vscale x 8 x bfloat> [[ZM_COERCE1]], <vscale x 8 x bfloat> [[ZM_COERCE2]], <vscale x 8 x bfloat> [[ZM_COERCE3]])
+// CPP-CHECK-NEXT:    ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+svbfloat16x4_t test_svmul_bf16_x4(svbfloat16x4_t zdn, svbfloat16x4_t zm) __arm_streaming{
+  return SVE_ACLE_FUNC(svmul,_bf16_x4)(zdn, zm);
+}
diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_bfscale.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_bfscale.c
new file mode 100644
index 0000000000000..6f8606c22954f
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_bfscale.c
@@ -0,0 +1,76 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sve-bfscale -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sve-bfscale -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sve-bfscale -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sve-bfscale -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sve-bfscale -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sme.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED) A1
+#else
+#define SVE_ACLE_FUNC(A1,A2) A1##A2
+#endif
+
+// CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_svscale_single_bf16_x2(
+// CHECK-SAME: <vscale x 8 x bfloat> [[ZDN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fscale.single.x2.nxv8bf16(<vscale x 8 x bfloat> [[ZDN_COERCE0]], <vscale x 8 x bfloat> [[ZDN_COERCE1]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-NEXT:    ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @_Z27test_svscale_single_bf16_x214svbfloat16x2_tu14__SVBfloat16_t(
+// CPP-CHECK-SAME: <vscale x 8 x bfloat> [[ZDN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fscale.single.x2.nxv8bf16(<vscale x 8 x bfloat> [[ZDN_COERCE0]], <vscale x 8 x bfloat> [[ZDN_COERCE1]], <vscale x 8 x bfloat> [[ZM]])
+// CPP-CHECK-NEXT:    ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+svbfloat16x2_t test_svscale_single_bf16_x2(svbfloat16x2_t zdn, svbfloat16_t zm) __arm_streaming{
+  return SVE_ACLE_FUNC(svscale,_single_bf16_x2)(zdn, zm);
+}
+// CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_svscale_single_bf16_x4(
+// CHECK-SAME: <vscale x 8 x bfloat> [[ZDN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE2:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE3:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fscale.single.x4.nxv8bf16(<vscale x 8 x bfloat> [[ZDN_COERCE0]], <vscale x 8 x bfloat> [[ZDN_COERCE1]], <vscale x 8 x bfloat> [[ZDN_COERCE2]], <vscale x 8 x bfloat> [[ZDN_COERCE3]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-NEXT:    ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @_Z27test_svscale_single_bf16_x414svbfloat16x4_tu14__SVBfloat16_t(
+// CPP-CHECK-SAME: <vscale x 8 x bfloat> [[ZDN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE2:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE3:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fscale.single.x4.nxv8bf16(<vscale x 8 x bfloat> [[ZDN_COERCE0]], <vscale x 8 x bfloat> [[ZDN_COERCE1]], <vscale x 8 x bfloat> [[ZDN_COERCE2]], <vscale x 8 x bfloat> [[ZDN_COERCE3]], <vscale x 8 x bfloat> [[ZM]])
+// CPP-CHECK-NEXT:    ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+svbfloat16x4_t test_svscale_single_bf16_x4(svbfloat16x4_t zdn, svbfloat16_t zm) __arm_streaming{
+  return SVE_ACLE_FUNC(svscale,_single_bf16_x4)(zdn, zm);
+}
+// CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_svscale_bf16_x2(
+// CHECK-SAME: <vscale x 8 x bfloat> [[ZDN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fscale.x2.nxv8bf16(<vscale x 8 x bfloat> [[ZDN_COERCE0]], <vscale x 8 x bfloat> [[ZDN_COERCE1]], <vscale x 8 x bfloat> [[ZM_COERCE0]], <vscale x 8 x bfloat> [[ZM_COERCE1]])
+// CHECK-NEXT:    ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @_Z20test_svscale_bf16_x214svbfloat16x2_tS_(
+// CPP-CHECK-SAME: <vscale x 8 x bfloat> [[ZDN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fscale.x2.nxv8bf16(<vscale x 8 x bfloat> [[ZDN_COERCE0]], <vscale x 8 x bfloat> [[ZDN_COERCE1]], <vscale x 8 x bfloat> [[ZM_COERCE0]], <vscale x 8 x bfloat> [[ZM_COERCE1]])
+// CPP-CHECK-NEXT:    ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+svbfloat16x2_t test_svscale_bf16_x2(svbfloat16x2_t zdn, svbfloat16x2_t zm) __arm_streaming{
+  return SVE_ACLE_FUNC(svscale,_bf16_x2)(zdn, zm);
+}
+// CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_svscale_bf16_x4(
+// CHECK-SAME: <vscale x 8 x bfloat> [[ZDN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE2:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE3:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE2:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fscale.x4.nxv8bf16(<vscale x 8 x bfloat> [[ZDN_COERCE0]], <vscale x 8 x bfloat> [[ZDN_COERCE1]], <vscale x 8 x bfloat> [[ZDN_COERCE2]], <vscale x 8 x bfloat> [[ZDN_COERCE3]], <vscale x 8 x bfloat> [[ZM_COERCE0]], <vscale x 8 x bfloat> [[ZM_COERCE1]], <vscale x 8 x bfloat> [[ZM_COERCE2]], <vscale x 8 x bfloat> [[ZM_COERCE3]])
+// CHECK-NEXT:    ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @_Z20test_svscale_bf16_x414svbfloat16x4_tS_(
+// CPP-CHECK-SAME: <vscale x 8 x bfloat> [[ZDN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE2:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE3:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE2:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE3:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fscale.x4.nxv8bf16(<vscale x 8 x bfloat> [[ZDN_COERCE0]], <vscale x 8 x bfloat> [[ZDN_COERCE1]], <vscale x 8 x bfloat> [[ZDN_COERCE2]], <vscale x 8 x bfloat> [[ZDN_COERCE3]], <vscale x 8 x bfloat> [[ZM_COERCE0]], <vscale x 8 x bfloat> [[ZM_COE...
[truncated]

jthackray

Can't spot anything obviously wrong. LGTM.

kmclaughlin-arm · 2025-11-11T16:11:39Z

clang/include/clang/Basic/arm_sve.td

+
+  def FMUL_X2 : SInst<"svmul[_{d}_x2]", "222", "hfd", MergeNone, "aarch64_sve_fmul_x2", [IsStreaming], []>;
+  def FMUL_X4 : SInst<"svmul[_{d}_x4]", "444", "hfd", MergeNone, "aarch64_sve_fmul_x4", [IsStreaming], []>;
+}


I think including the definitions here is correct, but I'm not sure whether it means that they will be added to arm_sve.h. If so, I think this is different from what the ACLE doc states:

The intrinsics in this section are defined by the header file [`<arm_sme.h>`](#arm_sme.h) when `__ARM_FEATURE_SME2p2` is defined.

Good point! I will suggest a fix in the ACLE.

kmclaughlin-arm · 2025-11-11T16:31:17Z

llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp

+    case Intrinsic::aarch64_sve_fmul_single_x2:
+      if (auto Op = SelectOpcodeFromVT<SelectTypeKind::FP>(
+              Node->getValueType(0),
+              {AArch64::BFMUL_2ZZ, AArch64::FMUL_2ZZ_H, AArch64::FMUL_2ZZ_S,


I see that other bfloat instructions handled here still use the _H suffix, is it needed for bfmul too?

I think that is just a coincidence, I don't think there is any strict naming scheme for these things.

CarolineConcatto

Oh, it looks like this patch is stacked on the previous one.
This patch also looks good!
I would update the commit message to state which intrinsics you are implementing here.

It looks like it is only this one:

//BFMUL:
svbfloat16x2_t svmul[_bf16_x2](svbfloat16x2_t zd, svbfloat16x2_t zm) __arm_streaming;
svbfloat16x2_t svmul[_single_bf16_x2](svbfloat16x2_t zd, svbfloat16_t zm) __arm_streaming;
svbfloat16x4_t svmul[_bf16_x4](svbfloat16x4_t zd, svbfloat16x4_t zm) __arm_streaming;
svbfloat16x4_t svmul[_single_bf16_x4](svbfloat16x4_t zd, svbfloat16_t zm) __arm_streaming;

Lukacma added 3 commits October 13, 2025 14:38

Add intrinsics

0d2b628

remove commented code

73aa7ee

[AArch64] Add intrinsics support for multi-vector FMUL

e128c90

Lukacma requested review from amilendra and jthackray October 14, 2025 13:47

llvmbot added clang Clang issues not falling into any other category backend:AArch64 clang:frontend Language frontend issues, e.g. anything involving "Sema" llvm:ir labels Oct 14, 2025

jthackray approved these changes Oct 14, 2025

View reviewed changes

kmclaughlin-arm reviewed Nov 11, 2025

View reviewed changes

CarolineConcatto approved these changes Dec 1, 2025

View reviewed changes

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[AArch64] Add intrinsics support for multi-vector FMUL #163397

[AArch64] Add intrinsics support for multi-vector FMUL #163397

Uh oh!

Lukacma commented Oct 14, 2025 •

edited

Loading

Uh oh!

llvmbot commented Oct 14, 2025 •

edited

Loading

Uh oh!

jthackray left a comment

Uh oh!

kmclaughlin-arm Nov 11, 2025

Uh oh!

Lukacma Nov 14, 2025

Uh oh!

kmclaughlin-arm Nov 11, 2025

Uh oh!

Lukacma Nov 14, 2025

Uh oh!

CarolineConcatto left a comment

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

5 participants

[AArch64] Add intrinsics support for multi-vector FMUL #163397

Are you sure you want to change the base?

[AArch64] Add intrinsics support for multi-vector FMUL #163397

Uh oh!

Conversation

Lukacma commented Oct 14, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Oct 14, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

jthackray left a comment

Choose a reason for hiding this comment

Uh oh!

kmclaughlin-arm Nov 11, 2025

Choose a reason for hiding this comment

Uh oh!

Lukacma Nov 14, 2025

Choose a reason for hiding this comment

Uh oh!

kmclaughlin-arm Nov 11, 2025

Choose a reason for hiding this comment

Uh oh!

Lukacma Nov 14, 2025

Choose a reason for hiding this comment

Uh oh!

CarolineConcatto left a comment

Choose a reason for hiding this comment

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

5 participants

Lukacma commented Oct 14, 2025 •

edited

Loading

llvmbot commented Oct 14, 2025 •

edited

Loading