Add amx-avx512

sayantn · sayantn · commit 97dc3f6fd72a · 2025-11-01T05:20:32.000+05:30
diff --git a/crates/core_arch/src/lib.rs b/crates/core_arch/src/lib.rs
@@ -34,7 +34,8 @@
     f16,
     aarch64_unstable_target_feature,
     bigint_helper_methods,
-    funnel_shifts
+    funnel_shifts,
+    avx10_target_feature
 )]
 #![cfg_attr(test, feature(test, abi_vectorcall, stdarch_internal))]
 #![deny(clippy::missing_inline_in_public_items)]
diff --git a/crates/core_arch/src/x86_64/amx.rs b/crates/core_arch/src/x86_64/amx.rs
@@ -1,3 +1,5 @@
+use crate::core_arch::{simd::*, x86::*};
+
 #[cfg(test)]
 use stdarch_test::assert_instr;
 
@@ -380,6 +382,67 @@ pub unsafe fn _tile_mmultf32ps<const DST: i32, const A: i32, const B: i32>() {
     tmmultf32ps(DST as i8, A as i8, B as i8);
 }
 
+/// Moves the row selected by `row` from tile register `TILE` to a zmm register, converting its
+/// packed 32-bit signed integers to packed single-precision (32-bit) floating-point elements.
+#[inline]
+#[rustc_legacy_const_generics(0)]
+#[target_feature(enable = "amx-avx512,avx10.2")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(tcvtrowd2ps, TILE = 0)
+)]
+#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
+pub unsafe fn _tile_cvtrowd2ps<const TILE: i32>(row: u32) -> __m512 {
+    static_assert_uimm_bits!(TILE, 3); // compile-time check on the tile index immediate
+    tcvtrowd2ps(TILE as i8, row).as_m512() // intrinsic's f32x16 result as __m512
+}
+
+/// Moves the row selected by `row` from tile register `TILE` to a zmm register, converting its
+/// packed single-precision (32-bit) floating-point elements to packed half-precision (16-bit)
+/// floating-point elements, each placed in the high 16 bits of a 32-bit element of the result.
+#[inline]
+#[rustc_legacy_const_generics(0)]
+#[target_feature(enable = "amx-avx512,avx10.2")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(tcvtrowps2phh, TILE = 0)
+)]
+#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
+pub unsafe fn _tile_cvtrowps2phh<const TILE: i32>(row: u32) -> __m512h {
+    static_assert_uimm_bits!(TILE, 3); // compile-time check on the tile index immediate
+    tcvtrowps2phh(TILE as i8, row).as_m512h() // intrinsic's f16x32 result as __m512h
+}
+
+/// Moves the row selected by `row` from tile register `TILE` to a zmm register, converting its
+/// packed single-precision (32-bit) floating-point elements to packed half-precision (16-bit)
+/// floating-point elements, each placed in the low 16 bits of a 32-bit element of the result.
+#[inline]
+#[rustc_legacy_const_generics(0)]
+#[target_feature(enable = "amx-avx512,avx10.2")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(tcvtrowps2phl, TILE = 0)
+)]
+#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
+pub unsafe fn _tile_cvtrowps2phl<const TILE: i32>(row: u32) -> __m512h {
+    static_assert_uimm_bits!(TILE, 3); // compile-time check on the tile index immediate
+    tcvtrowps2phl(TILE as i8, row).as_m512h() // intrinsic's f16x32 result as __m512h
+}
+
+/// Moves the row selected by `row` from tile register `TILE` into a zmm vector register.
+#[inline]
+#[rustc_legacy_const_generics(0)]
+#[target_feature(enable = "amx-avx512,avx10.2")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(tilemovrow, TILE = 0)
+)]
+#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
+pub unsafe fn _tile_movrow<const TILE: i32>(row: u32) -> __m512i {
+    static_assert_uimm_bits!(TILE, 3); // compile-time check on the tile index immediate
+    tilemovrow(TILE as i8, row).as_m512i() // intrinsic's i32x16 result as __m512i
+}
+
 #[allow(improper_ctypes)]
 unsafe extern "C" {
     #[link_name = "llvm.x86.ldtilecfg"]
@@ -426,6 +489,14 @@ unsafe extern "C" {
     fn tileloaddrst164(dst: i8, base: *const u8, stride: usize);
     #[link_name = "llvm.x86.tmmultf32ps"]
     fn tmmultf32ps(dst: i8, a: i8, b: i8);
+    #[link_name = "llvm.x86.tcvtrowd2ps"]
+    fn tcvtrowd2ps(tile: i8, row: u32) -> f32x16;
+    #[link_name = "llvm.x86.tcvtrowps2phh"]
+    fn tcvtrowps2phh(tile: i8, row: u32) -> f16x32;
+    #[link_name = "llvm.x86.tcvtrowps2phl"]
+    fn tcvtrowps2phl(tile: i8, row: u32) -> f16x32;
+    #[link_name = "llvm.x86.tilemovrow"]
+    fn tilemovrow(tile: i8, row: u32) -> i32x16;
 }
 
 #[cfg(test)]