Skip to content
Open
Show file tree
Hide file tree
Changes from 27 commits
Commits
Show all changes
60 commits
Select commit Hold shift + click to select a range
1eee9ed
feat: added the skeleton structure of the x86 module
madhav-madhusoodanan Aug 2, 2025
07f20f4
feat: added the XML intrinsic parser for x86
madhav-madhusoodanan Aug 2, 2025
191614a
feat: updated intrinsics creation
madhav-madhusoodanan Aug 3, 2025
a952d3b
feat: update building C code for x86 architecture.
madhav-madhusoodanan Aug 3, 2025
417f729
fix: code cleanup
madhav-madhusoodanan Aug 3, 2025
4c19bd2
chore: added Regex crate, updated the structure of X86IntrinsicType
madhav-madhusoodanan Aug 5, 2025
3e3bf65
feat: implemented build_rust_file of `x86` module
madhav-madhusoodanan Aug 5, 2025
3e95708
feat: implemented compare_outputs of `x86` module
madhav-madhusoodanan Aug 5, 2025
fbb8214
feat: implement `print_result_c` for `Intrinsic<X86IntrinsicType>`
madhav-madhusoodanan Aug 5, 2025
6c69404
feat: Added x86 to CI pipeline
madhav-madhusoodanan Aug 5, 2025
0d62fe4
fix: update arch flags being sent to the x86 compilation command
madhav-madhusoodanan Aug 5, 2025
bd48f59
fix: set default value for varname and type fields of the
madhav-madhusoodanan Aug 5, 2025
8a69c61
fix: correcting semantic logic for setting vec_len
madhav-madhusoodanan Aug 5, 2025
83bc235
fix: more support for Mask types
madhav-madhusoodanan Sep 5, 2025
76359ef
fix: remove unused imports
madhav-madhusoodanan Sep 6, 2025
0c803aa
feat: implemented print_result_c in the case the target type is
madhav-madhusoodanan Sep 7, 2025
a7dac63
feat: implemented get_lane_function for x86
madhav-madhusoodanan Sep 7, 2025
68b2b2c
chore: update c_prefix for mask and print_result_c for vector type
madhav-madhusoodanan Sep 7, 2025
1951ed0
feat: handled extraction for 64-bit vector elements
madhav-madhusoodanan Sep 8, 2025
9726455
feat: add 8x8 case for get_lane_function for 64-bit vector
madhav-madhusoodanan Sep 8, 2025
1a07650
debug: printing self in case print_result_c fails.
madhav-madhusoodanan Sep 9, 2025
e79129a
chore: update x86 module, removed intrinsicDefinition trait, formatting
madhav-madhusoodanan Sep 10, 2025
8efed65
fixed bugs that caused errors with cpp file generation (un-handled
madhav-madhusoodanan Sep 13, 2025
33ead37
feat: correcting errors with generated C artifacts
madhav-madhusoodanan Sep 14, 2025
30e0642
fix: vec_len -> simd_len (an error was present due to setting vec_len…
madhav-madhusoodanan Sep 14, 2025
6f9e90e
feat: updating intrinsic-run execution bash script
madhav-madhusoodanan Sep 16, 2025
6fbdf07
chore: revert default target
madhav-madhusoodanan Sep 16, 2025
d3dbbd6
chore: adding comments about memory alignment of variables and bash s…
madhav-madhusoodanan Sep 17, 2025
adb8124
chore: adding backtracing for better debugging
madhav-madhusoodanan Sep 17, 2025
218c360
chore: add compilation flags
madhav-madhusoodanan Sep 17, 2025
e32b078
chore: add better error handling when writing and compiling mod_{i}.cpp,
madhav-madhusoodanan Sep 18, 2025
07024e9
feat: Fixed FP16 errors, made the loading function generation more
madhav-madhusoodanan Sep 20, 2025
5383867
chore: Ensuring "const" appears for constant arguments to intrinsics.
madhav-madhusoodanan Sep 24, 2025
7454872
chore: allowing cast() function to perform implicit type conversion for
madhav-madhusoodanan Sep 24, 2025
b28fc7a
feat: matching the expected number of elements for array to load
madhav-madhusoodanan Sep 24, 2025
f7f0d4e
feat: updated with debug printing and ostream implementation for vector
madhav-madhusoodanan Sep 24, 2025
2913908
chore: corrected the legal range of values for constrained arguments
madhav-madhusoodanan Sep 24, 2025
1e56470
feat: filter for duplicates in the definition of intrinsics
madhav-madhusoodanan Sep 24, 2025
39425f3
chore: vector types cannot be the type of an individual element in an
madhav-madhusoodanan Sep 24, 2025
51c8750
chore: accommodate the `immwidth` field for constraints
madhav-madhusoodanan Sep 24, 2025
52c0c08
feat: defined more load functions that are natively not defined (such as
madhav-madhusoodanan Sep 24, 2025
c5717c3
chore: corrected the imm-width correction location for _mm_mpsadbw_epu8
madhav-madhusoodanan Sep 24, 2025
06f1b0c
feat: added exclusion list to intrinsic-test CI pipeline
madhav-madhusoodanan Sep 24, 2025
9c1ec7d
chore: clean up unused variables
madhav-madhusoodanan Sep 24, 2025
c824690
feat: moved cast<T1, T2> to architecture-specific definitions
madhav-madhusoodanan Sep 27, 2025
3a1aab8
fix: remove extra brackets for cast definition in arm/config.rs
madhav-madhusoodanan Sep 27, 2025
3a4ae99
make `std::ostream& operator<<(std::ostream& os, float16_t value);`
madhav-madhusoodanan Sep 27, 2025
ab00695
feat: add missing_x86.txt to filter out intrinsics that cannot be tested
madhav-madhusoodanan Sep 27, 2025
06bb848
feat: added custom helper functions (that helped load intrinsic
madhav-madhusoodanan Sep 27, 2025
599b68f
chore: add more compiler flags for compiling x86 intrinsics in C++
madhav-madhusoodanan Sep 28, 2025
3a0fef0
chore: add verbose cli option to C++ compiler
madhav-madhusoodanan Sep 28, 2025
f123a9b
feat: add clang to dockerfile and change clang++-19 to clang++
madhav-madhusoodanan Sep 28, 2025
ce16379
fix: add `libstdc++-dev` to fix `iostream not found` error
madhav-madhusoodanan Sep 28, 2025
411fbcd
fix: making compilation step run one by one to prevent the process from
madhav-madhusoodanan Sep 29, 2025
ee2f1e7
feat: attempting compilation of smaller chunks for faster parallel
madhav-madhusoodanan Sep 29, 2025
4b06d4b
feat: add c_programs to PATH and increase chunk size to 400
madhav-madhusoodanan Sep 30, 2025
87e39a2
feat: display __mmask8 values so that non-utf8 values are not displayed
madhav-madhusoodanan Oct 2, 2025
3b77dc5
feat: add formatting for __m128i, __m256i, __m512i types that is similar
madhav-madhusoodanan Oct 3, 2025
9dbc078
feat: make the debug_i16 into a generic debug_as function that adapts to
madhav-madhusoodanan Oct 5, 2025
6c66eb7
feat: casting the results of the lane function by preserving the bits
madhav-madhusoodanan Oct 8, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 52 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 13 additions & 0 deletions ci/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,9 @@ fi
# Test targets compiled with extra features.
case ${TARGET} in
x86_64-unknown-linux-gnu)
TEST_CPPFLAGS="-fuse-ld=lld -I/usr/include/x86_64-linux-gnu/"
TEST_CXX_COMPILER="clang++-19"
TEST_RUNNER="${CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER}"
export STDARCH_DISABLE_ASSERT_INSTR=1

export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+avx"
Expand Down Expand Up @@ -181,6 +184,16 @@ case "${TARGET}" in
--linker "${CARGO_TARGET_AARCH64_BE_UNKNOWN_LINUX_GNU_LINKER}" \
--cxx-toolchain-dir "${AARCH64_BE_TOOLCHAIN}"
;;

x86_64-unknown-linux-gnu*)
CPPFLAGS="${TEST_CPPFLAGS}" RUSTFLAGS="${HOST_RUSTFLAGS}" RUST_LOG=warn \
CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER="true" \
cargo run "${INTRINSIC_TEST}" "${PROFILE}" \
--bin intrinsic-test -- intrinsics_data/x86-intel.xml \
--runner "${TEST_RUNNER}" \
--cppcompiler "${TEST_CXX_COMPILER}" \
--target "${TARGET}"
;;
*)
;;
esac
Expand Down
3 changes: 3 additions & 0 deletions crates/intrinsic-test/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,6 @@ pretty_env_logger = "0.5.0"
rayon = "1.5.0"
diff = "0.1.12"
itertools = "0.14.0"
quick-xml = { version = "0.37.5", features = ["serialize", "overlapped-lists"] }
serde-xml-rs = "0.8.0"
regex = "1.11.1"
12 changes: 11 additions & 1 deletion crates/intrinsic-test/src/arm/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,17 @@ impl SupportedArchitectureTest for ArmArchitectureTest {

const NOTICE: &str = config::NOTICE;

const PLATFORM_C_HEADERS: &[&str] = &["arm_neon.h", "arm_acle.h", "arm_fp16.h"];
const PLATFORM_C_HEADERS: &[&str] = &[
"iostream",
"cstring",
"iomanip",
"sstream",
"cstddef",
"cstdint",
"arm_neon.h",
"arm_acle.h",
"arm_fp16.h",
];
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why are additional headers needed on arm?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There were a couple of headers that were part of common/gen_c.rs which I had brought here.

Oh, I think cstddef and cstdint won't be necessary in arm though.

const PLATFORM_C_DEFINITIONS: &str = config::POLY128_OSTREAM_DEF;
const PLATFORM_C_FORWARD_DECLARATIONS: &str = config::POLY128_OSTREAM_DECL;

Expand Down
2 changes: 1 addition & 1 deletion crates/intrinsic-test/src/common/argument.rs
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ where
for arg in self.iter().filter(|&arg| !arg.has_constraint()) {
writeln!(
w,
"{indentation}const {ty} {name}_vals[] = {values};",
"{indentation}alignas(64) const {ty} {name}_vals[] = {values};",
Comment on lines -111 to +118
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

interesting, I suppose we're casting values to simd vectors and then performing some sort of aligned read? In any case, can you leave a comment on why the alignment is required?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Really, I added this as a general precautionary measure.

Fortunately, when I was working on x86, there were intrinsics that helped with unaligned reads (like _mm_loadu_epi16 and the like), but I'm not sure if the same could be said for other architectures too.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But yes, I'll add a comment on the same.

Copy link
Contributor Author

@madhav-madhusoodanan madhav-madhusoodanan Sep 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@folkertdev I wish to confer with you on this. I chose 64 so that it generalizes well across 16-byte, 32-byte and 64-byte alignment requirements (i.e. 128-bit, 256-bit and 512-bit vectors).

Would 64 be a good value, or is it overkill?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd go with either 64 or remove it completely; any other in-between value does not really make sense.

ty = arg.ty.c_scalar_type(),
name = arg.name,
values = arg.ty.populate_random(indentation, loads, &Language::C)
Expand Down
16 changes: 5 additions & 11 deletions crates/intrinsic-test/src/common/gen_c.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ pub fn generate_c_constraint_blocks<'a, T: IntrinsicTypeDefinition + 'a>(
let ty = current.ty.c_type();

writeln!(w, "{indentation}{{")?;
writeln!(w, "{body_indentation}{ty} {} = {i};", current.name)?;
writeln!(w, "{body_indentation}const {ty} {} = {i};", current.name)?;

generate_c_constraint_blocks(
w,
Expand Down Expand Up @@ -103,14 +103,11 @@ pub fn write_mod_cpp<T: IntrinsicTypeDefinition>(
writeln!(w, "#include <{header}>")?;
}

writeln!(w, "{}", forward_declarations)?;

writeln!(
w,
r#"
#include <iostream>
#include <cstring>
#include <iomanip>
#include <sstream>

template<typename T1, typename T2> T1 cast(T2 x) {{
static_assert(sizeof(T1) == sizeof(T2), "sizeof T1 and T2 must be the same");
T1 ret{{}};
Expand All @@ -120,13 +117,9 @@ template<typename T1, typename T2> T1 cast(T2 x) {{

std::ostream& operator<<(std::ostream& os, float16_t value);



"#
)?;

writeln!(w, "{}", forward_declarations)?;

for intrinsic in intrinsics {
create_c_test_function(w, intrinsic)?;
}
Expand All @@ -137,12 +130,13 @@ std::ostream& operator<<(std::ostream& os, float16_t value);
pub fn write_main_cpp<'a>(
w: &mut impl std::io::Write,
arch_specific_definitions: &str,
arch_specific_headers: &[&str],
intrinsics: impl Iterator<Item = &'a str> + Clone,
) -> std::io::Result<()> {
writeln!(w, "#include <iostream>")?;
writeln!(w, "#include <string>")?;

for header in ["arm_neon.h", "arm_acle.h", "arm_fp16.h"] {
for header in arch_specific_headers {
writeln!(w, "#include <{header}>")?;
}

Expand Down
6 changes: 3 additions & 3 deletions crates/intrinsic-test/src/common/gen_rust.rs
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ pub fn generate_rust_test_loop<T: IntrinsicTypeDefinition>(
w: &mut impl std::io::Write,
intrinsic: &Intrinsic<T>,
indentation: Indentation,
specializations: &[Vec<u8>],
specializations: &[Vec<i64>],
passes: u32,
) -> std::io::Result<()> {
let intrinsic_name = &intrinsic.name;
Expand Down Expand Up @@ -255,15 +255,15 @@ pub fn generate_rust_test_loop<T: IntrinsicTypeDefinition>(
/// Generate the specializations (unique sequences of const-generic arguments) for this intrinsic.
fn generate_rust_specializations<'a>(
constraints: &mut impl Iterator<Item = impl Iterator<Item = i64>>,
) -> Vec<Vec<u8>> {
) -> Vec<Vec<i64>> {
let mut specializations = vec![vec![]];

for constraint in constraints {
specializations = constraint
.flat_map(|right| {
specializations.iter().map(move |left| {
let mut left = left.clone();
left.push(u8::try_from(right).unwrap());
left.push(i64::try_from(right).unwrap());
left
})
})
Expand Down
50 changes: 39 additions & 11 deletions crates/intrinsic-test/src/common/intrinsic_helpers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,10 @@ impl TypeKind {
Self::Float => "float",
Self::Int(Sign::Signed) => "int",
Self::Int(Sign::Unsigned) => "uint",
Self::Mask => "uint",
Self::Poly => "poly",
Self::Char(Sign::Signed) => "char",
Self::Vector => "int",
_ => unreachable!("Not used: {:#?}", self),
}
}
Expand Down Expand Up @@ -131,7 +133,7 @@ impl IntrinsicType {
if let Some(bl) = self.bit_len {
bl
} else {
unreachable!("")
unreachable!("{:#?}", self)
}
}

Expand All @@ -154,6 +156,7 @@ impl IntrinsicType {
pub fn c_scalar_type(&self) -> String {
match self.kind() {
TypeKind::Char(_) => String::from("char"),
TypeKind::Vector => String::from("int32_t"),
_ => format!(
"{prefix}{bits}_t",
prefix = self.kind().c_prefix(),
Expand All @@ -162,14 +165,6 @@ impl IntrinsicType {
}
}

pub fn rust_scalar_type(&self) -> String {
format!(
"{prefix}{bits}",
prefix = self.kind().rust_prefix(),
bits = self.inner_size()
)
}

pub fn c_promotion(&self) -> &str {
match *self {
IntrinsicType {
Expand Down Expand Up @@ -222,7 +217,8 @@ impl IntrinsicType {
match self {
IntrinsicType {
bit_len: Some(bit_len @ (8 | 16 | 32 | 64)),
kind: kind @ (TypeKind::Int(_) | TypeKind::Poly | TypeKind::Char(_)),
kind:
kind @ (TypeKind::Int(_) | TypeKind::Poly | TypeKind::Char(_) | TypeKind::Mask),
simd_len,
vec_len,
..
Expand Down Expand Up @@ -283,6 +279,29 @@ impl IntrinsicType {
)))
)
}
IntrinsicType {
kind: TypeKind::Vector,
bit_len: Some(bit_len @ (128 | 256 | 512)),
simd_len,
..
} => {
let (prefix, suffix) = match language {
Language::Rust => ("[", "]"),
Language::C => ("{", "}"),
};
let body_indentation = indentation.nested();
let effective_bit_len = 32;
let effective_vec_len = bit_len / effective_bit_len;
format!(
"{prefix}\n{body}\n{indentation}{suffix}",
body = (0..(simd_len.unwrap_or(1) * effective_vec_len + loads - 1))
.format_with(",\n", |i, fmt| {
let src = value_for_array(effective_bit_len, i);
assert!(src == 0 || src.ilog2() < *bit_len);
fmt(&format_args!("{body_indentation}{src:#x}"))
})
)
}
_ => unimplemented!("populate random: {:#?}", self),
}
}
Expand All @@ -298,7 +317,7 @@ impl IntrinsicType {
kind: TypeKind::Int(_) | TypeKind::Poly,
..
} => true,
_ => unimplemented!(),
_ => true,
}
}

Expand Down Expand Up @@ -330,4 +349,13 @@ pub trait IntrinsicTypeDefinition: Deref<Target = IntrinsicType> {
/// rust debug output format for the return type. The generated line assumes
/// there is an int i in scope which is the current pass number.
fn print_result_c(&self, indentation: Indentation, additional: &str) -> String;

/// To enable architecture-specific logic
fn rust_scalar_type(&self) -> String {
format!(
"{prefix}{bits}",
prefix = self.kind().rust_prefix(),
bits = self.inner_size()
)
}
}
10 changes: 10 additions & 0 deletions crates/intrinsic-test/src/common/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use std::fs::File;
use std::io::{self, Write};

use rayon::prelude::*;

Expand Down Expand Up @@ -76,6 +77,14 @@ pub trait SupportedArchitectureTest {
if let Some(cpp_compiler) = cpp_compiler_wrapped.as_ref() {
let output = cpp_compiler
.compile_object_file(&format!("mod_{i}.cpp"), &format!("mod_{i}.o"))?;
if !output.status.success() {
io::stdout()
.write_all(&output.stdout)
.expect("Failed to write to stdout!");
io::stderr()
.write_all(&output.stderr)
.expect("Failed to write to stderr!");
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the debug print that is part of the assert should already print the stdout and stderr data. Is this to get better formatting?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh yes, you're right.
I hadn't noticed the debug print of the assert, my bad.

assert!(output.status.success(), "{output:?}");
}

Expand All @@ -88,6 +97,7 @@ pub trait SupportedArchitectureTest {
write_main_cpp(
&mut file,
Self::PLATFORM_C_DEFINITIONS,
Self::PLATFORM_C_HEADERS,
self.intrinsics().iter().map(|i| i.name.as_str()),
)
.unwrap();
Expand Down
3 changes: 3 additions & 0 deletions crates/intrinsic-test/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@ extern crate log;

mod arm;
mod common;
mod x86;

use arm::ArmArchitectureTest;
use common::SupportedArchitectureTest;
use common::cli::{Cli, ProcessedCli};
use x86::X86ArchitectureTest;

fn main() {
pretty_env_logger::init();
Expand All @@ -18,6 +20,7 @@ fn main() {
| "armv7-unknown-linux-gnueabihf"
| "aarch64_be-unknown-linux-gnu" => run(ArmArchitectureTest::create(processed_cli_options)),

"x86_64-unknown-linux-gnu" => run(X86ArchitectureTest::create(processed_cli_options)),
_ => std::process::exit(0),
}
}
Expand Down
Loading