Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,17 @@ edition = "2021"
ark-ff = "0.4.0"
ark-poly = "0.4.0"
ark-std ="0.4.0"
ark-serialize = "0.4.2"
num-bigint = "0.4"
num-traits = "0.2"
zeroize = "1.8.1"

[dev-dependencies]
criterion = "0.4"

[features]
simd = []

[[bench]]
name = "explanation"
harness = false
name = "reduce_sum_benches"
4 changes: 3 additions & 1 deletion benches/explanation.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
fn main() {
eprintln!("Error: This project uses a custom benchmarking workflow.");
eprintln!("Please navigate to the appropriate bench directory and call the shell './run_bench.sh' directly.");
eprintln!("Please choose a bench:");
eprintln!(" Full Protocol Benches: 'cd ./benches/sumcheck-benches/ && cargo build --release && ./run_benches.sh'");
eprintln!(" Lagrange Polynomial Benches: 'cd ./benches/lag-poly-benches/ && cargo build --release && ./run_benches.sh'");
std::process::exit(1);
}
6 changes: 5 additions & 1 deletion benches/lag-poly-benches/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

95 changes: 95 additions & 0 deletions benches/reduce_sum_benches.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#![feature(portable_simd)]

use ark_std::{
simd::{cmp::SimdPartialOrd, u32x4, Mask, Simd},
test_rng,
};
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use space_efficient_sumcheck::fields::{
aarch64_neon::reduce_sum_32_bit_modulus_asm, reduce_sum_naive, VecOps, M31, M31_MODULUS,
};

// TODO (z-tech): this is the benchmark we should hit with both Neon and AVX
const LANES: usize = 4;
pub fn reduce_sum_packed(values: &[u32]) -> u32 {
let packed_modulus: Simd<u32, LANES> = u32x4::splat(M31_MODULUS);
let mut packed_sums1: Simd<u32, LANES> = u32x4::splat(0);
let mut packed_sums2: Simd<u32, LANES> = u32x4::splat(0);
let mut packed_sums3: Simd<u32, LANES> = u32x4::splat(0);
let mut packed_sums4: Simd<u32, LANES> = u32x4::splat(0);
for i in (0..values.len()).step_by(16) {
let tmp_packed_sums_1: Simd<u32, LANES> =
packed_sums1 + u32x4::from_slice(&values[i..i + 4]);
let tmp_packed_sums_2: Simd<u32, LANES> =
packed_sums2 + u32x4::from_slice(&values[i + 4..i + 8]);
let tmp_packed_sums_3: Simd<u32, LANES> =
packed_sums3 + u32x4::from_slice(&values[i + 8..i + 12]);
let tmp_packed_sums_4: Simd<u32, LANES> =
packed_sums4 + u32x4::from_slice(&values[i + 12..i + 16]);
let is_mod_needed_1: Mask<i32, LANES> = tmp_packed_sums_1.simd_ge(packed_modulus);
let is_mod_needed_2: Mask<i32, LANES> = tmp_packed_sums_2.simd_ge(packed_modulus);
let is_mod_needed_3: Mask<i32, LANES> = tmp_packed_sums_3.simd_ge(packed_modulus);
let is_mod_needed_4: Mask<i32, LANES> = tmp_packed_sums_4.simd_ge(packed_modulus);
packed_sums1 =
is_mod_needed_1.select(tmp_packed_sums_1 - packed_modulus, tmp_packed_sums_1);
packed_sums2 =
is_mod_needed_2.select(tmp_packed_sums_2 - packed_modulus, tmp_packed_sums_2);
packed_sums3 =
is_mod_needed_3.select(tmp_packed_sums_3 - packed_modulus, tmp_packed_sums_3);
packed_sums4 =
is_mod_needed_4.select(tmp_packed_sums_4 - packed_modulus, tmp_packed_sums_4);
}
reduce_sum_naive(&packed_sums1.to_array())
+ reduce_sum_naive(&packed_sums2.to_array())
+ reduce_sum_naive(&packed_sums3.to_array())
+ reduce_sum_naive(&packed_sums4.to_array())
}

fn reduce_sum_naive_bench(c: &mut Criterion) {
let random_values: Vec<u32> = (0..2_i32.pow(13))
.map(|_| M31::rand(&mut test_rng()).to_u32())
.collect();

c.bench_function("reduce_sum_naive", |b| {
b.iter(|| black_box(reduce_sum_naive(&random_values)))
});
}

fn reduce_sum_simd_lib(c: &mut Criterion) {
let random_values: Vec<u32> = (0..2_i32.pow(13))
.map(|_| M31::rand(&mut test_rng()).to_u32())
.collect();

c.bench_function("reduce_sum_simd_lib", |b| {
b.iter(|| black_box(reduce_sum_packed(&random_values)))
});
}

fn reduce_sum_neon_intrinsics(c: &mut Criterion) {
let random_values: Vec<M31> = (0..2_i32.pow(13))
.map(|_| M31::rand(&mut test_rng()))
.collect();

c.bench_function("reduce_sum_neon_intrinsics", |b| {
b.iter(|| black_box(M31::reduce_sum(&random_values)))
});
}

fn reduce_sum_neon_asm(c: &mut Criterion) {
let random_values: Vec<u32> = (0..2_i32.pow(13))
.map(|_| M31::rand(&mut test_rng()).to_u32())
.collect();

c.bench_function("reduce_sum_neon_asm", |b| {
b.iter(|| black_box(reduce_sum_32_bit_modulus_asm(&random_values, M31_MODULUS)))
});
}

criterion_group!(
benches,
reduce_sum_naive_bench,
reduce_sum_simd_lib,
reduce_sum_neon_intrinsics,
reduce_sum_neon_asm,
);
criterion_main!(benches);
74 changes: 74 additions & 0 deletions src/fields/aarch64_neon/asm/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
use ark_std::arch::asm;

use crate::fields::m31::reduce_sum_naive;

pub fn reduce_sum_32_bit_modulus_asm(values: &[u32], modulus: u32) -> u32 {
let modulus: *const u32 = [modulus; 4].as_ptr();
let mut sums: [u32; 4] = [0; 4];
for step in (0..values.len()).step_by(4) {
let vals: *const u32 = unsafe { values.as_ptr().add(step) };

// TODO (z-tech): Again this should be unrolled, it's also important to understand if these loads / writes are not optimal
unsafe {
asm!(
// Load accumulated sums into register v0
"ldr q0, [{0}]",

// Load the new values into register v1
"ldr q1, [{1}]",

// Load the modulus into register v3
"ldr q3, [{2}]",

// Add values to accumulated sums and put result into v0
"add v0.4s, v0.4s, v1.4s",

// Subtract the modulus from the result and put it in v2
"sub v2.4s, v0.4s, v3.4s",

// Keep the minimum of those operations
"umin v0.4s, v0.4s, v2.4s",

// Load it back into sum accumulator
"st1 {{v0.4s}}, [{0}]",

inout(reg) sums.as_mut_ptr() => _,
in(reg) vals,
in(reg) modulus,
);
}
}

let arr: [u32; 4] = unsafe { core::mem::transmute(sums) };
reduce_sum_naive(&arr)
}

#[cfg(test)]
mod tests {
use crate::fields::{
aarch64_neon::reduce_sum_32_bit_modulus_asm,
m31::{M31, M31_MODULUS},
};
use ark_ff::Zero;
use ark_std::test_rng;

#[test]
fn reduce_sum_correctness() {
fn reduce_sum_sanity(vec: &[M31]) -> M31 {
M31::from(vec.iter().fold(M31::zero(), |acc, &x| (acc + x)))
}

let mut rng = test_rng();
let random_field_values: Vec<M31> = (0..1 << 13).map(|_| M31::rand(&mut rng)).collect();
let random_field_values_u32: Vec<u32> =
random_field_values.iter().map(|m| m.to_u32()).collect();
let exp = reduce_sum_sanity(&random_field_values);
assert_eq!(
exp,
M31::from(reduce_sum_32_bit_modulus_asm(
&random_field_values_u32,
M31_MODULUS
))
);
}
}
86 changes: 86 additions & 0 deletions src/fields/aarch64_neon/intrinsics/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
use ark_std::{
arch::aarch64::{
uint32x4_t, vaddq_u32, vandq_u32, vcgeq_u32, vdupq_n_u32, vld1q_u32, vminq_u32, vmlsq_u32,
vmulq_u32, vqdmulhq_s32, vreinterpretq_s32_u32, vreinterpretq_u32_s32, vst1q_u32,
vsubq_u32,
},
mem::transmute,
};

use crate::fields::m31::reduce_sum_naive;

#[inline(always)]
fn sum_vectors(v0: &mut uint32x4_t, v1: &uint32x4_t, packed_modulus: &uint32x4_t) {
let raw_sum = unsafe { vaddq_u32(*v0, *v1) };
let gte_mask = unsafe { vcgeq_u32(raw_sum, *packed_modulus) };
*v0 = unsafe { vsubq_u32(raw_sum, vandq_u32(*packed_modulus, gte_mask)) };
// an alternative to the above three lines is this, you can experiment to see which is more performant
// let sum1 = vaddq_u32(*v0, *v1);
// let sum2 = vsubq_u32(sum1, *packed_modulus);
// *v0 = vminq_u32(sum1, vandq_u32(*packed_modulus, sum2));
}

pub fn reduce_sum_32_bit_modulus(values: &[u32], modulus: u32) -> u32 {
let modulus: uint32x4_t = unsafe { transmute::<[u32; 4], uint32x4_t>([modulus; 4]) };
let mut sums: uint32x4_t = unsafe { vdupq_n_u32(0) };

// TODO (z-tech): This should be unrolled, you have to figure out how much unrolling is the sweet spot (try 16, 32, ...)
for step in (0..values.len()).step_by(4) {
let v: uint32x4_t = unsafe { vld1q_u32(values.as_ptr().add(step)) };
sum_vectors(&mut sums, &v, &modulus);
}

let arr: [u32; 4] = unsafe { transmute(sums) };
reduce_sum_naive(&arr)
}

pub fn scalar_mult_32_bit_modulus(values: &mut [u32], scalar: u32, modulus: u32) {
let packed_modulus: uint32x4_t = unsafe { transmute::<[u32; 4], uint32x4_t>([modulus; 4]) };
let packed_scalar: uint32x4_t = unsafe { transmute::<[u32; 4], uint32x4_t>([scalar; 4]) };
for step in (0..values.len()).step_by(4) {
unsafe {
let lhs = vld1q_u32(values.as_ptr().add(step));
let upper = vreinterpretq_u32_s32(vqdmulhq_s32(
vreinterpretq_s32_u32(lhs),
vreinterpretq_s32_u32(packed_scalar),
));
let lower = vmulq_u32(lhs, packed_scalar);
let t = vmlsq_u32(lower, upper, packed_modulus);
let res = vminq_u32(
vmlsq_u32(lower, upper, packed_modulus),
vsubq_u32(t, packed_modulus),
);
vst1q_u32(values.as_mut_ptr().add(step), res);
}
}
}

#[cfg(test)]
mod tests {
use crate::fields::{
aarch64_neon::reduce_sum_32_bit_modulus,
m31::{M31, M31_MODULUS},
};
use ark_ff::Zero;
use ark_std::test_rng;

#[test]
fn reduce_sum_correctness() {
fn reduce_sum_sanity(vec: &[M31]) -> M31 {
M31::from(vec.iter().fold(M31::zero(), |acc, &x| (acc + x)))
}

let mut rng = test_rng();
let random_field_values: Vec<M31> = (0..1 << 13).map(|_| M31::rand(&mut rng)).collect();
let random_field_values_u32: Vec<u32> =
random_field_values.iter().map(|m| m.to_u32()).collect();
let exp = reduce_sum_sanity(&random_field_values);
assert_eq!(
exp,
M31::from(reduce_sum_32_bit_modulus(
&random_field_values_u32,
M31_MODULUS
))
);
}
}
5 changes: 5 additions & 0 deletions src/fields/aarch64_neon/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
mod asm;
mod intrinsics;

pub use asm::reduce_sum_32_bit_modulus_asm;
pub use intrinsics::{reduce_sum_32_bit_modulus, scalar_mult_32_bit_modulus};
19 changes: 19 additions & 0 deletions src/fields/m31/fft_field.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
use super::M31;

use ark_ff::FftField;

// TODO (z-tech): These might be correct we must verify each one

impl FftField for M31 {
const GENERATOR: Self = M31 { value: 7 };

const TWO_ADICITY: u32 = 1;

const TWO_ADIC_ROOT_OF_UNITY: Self = M31 { value: 2147483646 };

const SMALL_SUBGROUP_BASE: Option<u32> = Some(3);

const SMALL_SUBGROUP_BASE_ADICITY: Option<u32> = Some(1);

const LARGE_SUBGROUP_ROOT_OF_UNITY: Option<Self> = Some(M31 { value: 6 });
}
Loading