// Code taken from the `packed_simd` crate
// Run this code with `cargo test --example dot_product`
+ //use core::iter::zip;
+ //use std::iter::zip;
+
#![feature(array_chunks)]
+ #![feature(slice_as_chunks)]
+ // Add these imports to use the stdsimd library
+ #![feature(portable_simd)]
use core_simd::*;
- /// This is your barebones dot product implementation:
- /// Take 2 vectors, multiply them element wise and *then*
- /// add up the result. In the next example we will see if there
- /// is any difference to adding as we go along multiplying.
+ // This is your barebones dot product implementation:
+ // Take 2 vectors, multiply them element-wise and *then*
+ // go along the resulting array and add up the result.
+ // In the next example we will see if there
+ // is any difference to adding and multiplying in tandem.
pub fn dot_prod_0(a: &[f32], b: &[f32]) -> f32 {
    assert_eq!(a.len(), b.len());

    a.iter()
        .zip(b.iter())
-         .map(|a, b| a * b)
+         .map(|(a, b)| a * b)
        .sum()
}
+ // When dealing with SIMD, it is very important to think about the amount
+ // of data movement and when it happens. We're going over simple computation examples here,
+ // and yet it is not trivial to understand what may or may not contribute to performance
+ // changes. Eventually, you will need tools to inspect the generated assembly, plus
+ // benchmarks, to confirm your hypotheses - we will mention them later on.
+ // With the use of `fold`, we're doing a multiplication,
+ // and then adding it to the sum, one element from both vectors at a time.
pub fn dot_prod_1(a: &[f32], b: &[f32]) -> f32 {
    assert_eq!(a.len(), b.len());
    a.iter()
        .zip(b.iter())
-         .fold(0.0, |a, b| a * b)
+         .fold(0.0, |a, zipped| a + zipped.0 * zipped.1)
}
+ // We now move on to the SIMD implementations: notice the following constructs:
+ // `array_chunks::<4>`: mapping this over the vector will let us construct SIMD vectors
+ // `f32x4::from_array`: construct the SIMD vector from an array chunk
+ // `(a * b).reduce_sum()`: multiply both f32x4 vectors together, and then reduce them to a single scalar.
+ // This approach essentially uses SIMD to produce a vector of length N/4 of all the products,
+ // and then adds those up with `sum()`. This is suboptimal.
+ // TODO: ASCII diagrams
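+ // A rough per-chunk sketch of the data flow described above:
+ //
+ //   a chunk: [ a0  a1  a2  a3 ]     b chunk: [ b0  b1  b2  b3 ]
+ //                  |  lane-wise multiply (a * b)
+ //                  v
+ //            [ a0*b0  a1*b1  a2*b2  a3*b3 ]
+ //                  |  horizontal `reduce_sum()` per chunk
+ //                  v
+ //            a0*b0 + a1*b1 + a2*b2 + a3*b3    // one scalar per chunk, added up by `sum()`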
pub fn dot_prod_simd_0(a: &[f32], b: &[f32]) -> f32 {
    assert_eq!(a.len(), b.len());
-
    // TODO handle remainder when a.len() % 4 != 0
    a.array_chunks::<4>()
        .map(|&a| f32x4::from_array(a))
        .zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b)))
-         .map(|(a, b)| (a * b).horizontal_sum())
+         .map(|(a, b)| (a * b).reduce_sum())
        .sum()
}
+ // There are some simple ways to improve the previous code:
+ // 1. Make a `zero` `f32x4` SIMD vector that we will be accumulating into,
+ //    so that there is only one horizontal reduction (`reduce_sum()`), performed once the last
+ //    `f32x4` has been processed.
+ // 2. Exploit fused multiply-add so that the multiplication, addition and sinking into the reduction
+ //    happen in the same step (a sketch appears below, after `dot_prod_simd_2`).
+ // If the arrays are large, minimizing the data shuffling will lead to great performance.
+ // If the arrays are small, handling the remainder elements when the length isn't a multiple of 4
+ // can become a problem.
+ pub fn dot_prod_simd_1(a: &[f32], b: &[f32]) -> f32 {
+     assert_eq!(a.len(), b.len());
+     // TODO handle remainder when a.len() % 4 != 0
+     a.array_chunks::<4>()
+         .map(|&a| f32x4::from_array(a))
+         .zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b)))
+         .fold(f32x4::splat(0.0), |acc, zipped| acc + zipped.0 * zipped.1)
+         .reduce_sum()
+ }
+
+ // Currently this is identical to `dot_prod_simd_1`; a natural next step for it is the
+ // fused multiply-add idea described above.
+ pub fn dot_prod_simd_2(a: &[f32], b: &[f32]) -> f32 {
+     assert_eq!(a.len(), b.len());
+     // TODO handle remainder when a.len() % 4 != 0
+     a.array_chunks::<4>()
+         .map(|&a| f32x4::from_array(a))
+         .zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b)))
+         .fold(f32x4::splat(0.0), |acc, zipped| acc + zipped.0 * zipped.1)
+         .reduce_sum()
+ }
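+
+ // A sketch of improvement 2 from above: fuse the multiply and the add into one operation.
+ // This assumes a fused multiply-add method such as `mul_add` (in std's portable SIMD it is
+ // provided by the `StdFloat` trait) is in scope for `f32x4`; the function name is arbitrary.
+ pub fn dot_prod_simd_fma_sketch(a: &[f32], b: &[f32]) -> f32 {
+     assert_eq!(a.len(), b.len());
+     // TODO handle remainder when a.len() % 4 != 0
+     a.array_chunks::<4>()
+         .map(|&a| f32x4::from_array(a))
+         .zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b)))
+         // `a.mul_add(b, acc)` computes `a * b + acc` as a single fused operation per lane.
+         .fold(f32x4::splat(0.0), |acc, (a, b)| a.mul_add(b, acc))
+         .reduce_sum()
+ }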
+
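+ // `dot_prod_simd_3` finally handles the remainder: `as_rchunks` splits each slice into a
+ // short prefix (`*_extra`, at most LANES - 1 elements) plus aligned `[f32; LANES]` chunks.
+ // The prefix products seed the accumulator lanes, every chunk is then accumulated lane-wise,
+ // and a single `reduce_sum()` at the end collapses the partial sums.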
+ const LANES: usize = 4;
+ pub fn dot_prod_simd_3(a: &[f32], b: &[f32]) -> f32 {
+     assert_eq!(a.len(), b.len());
+
+     let (a_extra, a_chunks) = a.as_rchunks();
+     let (b_extra, b_chunks) = b.as_rchunks();
+
+     // These are always true, but for emphasis:
+     assert_eq!(a_chunks.len(), b_chunks.len());
+     assert_eq!(a_extra.len(), b_extra.len());
+
+     let mut sums = [0.0; LANES];
+     for ((x, y), d) in std::iter::zip(a_extra, b_extra).zip(&mut sums) {
+         *d = x * y;
+     }
+
+     let mut sums = f32x4::from_array(sums);
+     std::iter::zip(a_chunks, b_chunks).for_each(|(x, y)| {
+         sums += f32x4::from_array(*x) * f32x4::from_array(*y);
+     });
+
+     sums.reduce_sum()
+ }
fn main() {
    // Empty main to make cargo happy
}
@@ -45,10 +118,15 @@ mod tests {
        use super::*;
        let a: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let b: Vec<f32> = vec![-8.0, -7.0, -6.0, -5.0, 4.0, 3.0, 2.0, 1.0];
+         let x: Vec<f32> = [0.5; 1003].to_vec();
+         let y: Vec<f32> = [2.0; 1003].to_vec();

        assert_eq!(0.0, dot_prod_0(&a, &b));
        assert_eq!(0.0, dot_prod_1(&a, &b));
        assert_eq!(0.0, dot_prod_simd_0(&a, &b));
        assert_eq!(0.0, dot_prod_simd_1(&a, &b));
+         assert_eq!(0.0, dot_prod_simd_2(&a, &b));
+         assert_eq!(0.0, dot_prod_simd_3(&a, &b));
+         assert_eq!(1003.0, dot_prod_simd_3(&x, &y));
    }
}