Clean up dot_product example and README.md

miguelraz · miguelraz · commit 4b93386f824a · 2022-03-29T16:23:28.000-06:00
diff --git a/crates/core_simd/examples/README.md b/crates/core_simd/examples/README.md
@@ -10,10 +10,4 @@ Run the tests with the command
 cargo run --example dot_product
 ```
 
-and the benchmarks via the command
-
-```
-cargo run --example --benchmark ???
-```
-
-and measure the timings on your local system.
+and verify the code for `dot_product.rs` on your machine.
diff --git a/crates/core_simd/examples/dot_product.rs b/crates/core_simd/examples/dot_product.rs
@@ -1,6 +1,5 @@
 // Code taken from the `packed_simd` crate
 // Run this code with `cargo test --example dot_product`
-//use core::iter::zip;
 //use std::iter::zip;
 
 #![feature(array_chunks)]
@@ -72,17 +71,21 @@ pub fn dot_prod_simd_1(a: &[f32], b: &[f32]) -> f32 {
         .reduce_sum()
 }
 
-// 
+// A lot of knowledgeable use of SIMD comes from knowing which specific instructions are
+// available - let's try the `mul_add` method, which maps to the fused multiply-add we were looking for.
+use std_float::StdFloat;
 pub fn dot_prod_simd_2(a: &[f32], b: &[f32]) -> f32 {
     assert_eq!(a.len(), b.len());
     // TODO handle remainder when a.len() % 4 != 0
+    let mut res = f32x4::splat(0.0);
     a.array_chunks::<4>()
         .map(|&a| f32x4::from_array(a))
         .zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b)))
-        .fold(f32x4::splat(0.0), |acc, zipped| {acc + zipped.0 * zipped.1})
-        .reduce_sum()
+        .for_each(|(a,b)| { res = a.mul_add(b, res); });
+        res.reduce_sum()
 }
 
+// Finally, we will write the same operation, but this time handling the loop remainder.
 const LANES: usize = 4;
 pub fn dot_prod_simd_3(a: &[f32], b: &[f32]) -> f32 {
     assert_eq!(a.len(), b.len());
@@ -121,12 +124,16 @@ mod tests {
         let x: Vec<f32> = [0.5; 1003].to_vec();
         let y: Vec<f32> = [2.0; 1003].to_vec();
 
+        // Basic check
         assert_eq!(0.0, dot_prod_0(&a, &b));
         assert_eq!(0.0, dot_prod_1(&a, &b));
         assert_eq!(0.0, dot_prod_simd_0(&a, &b));
         assert_eq!(0.0, dot_prod_simd_1(&a, &b));
         assert_eq!(0.0, dot_prod_simd_2(&a, &b));
         assert_eq!(0.0, dot_prod_simd_3(&a, &b));
+
+        // We can handle vectors whose length is not a multiple of 4
         assert_eq!(1003.0, dot_prod_simd_3(&x, &y));
+
     }
 }