1
1
//! Code taken from the `packed_simd` crate.
2
2
//! Run this code with `cargo test --example dot_product`.
3
3
4
- #![ feature( array_chunks) ]
5
- #![ feature( slice_as_chunks) ]
6
4
// Add these imports to use the stdsimd library
7
5
#![ feature( portable_simd) ]
8
6
use core_simd:: simd:: prelude:: * ;
@@ -33,7 +31,7 @@ pub fn dot_prod_scalar_1(a: &[f32], b: &[f32]) -> f32 {
33
31
}
34
32
35
33
// We now move on to the SIMD implementations: notice the following constructs:
36
- // `array_chunks ::<4>`: mapping this over the vector will let use construct SIMD vectors
34
+ // `as_chunks ::<4>`: mapping this over the vector will let us construct SIMD vectors
37
35
// `f32x4::from_array`: construct the SIMD vector from an array of 4 `f32`s (one chunk)
38
36
// `(a * b).reduce_sum()`: Multiply both f32x4 vectors together, and then reduce them.
39
37
// This approach essentially uses SIMD to produce a vector of length N/4 of all the products,
@@ -42,9 +40,11 @@ pub fn dot_prod_scalar_1(a: &[f32], b: &[f32]) -> f32 {
42
40
pub fn dot_prod_simd_0 ( a : & [ f32 ] , b : & [ f32 ] ) -> f32 {
43
41
assert_eq ! ( a. len( ) , b. len( ) ) ;
44
42
// TODO handle remainder when a.len() % 4 != 0
45
- a. array_chunks :: < 4 > ( )
43
+ a. as_chunks :: < 4 > ( )
44
+ . 0
45
+ . iter ( )
46
46
. map ( |& a| f32x4:: from_array ( a) )
47
- . zip ( b. array_chunks :: < 4 > ( ) . map ( |& b| f32x4:: from_array ( b) ) )
47
+ . zip ( b. as_chunks :: < 4 > ( ) . 0 . iter ( ) . map ( |& b| f32x4:: from_array ( b) ) )
48
48
. map ( |( a, b) | ( a * b) . reduce_sum ( ) )
49
49
. sum ( )
50
50
}
@@ -60,9 +60,11 @@ pub fn dot_prod_simd_0(a: &[f32], b: &[f32]) -> f32 {
60
60
pub fn dot_prod_simd_1 ( a : & [ f32 ] , b : & [ f32 ] ) -> f32 {
61
61
assert_eq ! ( a. len( ) , b. len( ) ) ;
62
62
// TODO handle remainder when a.len() % 4 != 0
63
- a. array_chunks :: < 4 > ( )
63
+ a. as_chunks :: < 4 > ( )
64
+ . 0
65
+ . iter ( )
64
66
. map ( |& a| f32x4:: from_array ( a) )
65
- . zip ( b. array_chunks :: < 4 > ( ) . map ( |& b| f32x4:: from_array ( b) ) )
67
+ . zip ( b. as_chunks :: < 4 > ( ) . 0 . iter ( ) . map ( |& b| f32x4:: from_array ( b) ) )
66
68
. fold ( f32x4:: splat ( 0.0 ) , |acc, zipped| acc + zipped. 0 * zipped. 1 )
67
69
. reduce_sum ( )
68
70
}
@@ -74,9 +76,11 @@ pub fn dot_prod_simd_2(a: &[f32], b: &[f32]) -> f32 {
74
76
assert_eq ! ( a. len( ) , b. len( ) ) ;
75
77
// TODO handle remainder when a.len() % 4 != 0
76
78
let mut res = f32x4:: splat ( 0.0 ) ;
77
- a. array_chunks :: < 4 > ( )
79
+ a. as_chunks :: < 4 > ( )
80
+ . 0
81
+ . iter ( )
78
82
. map ( |& a| f32x4:: from_array ( a) )
79
- . zip ( b. array_chunks :: < 4 > ( ) . map ( |& b| f32x4:: from_array ( b) ) )
83
+ . zip ( b. as_chunks :: < 4 > ( ) . 0 . iter ( ) . map ( |& b| f32x4:: from_array ( b) ) )
80
84
. for_each ( |( a, b) | {
81
85
res = a. mul_add ( b, res) ;
82
86
} ) ;
@@ -113,9 +117,11 @@ pub fn dot_prod_simd_3(a: &[f32], b: &[f32]) -> f32 {
113
117
// next example.
114
118
pub fn dot_prod_simd_4 ( a : & [ f32 ] , b : & [ f32 ] ) -> f32 {
115
119
let mut sum = a
116
- . array_chunks :: < 4 > ( )
120
+ . as_chunks :: < 4 > ( )
121
+ . 0
122
+ . iter ( )
117
123
. map ( |& a| f32x4:: from_array ( a) )
118
- . zip ( b. array_chunks :: < 4 > ( ) . map ( |& b| f32x4:: from_array ( b) ) )
124
+ . zip ( b. as_chunks :: < 4 > ( ) . 0 . iter ( ) . map ( |& b| f32x4:: from_array ( b) ) )
119
125
. map ( |( a, b) | a * b)
120
126
. fold ( f32x4:: splat ( 0.0 ) , std:: ops:: Add :: add)
121
127
. reduce_sum ( ) ;
@@ -131,9 +137,11 @@ pub fn dot_prod_simd_4(a: &[f32], b: &[f32]) -> f32 {
131
137
// This version allocates a single `XMM` register for accumulation, and the folds don't allocate on top of that.
132
138
// Notice the use of `mul_add`, which can do a multiply and an add operation per iteration.
133
139
pub fn dot_prod_simd_5 ( a : & [ f32 ] , b : & [ f32 ] ) -> f32 {
134
- a. array_chunks :: < 4 > ( )
140
+ a. as_chunks :: < 4 > ( )
141
+ . 0
142
+ . iter ( )
135
143
. map ( |& a| f32x4:: from_array ( a) )
136
- . zip ( b. array_chunks :: < 4 > ( ) . map ( |& b| f32x4:: from_array ( b) ) )
144
+ . zip ( b. as_chunks :: < 4 > ( ) . 0 . iter ( ) . map ( |& b| f32x4:: from_array ( b) ) )
137
145
. fold ( f32x4:: splat ( 0. ) , |acc, ( a, b) | a. mul_add ( b, acc) )
138
146
. reduce_sum ( )
139
147
}
0 commit comments