@@ -179,15 +179,19 @@ static void simd_reduce_logical_BOOL(npy_bool* op, npy_bool* ip, npy_intp len) {
179
179
#if defined(NPY_HAVE_SSE2)
180
180
NPY_PREFETCH (reinterpret_cast <const char *>(ip + wstep), 0 , 3 );
181
181
#endif
182
- vec_u8 v[UNROLL] = {};
183
- for (int i = 0 ; i < UNROLL; i++) {
184
- v[i] = hn::LoadU (u8 , ip + vstep * i);
185
- }
186
-
187
- vec_u8 m01 = traits.reduce (v[0 ], v[1 ]);
188
- vec_u8 m23 = traits.reduce (v[2 ], v[3 ]);
189
- vec_u8 m45 = traits.reduce (v[4 ], v[5 ]);
190
- vec_u8 m67 = traits.reduce (v[6 ], v[7 ]);
182
+ vec_u8 v0 = hn::LoadU (u8 , ip);
183
+ vec_u8 v1 = hn::LoadU (u8 , ip + vstep);
184
+ vec_u8 v2 = hn::LoadU (u8 , ip + vstep * 2 );
185
+ vec_u8 v3 = hn::LoadU (u8 , ip + vstep * 3 );
186
+ vec_u8 v4 = hn::LoadU (u8 , ip + vstep * 4 );
187
+ vec_u8 v5 = hn::LoadU (u8 , ip + vstep * 5 );
188
+ vec_u8 v6 = hn::LoadU (u8 , ip + vstep * 6 );
189
+ vec_u8 v7 = hn::LoadU (u8 , ip + vstep * 7 );
190
+
191
+ vec_u8 m01 = traits.reduce (v0, v1);
192
+ vec_u8 m23 = traits.reduce (v2, v3);
193
+ vec_u8 m45 = traits.reduce (v4, v5);
194
+ vec_u8 m67 = traits.reduce (v6, v7);
191
195
192
196
vec_u8 m0123 = traits.reduce (m01, m23);
193
197
vec_u8 m4567 = traits.reduce (m45, m67);
0 commit comments