@@ -179,15 +179,19 @@ static void simd_reduce_logical_BOOL(npy_bool* op, npy_bool* ip, npy_intp len) {
179179 #if defined(NPY_HAVE_SSE2)
180180 NPY_PREFETCH (reinterpret_cast <const char *>(ip + wstep), 0 , 3 );
181181 #endif
182- vec_u8 v[UNROLL] = {};
183- for (int i = 0 ; i < UNROLL; i++) {
184- v[i] = hn::LoadU (u8 , ip + vstep * i);
185- }
186-
187- vec_u8 m01 = traits.reduce (v[0 ], v[1 ]);
188- vec_u8 m23 = traits.reduce (v[2 ], v[3 ]);
189- vec_u8 m45 = traits.reduce (v[4 ], v[5 ]);
190- vec_u8 m67 = traits.reduce (v[6 ], v[7 ]);
182+ vec_u8 v0 = hn::LoadU (u8 , ip);
183+ vec_u8 v1 = hn::LoadU (u8 , ip + vstep);
184+ vec_u8 v2 = hn::LoadU (u8 , ip + vstep * 2 );
185+ vec_u8 v3 = hn::LoadU (u8 , ip + vstep * 3 );
186+ vec_u8 v4 = hn::LoadU (u8 , ip + vstep * 4 );
187+ vec_u8 v5 = hn::LoadU (u8 , ip + vstep * 5 );
188+ vec_u8 v6 = hn::LoadU (u8 , ip + vstep * 6 );
189+ vec_u8 v7 = hn::LoadU (u8 , ip + vstep * 7 );
190+
191+ vec_u8 m01 = traits.reduce (v0, v1);
192+ vec_u8 m23 = traits.reduce (v2, v3);
193+ vec_u8 m45 = traits.reduce (v4, v5);
194+ vec_u8 m67 = traits.reduce (v6, v7);
191195
192196 vec_u8 m0123 = traits.reduce (m01, m23);
193197 vec_u8 m4567 = traits.reduce (m45, m67);
0 commit comments