@@ -301,35 +301,40 @@ class SimdOpCheckARM : public SimdOpCheckTest {
         if (target.os != Target::IOS && target.os != Target::OSX) {
             // VLD* are not profitable on Apple silicon
 
+            // Even on non-Apple silicon, LLVM occasionally decides it's
+            // more profitable to use shuffles, so make sure we use both end
+            // points in the loaded vector so that a vld{2,3,4} is safe and
+            // useful.
+            auto ld = [&](auto buf, int stride) {
+                return max(buf(x * stride), buf(x * stride + stride - 1));
+            };
+
             // VLD2 X - Load Two-Element Structures
-            // These need to be vectorized at least 2 native vectors wide,
-            // so we get a full vectors' worth that we know is safe to
-            // access.
-            check(arm32 ? "vld2.8" : "ld2", 32 * w, in_i8(x * 2));
-            check(arm32 ? "vld2.8" : "ld2", 32 * w, in_u8(x * 2));
-            check(arm32 ? "vld2.16" : "ld2", 16 * w, in_i16(x * 2));
-            check(arm32 ? "vld2.16" : "ld2", 16 * w, in_u16(x * 2));
-            check(arm32 ? "vld2.32" : "ld2", 8 * w, in_i32(x * 2));
-            check(arm32 ? "vld2.32" : "ld2", 8 * w, in_u32(x * 2));
-            check(arm32 ? "vld2.32" : "ld2", 8 * w, in_f32(x * 2));
+            check(arm32 ? "vld2.8" : "ld2", 32 * w, ld(in_i8, 2));
+            check(arm32 ? "vld2.8" : "ld2", 32 * w, ld(in_u8, 2));
+            check(arm32 ? "vld2.16" : "ld2", 16 * w, ld(in_i16, 2));
+            check(arm32 ? "vld2.16" : "ld2", 16 * w, ld(in_u16, 2));
+            check(arm32 ? "vld2.32" : "ld2", 8 * w, ld(in_i32, 2));
+            check(arm32 ? "vld2.32" : "ld2", 8 * w, ld(in_u32, 2));
+            check(arm32 ? "vld2.32" : "ld2", 8 * w, ld(in_f32, 2));
 
             // VLD3 X - Load Three-Element Structures
-            check(arm32 ? "vld3.8" : "ld3", 32 * w, in_i8(x * 3));
-            check(arm32 ? "vld3.8" : "ld3", 32 * w, in_u8(x * 3));
-            check(arm32 ? "vld3.16" : "ld3", 16 * w, in_i16(x * 3));
-            check(arm32 ? "vld3.16" : "ld3", 16 * w, in_u16(x * 3));
-            check(arm32 ? "vld3.32" : "ld3", 8 * w, in_i32(x * 3));
-            check(arm32 ? "vld3.32" : "ld3", 8 * w, in_u32(x * 3));
-            check(arm32 ? "vld3.32" : "ld3", 8 * w, in_f32(x * 3));
+            check(arm32 ? "vld3.8" : "ld3", 32 * w, ld(in_i8, 3));
+            check(arm32 ? "vld3.8" : "ld3", 32 * w, ld(in_u8, 3));
+            check(arm32 ? "vld3.16" : "ld3", 16 * w, ld(in_i16, 3));
+            check(arm32 ? "vld3.16" : "ld3", 16 * w, ld(in_u16, 3));
+            check(arm32 ? "vld3.32" : "ld3", 8 * w, ld(in_i32, 3));
+            check(arm32 ? "vld3.32" : "ld3", 8 * w, ld(in_u32, 3));
+            check(arm32 ? "vld3.32" : "ld3", 8 * w, ld(in_f32, 3));
 
             // VLD4 X - Load Four-Element Structures
-            check(arm32 ? "vld4.8" : "ld4", 32 * w, in_i8(x * 4));
-            check(arm32 ? "vld4.8" : "ld4", 32 * w, in_u8(x * 4));
-            check(arm32 ? "vld4.16" : "ld4", 16 * w, in_i16(x * 4));
-            check(arm32 ? "vld4.16" : "ld4", 16 * w, in_u16(x * 4));
-            check(arm32 ? "vld4.32" : "ld4", 8 * w, in_i32(x * 4));
-            check(arm32 ? "vld4.32" : "ld4", 8 * w, in_u32(x * 4));
-            check(arm32 ? "vld4.32" : "ld4", 8 * w, in_f32(x * 4));
+            check(arm32 ? "vld4.8" : "ld4", 32 * w, ld(in_i8, 4));
+            check(arm32 ? "vld4.8" : "ld4", 32 * w, ld(in_u8, 4));
+            check(arm32 ? "vld4.16" : "ld4", 16 * w, ld(in_i16, 4));
+            check(arm32 ? "vld4.16" : "ld4", 16 * w, ld(in_u16, 4));
+            check(arm32 ? "vld4.32" : "ld4", 8 * w, ld(in_i32, 4));
+            check(arm32 ? "vld4.32" : "ld4", 8 * w, ld(in_u32, 4));
+            check(arm32 ? "vld4.32" : "ld4", 8 * w, ld(in_f32, 4));
         } else if (!arm32) {
             // On Apple Silicon we expect dense loads followed by shuffles.
             check("uzp1.16b", 32 * w, in_i8(x * 2));
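To see why the new `ld` helper touches both ends of each loaded structure, here is a minimal standalone sketch, not part of this patch, that reproduces the same access pattern with Halide's public `Func`/`Var`/`ImageParam` API. The function name, output filename, vector width, and the `arm-64-linux` target string are illustrative assumptions; whether an `ld2` (or `vld2.8` on 32-bit ARM) actually appears depends on the LLVM version, just as the test above guards against.

```cpp
// Sketch only: mirrors ld(in_u8, 2) from the test by referencing both
// endpoints of each two-element structure, which makes an interleaving
// ld2/vld2 load both legal and clearly profitable for the vectorizer.
#include "Halide.h"

using namespace Halide;

int main() {
    ImageParam in(UInt(8), 1, "in");
    Var x("x");
    Func f("f");

    // max(in(2x), in(2x + 1)) uses both lanes of every loaded pair,
    // unlike the old in(x * 2) expression, which only needed lane 0
    // and so could be served by a dense load plus a shuffle.
    f(x) = max(in(x * 2), in(x * 2 + 1));
    f.vectorize(x, 16);

    // Emit assembly for a non-Apple AArch64 target and inspect it for ld2.
    // (Target string and filename are assumptions for this sketch.)
    f.compile_to_assembly("ld2_example.s", {in}, "ld2_example",
                          Target("arm-64-linux"));
    return 0;
}
```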