Skip to content

Commit 2d1f036

Browse files
authored
Make the vld2 pattern more obviously profitable (#8765)
1 parent 206cbb2 commit 2d1f036

File tree

1 file changed

+29
-24
lines changed

1 file changed

+29
-24
lines changed

test/correctness/simd_op_check_arm.cpp

Lines changed: 29 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -301,35 +301,40 @@ class SimdOpCheckARM : public SimdOpCheckTest {
301301
if (target.os != Target::IOS && target.os != Target::OSX) {
302302
// VLD* are not profitable on Apple silicon
303303

304+
// Even on non-Apple silicon, LLVM occasionally decides it's
305+
// more profitable to use shuffles, so make sure we use both end
306+
// points in the loaded vector so that a vld{2,3,4} is safe and
307+
// useful.
308+
auto ld = [&](auto buf, int stride) {  // buf: callable input indexed by an expression; stride: group width (2, 3 or 4 at the call sites below)
309+
return max(buf(x * stride), buf(x * stride + stride - 1));  // reads the first and last element of each stride-wide group, so both ends of the loaded span are used
310+
};
311+
304312
// VLD2 X - Load Two-Element Structures
305-
// These need to be vectorized at least 2 native vectors wide,
306-
// so we get a full vector's worth that we know is safe to
307-
// access.
308-
check(arm32 ? "vld2.8" : "ld2", 32 * w, in_i8(x * 2));
309-
check(arm32 ? "vld2.8" : "ld2", 32 * w, in_u8(x * 2));
310-
check(arm32 ? "vld2.16" : "ld2", 16 * w, in_i16(x * 2));
311-
check(arm32 ? "vld2.16" : "ld2", 16 * w, in_u16(x * 2));
312-
check(arm32 ? "vld2.32" : "ld2", 8 * w, in_i32(x * 2));
313-
check(arm32 ? "vld2.32" : "ld2", 8 * w, in_u32(x * 2));
314-
check(arm32 ? "vld2.32" : "ld2", 8 * w, in_f32(x * 2));
313+
check(arm32 ? "vld2.8" : "ld2", 32 * w, ld(in_i8, 2));
314+
check(arm32 ? "vld2.8" : "ld2", 32 * w, ld(in_u8, 2));
315+
check(arm32 ? "vld2.16" : "ld2", 16 * w, ld(in_i16, 2));
316+
check(arm32 ? "vld2.16" : "ld2", 16 * w, ld(in_u16, 2));
317+
check(arm32 ? "vld2.32" : "ld2", 8 * w, ld(in_i32, 2));
318+
check(arm32 ? "vld2.32" : "ld2", 8 * w, ld(in_u32, 2));
319+
check(arm32 ? "vld2.32" : "ld2", 8 * w, ld(in_f32, 2));
315320

316321
// VLD3 X - Load Three-Element Structures
317-
check(arm32 ? "vld3.8" : "ld3", 32 * w, in_i8(x * 3));
318-
check(arm32 ? "vld3.8" : "ld3", 32 * w, in_u8(x * 3));
319-
check(arm32 ? "vld3.16" : "ld3", 16 * w, in_i16(x * 3));
320-
check(arm32 ? "vld3.16" : "ld3", 16 * w, in_u16(x * 3));
321-
check(arm32 ? "vld3.32" : "ld3", 8 * w, in_i32(x * 3));
322-
check(arm32 ? "vld3.32" : "ld3", 8 * w, in_u32(x * 3));
323-
check(arm32 ? "vld3.32" : "ld3", 8 * w, in_f32(x * 3));
322+
check(arm32 ? "vld3.8" : "ld3", 32 * w, ld(in_i8, 3));
323+
check(arm32 ? "vld3.8" : "ld3", 32 * w, ld(in_u8, 3));
324+
check(arm32 ? "vld3.16" : "ld3", 16 * w, ld(in_i16, 3));
325+
check(arm32 ? "vld3.16" : "ld3", 16 * w, ld(in_u16, 3));
326+
check(arm32 ? "vld3.32" : "ld3", 8 * w, ld(in_i32, 3));
327+
check(arm32 ? "vld3.32" : "ld3", 8 * w, ld(in_u32, 3));
328+
check(arm32 ? "vld3.32" : "ld3", 8 * w, ld(in_f32, 3));
324329

325330
// VLD4 X - Load Four-Element Structures
326-
check(arm32 ? "vld4.8" : "ld4", 32 * w, in_i8(x * 4));
327-
check(arm32 ? "vld4.8" : "ld4", 32 * w, in_u8(x * 4));
328-
check(arm32 ? "vld4.16" : "ld4", 16 * w, in_i16(x * 4));
329-
check(arm32 ? "vld4.16" : "ld4", 16 * w, in_u16(x * 4));
330-
check(arm32 ? "vld4.32" : "ld4", 8 * w, in_i32(x * 4));
331-
check(arm32 ? "vld4.32" : "ld4", 8 * w, in_u32(x * 4));
332-
check(arm32 ? "vld4.32" : "ld4", 8 * w, in_f32(x * 4));
331+
check(arm32 ? "vld4.8" : "ld4", 32 * w, ld(in_i8, 4));
332+
check(arm32 ? "vld4.8" : "ld4", 32 * w, ld(in_u8, 4));
333+
check(arm32 ? "vld4.16" : "ld4", 16 * w, ld(in_i16, 4));
334+
check(arm32 ? "vld4.16" : "ld4", 16 * w, ld(in_u16, 4));
335+
check(arm32 ? "vld4.32" : "ld4", 8 * w, ld(in_i32, 4));
336+
check(arm32 ? "vld4.32" : "ld4", 8 * w, ld(in_u32, 4));
337+
check(arm32 ? "vld4.32" : "ld4", 8 * w, ld(in_f32, 4));
333338
} else if (!arm32) {
334339
// On Apple Silicon we expect dense loads followed by shuffles.
335340
check("uzp1.16b", 32 * w, in_i8(x * 2));

0 commit comments

Comments
 (0)