@@ -14,24 +14,22 @@ define void @test(ptr nocapture noundef readonly %kernel, i32 noundef %kw, float
1414; CHECK-NEXT: // %bb.2: // %for.body.us.preheader
1515; CHECK-NEXT: ptrue p0.h
1616; CHECK-NEXT: add x11, x2, x11, lsl #1
17- ; CHECK-NEXT: mov x12, #-16 // =0xfffffffffffffff0
18- ; CHECK-NEXT: ptrue p1.b
1917; CHECK-NEXT: mov w8, wzr
18+ ; CHECK-NEXT: ptrue p1.b
2019; CHECK-NEXT: mov x9, xzr
2120; CHECK-NEXT: mov w10, wzr
22- ; CHECK-NEXT: addvl x12, x12, #1
23- ; CHECK-NEXT: mov x13, #4 // =0x4
24- ; CHECK-NEXT: mov x14, #8 // =0x8
21+ ; CHECK-NEXT: mov x12, #4 // =0x4
22+ ; CHECK-NEXT: mov x13, #8 // =0x8
2523; CHECK-NEXT: .LBB0_3: // %for.body.us
2624; CHECK-NEXT: // =>This Loop Header: Depth=1
2725; CHECK-NEXT: // Child Loop BB0_4 Depth 2
28- ; CHECK-NEXT: add x15 , x0, x9, lsl #2
29- ; CHECK-NEXT: sbfiz x16 , x8, #1, #32
30- ; CHECK-NEXT: mov x17 , x2
31- ; CHECK-NEXT: ldp s0, s1, [x15 ]
32- ; CHECK-NEXT: add x16, x16 , #8
33- ; CHECK-NEXT: ldp s2, s3, [x15 , #8]
34- ; CHECK-NEXT: ubfiz x15 , x8, #1, #32
26+ ; CHECK-NEXT: add x14 , x0, x9, lsl #2
27+ ; CHECK-NEXT: sbfiz x15 , x8, #1, #32
28+ ; CHECK-NEXT: mov x16 , x2
29+ ; CHECK-NEXT: ldp s0, s1, [x14 ]
30+ ; CHECK-NEXT: add x15, x15 , #8
31+ ; CHECK-NEXT: ldp s2, s3, [x14 , #8]
32+ ; CHECK-NEXT: ubfiz x14 , x8, #1, #32
3533; CHECK-NEXT: fcvt h0, s0
3634; CHECK-NEXT: fcvt h1, s1
3735; CHECK-NEXT: fcvt h2, s2
@@ -43,56 +41,52 @@ define void @test(ptr nocapture noundef readonly %kernel, i32 noundef %kw, float
4341; CHECK-NEXT: .LBB0_4: // %for.cond.i.preheader.us
4442; CHECK-NEXT: // Parent Loop BB0_3 Depth=1
4543; CHECK-NEXT: // => This Inner Loop Header: Depth=2
46- ; CHECK-NEXT: ld1b { z4.b }, p1/z, [x17, x15]
47- ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x17]
48- ; CHECK-NEXT: add x18, x17, x16
49- ; CHECK-NEXT: add x3, x17, x15
44+ ; CHECK-NEXT: ld1b { z4.b }, p1/z, [x16, x14]
45+ ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x16]
46+ ; CHECK-NEXT: add x17, x16, x15
47+ ; CHECK-NEXT: add x18, x16, x14
48+ ; CHECK-NEXT: add x3, x17, #8
49+ ; CHECK-NEXT: add x4, x17, #16
5050; CHECK-NEXT: fmad z4.h, p0/m, z0.h, z5.h
51- ; CHECK-NEXT: ld1b { z5.b }, p1/z, [x17, x16 ]
51+ ; CHECK-NEXT: ld1b { z5.b }, p1/z, [x16, x15 ]
5252; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z1.h
53- ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x18, x13 , lsl #1]
53+ ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x17, x12 , lsl #1]
5454; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z2.h
55- ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x18, x14, lsl #1]
56- ; CHECK-NEXT: add x18, x18, #16
55+ ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x17, x13, lsl #1]
5756; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z3.h
58- ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x17 , #1, mul vl]
59- ; CHECK-NEXT: st1h { z4.h }, p0, [x17 ]
60- ; CHECK-NEXT: ld1h { z4.h }, p0/z, [x3 , #1, mul vl]
57+ ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x16 , #1, mul vl]
58+ ; CHECK-NEXT: st1h { z4.h }, p0, [x16 ]
59+ ; CHECK-NEXT: ld1h { z4.h }, p0/z, [x18 , #1, mul vl]
6160; CHECK-NEXT: fmad z4.h, p0/m, z0.h, z5.h
62- ; CHECK-NEXT: ld1b { z5.b }, p1/z, [x18, x12]
63- ; CHECK-NEXT: add x18, x18, x12
61+ ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x17, #1, mul vl]
6462; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z1.h
65- ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x18, x13, lsl #1 ]
63+ ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x3, #1, mul vl ]
6664; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z2.h
67- ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x18, x14, lsl #1]
68- ; CHECK-NEXT: add x18, x18, #16
65+ ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x4, #1, mul vl]
6966; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z3.h
70- ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x17 , #2, mul vl]
71- ; CHECK-NEXT: st1h { z4.h }, p0, [x17 , #1, mul vl]
72- ; CHECK-NEXT: ld1h { z4.h }, p0/z, [x3 , #2, mul vl]
67+ ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x16 , #2, mul vl]
68+ ; CHECK-NEXT: st1h { z4.h }, p0, [x16 , #1, mul vl]
69+ ; CHECK-NEXT: ld1h { z4.h }, p0/z, [x18 , #2, mul vl]
7370; CHECK-NEXT: fmad z4.h, p0/m, z0.h, z5.h
74- ; CHECK-NEXT: ld1b { z5.b }, p1/z, [x18, x12]
75- ; CHECK-NEXT: add x18, x18, x12
71+ ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x17, #2, mul vl]
7672; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z1.h
77- ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x18, x13, lsl #1 ]
73+ ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x3, #2, mul vl ]
7874; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z2.h
79- ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x18, x14, lsl #1]
80- ; CHECK-NEXT: add x18, x18, #16
75+ ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x4, #2, mul vl]
8176; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z3.h
82- ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x17 , #3, mul vl]
83- ; CHECK-NEXT: st1h { z4.h }, p0, [x17 , #2, mul vl]
84- ; CHECK-NEXT: ld1h { z4.h }, p0/z, [x3 , #3, mul vl]
77+ ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x16 , #3, mul vl]
78+ ; CHECK-NEXT: st1h { z4.h }, p0, [x16 , #2, mul vl]
79+ ; CHECK-NEXT: ld1h { z4.h }, p0/z, [x18 , #3, mul vl]
8580; CHECK-NEXT: fmad z4.h, p0/m, z0.h, z5.h
86- ; CHECK-NEXT: ld1b { z5.b }, p1/z, [x18, x12]
87- ; CHECK-NEXT: add x18, x18, x12
81+ ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x17, #3, mul vl]
8882; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z1.h
89- ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x18, x13, lsl #1 ]
83+ ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x3, #3, mul vl ]
9084; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z2.h
91- ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x18, x14, lsl #1 ]
85+ ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x4, #3, mul vl ]
9286; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z3.h
93- ; CHECK-NEXT: st1h { z4.h }, p0, [x17 , #3, mul vl]
94- ; CHECK-NEXT: addvl x17, x17 , #4
95- ; CHECK-NEXT: cmp x17 , x11
87+ ; CHECK-NEXT: st1h { z4.h }, p0, [x16 , #3, mul vl]
88+ ; CHECK-NEXT: addvl x16, x16 , #4
89+ ; CHECK-NEXT: cmp x16 , x11
9690; CHECK-NEXT: b.lo .LBB0_4
9791; CHECK-NEXT: // %bb.5: // %while.cond.i..exit_crit_edge.us
9892; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1
0 commit comments