|
4 | 4 | ; See https://llvm.org/LICENSE.txt for license information. |
5 | 5 | ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | 6 | ; |
7 | | -; (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates |
| 7 | +; (c) Copyright 2024-2025 Advanced Micro Devices, Inc. or its affiliates |
8 | 8 | ; RUN: llc -O2 -mtriple=aie2 %s -o - | FileCheck %s --check-prefix=ASM |
9 | 9 |
|
10 | 10 | ; This is a reduced version of the Add2D_0 MLLib benchmark which only contains |
@@ -35,60 +35,50 @@ define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm |
35 | 35 | ; ASM-LABEL: add2d: |
36 | 36 | ; ASM: .p2align 4 |
37 | 37 | ; ASM-NEXT: // %bb.0: // %newFuncRoot |
38 | | -; ASM-NEXT: paddb [sp], #32; nopx |
39 | | -; ASM-NEXT: st p7, [sp, #-32] // 4-byte Folded Spill |
40 | | -; ASM-NEXT: paddb [p0], #40; st p6, [sp, #-28] // 4-byte Folded Spill |
| 38 | +; ASM-NEXT: paddb [p0], #40; nopx |
41 | 39 | ; ASM-NEXT: lda m2, [p0], #-4 |
42 | 40 | ; ASM-NEXT: lda m3, [p0], #8 |
43 | 41 | ; ASM-NEXT: lda m5, [p0], #8 |
44 | 42 | ; ASM-NEXT: lda m4, [p0], #-24 |
45 | | -; ASM-NEXT: lda r4, [p0], #36 |
46 | | -; ASM-NEXT: lda r2, [p0], #-32 |
47 | | -; ASM-NEXT: lda r0, [p0], #-12; mov p6, sp |
48 | | -; ASM-NEXT: lda r1, [p0], #40; paddb [p6], #-36 |
49 | | -; ASM-NEXT: lda p6, [p6, #0]; mov p7, sp |
50 | | -; ASM-NEXT: paddb [p7], #-40 |
51 | | -; ASM-NEXT: lda r5, [p7, #0] |
52 | | -; ASM-NEXT: lda m1, [p0], #36 |
| 43 | +; ASM-NEXT: lda r4, [p0], #36; paddb [sp], #32 |
| 44 | +; ASM-NEXT: lda r2, [p0], #-32; st p7, [sp, #-32] // 4-byte Folded Spill |
| 45 | +; ASM-NEXT: lda r0, [p0], #-12; st p6, [sp, #-28] // 4-byte Folded Spill |
| 46 | +; ASM-NEXT: lda r1, [p0], #40; mov p6, sp |
| 47 | +; ASM-NEXT: paddb [p6], #-36; mov p7, sp |
| 48 | +; ASM-NEXT: lda r5, [p6, #0]; paddb [p7], #-40 |
| 49 | +; ASM-NEXT: lda p7, [p7, #0]; mov p6, sp |
| 50 | +; ASM-NEXT: lda m1, [p0], #36; paddb [p6], #-44 |
| 51 | +; ASM-NEXT: lda p6, [p6, #0] |
53 | 52 | ; ASM-NEXT: lda m0, [p0], #-8 |
54 | | -; ASM-NEXT: lda dn0, [p0], #-8 |
55 | | -; ASM-NEXT: st r1, [p4, #0] |
| 53 | +; ASM-NEXT: lda dn0, [p0], #-8; st r1, [p4, #0] |
56 | 54 | ; ASM-NEXT: lda dj0, [p0], #12; nez r3, r0; mov p4, sp |
57 | | -; ASM-NEXT: st r3, [p5, #0] |
58 | | -; ASM-NEXT: lda dn4, [p0], #-8; paddb [p4], #-44; mov p5, sp |
59 | | -; ASM-NEXT: lda p4, [p4, #0]; paddb [p5], #-48 |
60 | | -; ASM-NEXT: lda p7, [p5, #0]; mov p5, sp |
61 | | -; ASM-NEXT: lda dj4, [p0], #-36; paddb [p5], #-52 |
62 | | -; ASM-NEXT: lda p5, [p5, #0] |
63 | | -; ASM-NEXT: st m1, [p6, #0] |
64 | | -; ASM-NEXT: mov p6, r5 |
65 | | -; ASM-NEXT: nop |
66 | | -; ASM-NEXT: st m0, [p6, #0] |
67 | | -; ASM-NEXT: st dj0, [p4, #0] |
68 | | -; ASM-NEXT: st dj4, [p7, #0] |
69 | | -; ASM-NEXT: st dn0, [p5, #0] |
| 55 | +; ASM-NEXT: lda dn4, [p0], #-8; st r3, [p5, #0] |
| 56 | +; ASM-NEXT: lda dj4, [p0], #-36; paddb [p4], #-48; mov p5, r5 |
| 57 | +; ASM-NEXT: lda p4, [p4, #0]; st m1, [p5, #0] |
70 | 58 | ; ASM-NEXT: lda r0, [p0], #-36; mov p5, sp |
71 | | -; ASM-NEXT: lda r5, [p0, #0]; paddb [p5], #-76; mov p6, sp |
72 | | -; ASM-NEXT: lda r9, [p5, #0]; paddb [p6], #-56; mov p5, sp |
73 | | -; ASM-NEXT: lda r6, [p6, #0]; paddb [p5], #-80; mov p4, sp |
74 | | -; ASM-NEXT: lda r10, [p5, #0]; paddb [p4], #-60; mov p5, sp |
75 | | -; ASM-NEXT: lda p6, [p4, #0]; paddb [p5], #-84 |
76 | | -; ASM-NEXT: lda r11, [p5, #0]; mov p0, sp |
77 | | -; ASM-NEXT: paddb [p0], #-72; mov p4, sp |
78 | | -; ASM-NEXT: lda p0, [p0, #0]; paddb [p4], #-64; mov p5, sp |
79 | | -; ASM-NEXT: lda p7, [p4, #0]; paddb [p5], #-88; mov p4, sp |
80 | | -; ASM-NEXT: lda r12, [p5, #0]; paddb [p4], #-68; mov p5, sp |
81 | | -; ASM-NEXT: lda p4, [p4, #0]; paddb [p5], #-92 |
82 | | -; ASM-NEXT: lda r13, [p5, #0] |
83 | | -; ASM-NEXT: mova r6, #1; add r7, r2, #-1; mov p5, r6 |
84 | | -; ASM-NEXT: mova r6, #3; ne r4, r4, r6 |
85 | | -; ASM-NEXT: ltu r7, r7, r6 |
86 | | -; ASM-NEXT: jz r7, #.LBB0_2 |
87 | | -; ASM-NEXT: st dn4, [p5, #0]; nez r0, r0 // Delay Slot 5 |
88 | | -; ASM-NEXT: st r0, [p6, #0] // Delay Slot 4 |
89 | | -; ASM-NEXT: paddb [p2], m3; st r5, [p7, #0] // Delay Slot 3 |
90 | | -; ASM-NEXT: padda [p1], m2; paddb [p2], m5; and r8, r2, r6; st r4, [p4, #0] // Delay Slot 2 |
91 | | -; ASM-NEXT: mova r6, #0; paddb [p2], m4; st r8, [p0, #0] // Delay Slot 1 |
| 59 | +; ASM-NEXT: lda r5, [p0, #0]; paddb [p5], #-52 |
| 60 | +; ASM-NEXT: lda p5, [p5, #0]; mov p0, sp |
| 61 | +; ASM-NEXT: st m0, [p7, #0] |
| 62 | +; ASM-NEXT: mov p7, sp |
| 63 | +; ASM-NEXT: paddb [p7], #-56; st dj0, [p6, #0] |
| 64 | +; ASM-NEXT: lda r6, [p7, #0]; mov p6, sp |
| 65 | +; ASM-NEXT: paddb [p0], #-72; mov p7, sp |
| 66 | +; ASM-NEXT: lda p0, [p0, #0]; paddb [p6], #-60; st dj4, [p4, #0] |
| 67 | +; ASM-NEXT: lda r7, [p6, #0]; mov p4, sp |
| 68 | +; ASM-NEXT: paddb [p4], #-76; mov p6, sp |
| 69 | +; ASM-NEXT: lda r11, [p4, #0]; paddb [p7], #-64; mov p4, sp |
| 70 | +; ASM-NEXT: lda p7, [p7, #0]; paddb [p6], #-68; st dn0, [p5, #0] |
| 71 | +; ASM-NEXT: lda r8, [p6, #0]; paddb [p4], #-80; nez r0, r0; mov p5, r6 |
| 72 | +; ASM-NEXT: lda p6, [p4, #0]; st dn4, [p5, #0]; movx r6, #1 |
| 73 | +; ASM-NEXT: ne r4, r4, r6; mov p4, sp |
| 74 | +; ASM-NEXT: mova r6, #3; paddb [p4], #-84; add r7, r2, #-1; mov p5, r7 |
| 75 | +; ASM-NEXT: lda r9, [p4, #0]; ltu r7, r7, r6; mov p4, sp |
| 76 | +; ASM-NEXT: st r0, [p5, #0]; paddb [p4], #-88; jz r7, #.LBB0_2 |
| 77 | +; ASM-NEXT: lda r10, [p4, #0]; mov p4, sp // Delay Slot 5 |
| 78 | +; ASM-NEXT: paddb [p4], #-92; st r5, [p7, #0] // Delay Slot 4 |
| 79 | +; ASM-NEXT: lda p4, [p4, #0]; paddb [p2], m3; mov p7, r8 // Delay Slot 3 |
| 80 | +; ASM-NEXT: st r4, [p7, #0]; paddb [p2], m5; and r8, r2, r6 // Delay Slot 2 |
| 81 | +; ASM-NEXT: padda [p1], m2; paddb [p2], m4; movx r6, #0; st r8, [p0, #0] // Delay Slot 1 |
92 | 82 | ; ASM-NEXT: // %bb.1: |
93 | 83 | ; ASM-NEXT: nopb ; nopa ; nops ; j #.LBB0_5; nopv |
94 | 84 | ; ASM-NEXT: nopa ; nopx // Delay Slot 5 |
@@ -137,23 +127,19 @@ define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm |
137 | 127 | ; ASM-NEXT: nop |
138 | 128 | ; ASM-NEXT: nop |
139 | 129 | ; ASM-NEXT: vst.srs.d8.s32 cm0, s0, [p3], #32 |
140 | | -; ASM-NEXT: vst.srs.d8.s32 cm1, s0, [p3], #32 |
141 | | -; ASM-NEXT: vst.srs.d8.s32 cm2, s0, [p3], #32; mov crUPSSign, #0 |
142 | | -; ASM-NEXT: vst.srs.d8.s32 cm3, s0, [p3], #32; mov r6, dc0 |
143 | | -; ASM-NEXT: mov r0, dc4 |
| 130 | +; ASM-NEXT: vst.srs.d8.s32 cm1, s0, [p3], #32; mov crUPSSign, #0 |
| 131 | +; ASM-NEXT: vst.srs.d8.s32 cm2, s0, [p3], #32; mov r6, dc0 |
| 132 | +; ASM-NEXT: vst.srs.d8.s32 cm3, s0, [p3], #32; mov r0, dc4 |
144 | 133 | ; ASM-NEXT: mov crSRSSign, #0 |
145 | 134 | ; ASM-NEXT: .p2align 4 |
146 | 135 | ; ASM-NEXT: .LBB0_5: // %for.cond.cleanup.unr-lcssa.split |
147 | | -; ASM-NEXT: nopb ; lda p7, [sp, #-32]; nops ; nopxm ; nopv // 4-byte Folded Reload |
148 | | -; ASM-NEXT: mov p0, r13 |
149 | | -; ASM-NEXT: st r0, [p0, #0] |
150 | | -; ASM-NEXT: mov p0, r12 |
151 | | -; ASM-NEXT: st r6, [p0, #0] |
152 | | -; ASM-NEXT: lda p6, [sp, #-28]; mov p0, r11 // 4-byte Folded Reload |
153 | | -; ASM-NEXT: st p3, [p0, #0]; ret lr |
154 | | -; ASM-NEXT: mov p0, r10 // Delay Slot 5 |
155 | | -; ASM-NEXT: st p2, [p0, #0] // Delay Slot 4 |
156 | | -; ASM-NEXT: mov p0, r9 // Delay Slot 3 |
| 136 | +; ASM-NEXT: nopx ; mov p0, r10 |
| 137 | +; ASM-NEXT: lda p7, [sp, #-32]; st r0, [p4, #0] // 4-byte Folded Reload |
| 138 | +; ASM-NEXT: lda p6, [sp, #-28]; st r6, [p0, #0] // 4-byte Folded Reload |
| 139 | +; ASM-NEXT: ret lr ; mov p0, r9 |
| 140 | +; ASM-NEXT: st p3, [p0, #0] // Delay Slot 5 |
| 141 | +; ASM-NEXT: mov p0, r11 // Delay Slot 4 |
| 142 | +; ASM-NEXT: st p2, [p6, #0] // Delay Slot 3 |
157 | 143 | ; ASM-NEXT: st p1, [p0, #0] // Delay Slot 2 |
158 | 144 | ; ASM-NEXT: paddb [sp], #-32 // Delay Slot 1 |
159 | 145 | newFuncRoot: |
|
0 commit comments