|
| 1 | +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. |
| 2 | +# See https://llvm.org/LICENSE.txt for license information. |
| 3 | +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 4 | +# |
| 5 | +# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates |
| 6 | + |
| 7 | +# RUN: llc --mtriple=aie2p -O2 --start-before=postmisched %s \ |
| 8 | +# RUN: --aie-addrspace-none-is-safe=1 \ |
| 9 | +# RUN: -pass-remarks-output=- -pass-remarks-filter=postpipeliner -o /dev/null | FileCheck %s |
| 10 | + |
| 11 | + |
| 12 | +--- | |
| 13 | + define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr { ; IR counterpart of the MIR function named 'gemm' later in this file; the FileCheck directives below expect a '!Passed' postpipeliner remark with NS=5 and II=7. |
| 14 | + ; CHECK:--- !Passed |
| 15 | + ; CHECK-NEXT:Pass: postpipeliner |
| 16 | + ; CHECK-NEXT:Name: schedule |
| 17 | + ; CHECK-NEXT:Function: gemm |
| 18 | + ; CHECK-NEXT:Args: |
| 19 | + ; CHECK-NEXT: - String: 'Schedule found: NS=' |
| 20 | + ; CHECK-NEXT: - NS: '5' |
| 21 | + ; CHECK-NEXT: - String: ' II=' |
| 22 | + ; CHECK-NEXT: - II: '7' |
| 23 | + ; CHECK-NEXT:... |
| 24 | + entry: |
| 25 | + %cmp5 = icmp sgt i32 %n, 0 |
| 26 | + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup |
| 27 | + |
| 28 | + for.body.preheader: ; preds = %entry |
| 29 | + call void @llvm.set.loop.iterations.i32(i32 %n) |
| 30 | + br label %for.body |
| 31 | + |
| 32 | + for.body: ; preds = %for.body.preheader, %for.body |
| 33 | + %d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] |
| 34 | + %s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] |
| 35 | + %0 = load i32, ptr addrspace(6) %s.addr.06, align 4 |
| 36 | + %add = add nsw i32 %0, 1 |
| 37 | + store i32 %add, ptr addrspace(5) %d.addr.07, align 4 |
| 38 | + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 |
| 39 | + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 |
| 40 | + %1 = call i1 @llvm.loop.decrement.i32(i32 1) |
| 41 | + br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !0 ; back edge; !0 references !2, which is "llvm.loop.itercount.range" with i64 10. |
| 42 | + |
| 43 | + for.cond.cleanup: ; preds = %for.body, %entry |
| 44 | + ret void |
| 45 | + |
| 46 | + } |
| 47 | + |
| 48 | + |
| 49 | + define dso_local void @gemm_lowitercount(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr { ; IR counterpart of the MIR function named 'gemm_lowitercount' later in this file; the FileCheck directives below expect a '!Missed' postpipeliner remark ("No schedule found."). |
| 50 | + ; CHECK:--- !Missed |
| 51 | + ; CHECK-NEXT:Pass: postpipeliner |
| 52 | + ; CHECK-NEXT:Name: schedule |
| 53 | + ; CHECK-NEXT:Function: gemm_lowitercount |
| 54 | + ; CHECK-NEXT:Args: |
| 55 | + ; CHECK-NEXT: - String: No schedule found. |
| 56 | + ; CHECK-NEXT:... |
| 57 | + entry: |
| 58 | + %cmp5 = icmp sgt i32 %n, 0 |
| 59 | + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup |
| 60 | + |
| 61 | + for.body.preheader: ; preds = %entry |
| 62 | + call void @llvm.set.loop.iterations.i32(i32 %n) |
| 63 | + br label %for.body |
| 64 | + |
| 65 | + for.body: ; preds = %for.body.preheader, %for.body |
| 66 | + %d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] |
| 67 | + %s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] |
| 68 | + %0 = load i32, ptr addrspace(6) %s.addr.06, align 4 |
| 69 | + %add = add nsw i32 %0, 1 |
| 70 | + store i32 %add, ptr addrspace(5) %d.addr.07, align 4 |
| 71 | + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 |
| 72 | + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 |
| 73 | + %1 = call i1 @llvm.loop.decrement.i32(i32 1) |
| 74 | + br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !3 ; back edge; !3 references !4, which is "llvm.loop.itercount.range" with i64 2. NOTE(review): presumably this small value is what makes the pipeliner give up - confirm. |
| 75 | + |
| 76 | + for.cond.cleanup: ; preds = %for.body, %entry |
| 77 | + ret void |
| 78 | + |
| 79 | + } |
| 80 | + |
| 81 | + declare void @llvm.set.loop.iterations.i32(i32) |
| 82 | + declare i1 @llvm.loop.decrement.i32(i32) |
| 83 | + |
| 84 | + !0 = distinct !{!0, !1, !2} ; loop metadata attached to the back-edge branch of @gemm |
| 85 | + !1 = !{!"llvm.loop.mustprogress"} |
| 86 | + !2 = !{!"llvm.loop.itercount.range", i64 10} ; NOTE(review): presumably a lower bound on the trip count - confirm |
| 87 | + !3 = distinct !{!3, !1, !4} ; loop metadata attached to the back-edge branch of @gemm_lowitercount |
| 88 | + !4 = !{!"llvm.loop.itercount.range", i64 2} ; same key as !2, with value 2 instead of 10 |
| 89 | + |
| 90 | +... |
| 91 | +--- |
| 92 | +name: gemm # Loop body for which the RUN line's FileCheck directives expect a '!Passed' postpipeliner remark. |
| 93 | +alignment: 16 |
| 94 | +tracksRegLiveness: true |
| 95 | +body: | |
| 96 | + bb.0.entry (align 16): |
| 97 | + successors: %bb.2 |
| 98 | + liveins: $p0, $p1, $r0 |
| 99 | +
|
| 100 | + $lc = ADD_NC_mv_add_ri $r0, 0 |
| 101 | + $ls = MOVXM %bb.2 |
| 102 | + $le = MOVXM <mcsymbol .L_LEnd0> ; must name the same symbol as the PseudoLoopEnd in bb.2 |
| 103 | +
|
| 104 | + bb.2.for.body (align 16): |
| 105 | + successors: %bb.2, %bb.3 |
| 106 | + liveins: $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dj7, $dm0:0x000000001800000C, $dm1:0x000000001800000C, $dm2:0x000000001800000C, $dm3:0x000000001800000C, $dn0, $dn1, $dn2, $dn4, $dn5, $m0, $m1, $m2, $m3, $m4, $m5, $p0, $p1, $p2, $p4, $p6, $p7, $r0, $r1, $r2, $r3, $r4, $r8, $y5:0x0000000000000033, $d0_3d:0x0003C00000200E00, $d1_3d:0x0003C00000200E00, $dn3 |
| 107 | +
|
| 108 | + $p5 = MOVS $p6 |
| 109 | + $x4 = VLDB_dmx_ldb_x_idx_imm $p6, 64 :: (load (<16 x s32>)) |
| 110 | + $x9, $p6, $dc0, $dc4 = VLDB_3D_dmx_ldb_x $p6, $d0_3d :: (load (<16 x s32>)) |
| 111 | + $p3 = MOVS $p7 |
| 112 | + $cmh4 = VLDA_CONV_fp32_bf16_dmx_lda_ups_bf_idx_imm $p7, 64 :: (load (<32 x s16>)) |
| 113 | + $cml4, $p7, $dc1, $dc5 = VLDA_3D_CONV_fp32_bf16_dmx_lda_ups_bf $p7, $d1_3d :: (load (<32 x s16>)) |
| 114 | + $x8 = VSHUFFLE_vec_shuffle_x $x9, $x4, $r0 |
| 115 | + $x9 = VSHUFFLE_vec_shuffle_x $x9, $x4, $r1 |
| 116 | + $p5 = PADDB_pstm_nrm $p5, $m4 |
| 117 | + $x6 = VLDB_dmx_ldb_x_idx_imm $p5, 0 :: (load (<16 x s32>)) |
| 118 | + $x1 = VLDB_dmx_ldb_x_idx_imm $p5, 64 :: (load (<16 x s32>)) |
| 119 | + $ex2 = VCONV_bfp16ebs8_fp32 $dm4, implicit-def $srf2bflags, implicit $crf2bmask, implicit $crrnd |
| 120 | + $dm4 = VMUL_f_vmul_bf_vmul_bf_core_Y_Y $y4, $y5, $r2, implicit-def $srfpflags, implicit $crfpmask |
| 121 | + $x0 = VSHUFFLE_vec_shuffle_x $x1, $x6, $r0 |
| 122 | + $x1 = VSHUFFLE_vec_shuffle_x $x1, $x6, $r1 |
| 123 | + $ex3 = VCONV_bfp16ebs8_fp32 $dm4, implicit-def $srf2bflags, implicit $crf2bmask, implicit $crrnd |
| 124 | + $dm4 = VMUL_f_vmul_bf_vmul_bf_core_Y_Y $y0, $y5, $r2, implicit-def $srfpflags, implicit $crfpmask |
| 125 | + $p3 = PADDA_pstm_nrm $p3, $m5 |
| 126 | + $ex5 = VCONV_bfp16ebs8_fp32 $dm4, implicit-def $srf2bflags, implicit $crf2bmask, implicit $crrnd |
| 127 | + $cml4 = VLDA_CONV_fp32_bf16_dmx_lda_ups_bf_idx_imm $p3, 0 :: (load (<32 x s16>)) |
| 128 | + $cmh4 = VLDA_CONV_fp32_bf16_dmx_lda_ups_bf_idx_imm $p3, 64 :: (load (<32 x s16>)) |
| 129 | + $ex7 = VCONV_bfp16ebs8_fp32 $dm4, implicit-def $srf2bflags, implicit $crf2bmask, implicit $crrnd |
| 130 | + $dm3 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX $dm3, $ex2, $ex3, $r3, implicit-def $srfpflags, implicit $crfpmask |
| 131 | + $dm2 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX $dm2, $ex2, $ex5, $r3, implicit-def $srfpflags, implicit $crfpmask |
| 132 | + $dm1 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX $dm1, $ex7, $ex3, $r3, implicit-def $srfpflags, implicit $crfpmask |
| 133 | + $dm0 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX $dm0, $ex7, $ex5, $r3, implicit-def $srfpflags, implicit $crfpmask |
| 134 | +
|
| 135 | + PseudoLoopEnd <mcsymbol .L_LEnd0>, %bb.2 ; same symbol that bb.0 loads into $le; the block operand is this block itself |
| 136 | +
|
| 137 | + bb.3 (align 16): |
| 138 | + RET implicit $lr |
| 139 | + DelayedSchedBarrier |
| 140 | +
|
| 141 | +... |
| 142 | +--- |
| 143 | +name: gemm_lowitercount # Same instruction sequence as 'gemm'; the RUN line's FileCheck directives expect a '!Missed' postpipeliner remark for this function. |
| 144 | +alignment: 16 |
| 145 | +tracksRegLiveness: true |
| 146 | +body: | |
| 147 | + bb.0.entry (align 16): |
| 148 | + successors: %bb.2 |
| 149 | + liveins: $p0, $p1, $r0 |
| 150 | +
|
| 151 | + $lc = ADD_NC_mv_add_ri $r0, 0 |
| 152 | + $ls = MOVXM %bb.2 |
| 153 | + $le = MOVXM <mcsymbol .L_LEnd1> ; same symbol as the PseudoLoopEnd operand in bb.2 |
| 154 | +
|
| 155 | + bb.2.for.body (align 16): |
| 156 | + successors: %bb.2, %bb.3 |
| 157 | + liveins: $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dj7, $dm0:0x000000001800000C, $dm1:0x000000001800000C, $dm2:0x000000001800000C, $dm3:0x000000001800000C, $dn0, $dn1, $dn2, $dn4, $dn5, $m0, $m1, $m2, $m3, $m4, $m5, $p0, $p1, $p2, $p4, $p6, $p7, $r0, $r1, $r2, $r3, $r4, $r8, $y5:0x0000000000000033, $d0_3d:0x0003C00000200E00, $d1_3d:0x0003C00000200E00, $dn3 |
| 158 | +
|
| 159 | + $p5 = MOVS $p6 |
| 160 | + $x4 = VLDB_dmx_ldb_x_idx_imm $p6, 64 :: (load (<16 x s32>)) |
| 161 | + $x9, $p6, $dc0, $dc4 = VLDB_3D_dmx_ldb_x $p6, $d0_3d :: (load (<16 x s32>)) |
| 162 | + $p3 = MOVS $p7 |
| 163 | + $cmh4 = VLDA_CONV_fp32_bf16_dmx_lda_ups_bf_idx_imm $p7, 64 :: (load (<32 x s16>)) |
| 164 | + $cml4, $p7, $dc1, $dc5 = VLDA_3D_CONV_fp32_bf16_dmx_lda_ups_bf $p7, $d1_3d :: (load (<32 x s16>)) |
| 165 | + $x8 = VSHUFFLE_vec_shuffle_x $x9, $x4, $r0 |
| 166 | + $x9 = VSHUFFLE_vec_shuffle_x $x9, $x4, $r1 |
| 167 | + $p5 = PADDB_pstm_nrm $p5, $m4 |
| 168 | + $x6 = VLDB_dmx_ldb_x_idx_imm $p5, 0 :: (load (<16 x s32>)) |
| 169 | + $x1 = VLDB_dmx_ldb_x_idx_imm $p5, 64 :: (load (<16 x s32>)) |
| 170 | + $ex2 = VCONV_bfp16ebs8_fp32 $dm4, implicit-def $srf2bflags, implicit $crf2bmask, implicit $crrnd |
| 171 | + $dm4 = VMUL_f_vmul_bf_vmul_bf_core_Y_Y $y4, $y5, $r2, implicit-def $srfpflags, implicit $crfpmask |
| 172 | + $x0 = VSHUFFLE_vec_shuffle_x $x1, $x6, $r0 |
| 173 | + $x1 = VSHUFFLE_vec_shuffle_x $x1, $x6, $r1 |
| 174 | + $ex3 = VCONV_bfp16ebs8_fp32 $dm4, implicit-def $srf2bflags, implicit $crf2bmask, implicit $crrnd |
| 175 | + $dm4 = VMUL_f_vmul_bf_vmul_bf_core_Y_Y $y0, $y5, $r2, implicit-def $srfpflags, implicit $crfpmask |
| 176 | + $p3 = PADDA_pstm_nrm $p3, $m5 |
| 177 | + $ex5 = VCONV_bfp16ebs8_fp32 $dm4, implicit-def $srf2bflags, implicit $crf2bmask, implicit $crrnd |
| 178 | + $cml4 = VLDA_CONV_fp32_bf16_dmx_lda_ups_bf_idx_imm $p3, 0 :: (load (<32 x s16>)) |
| 179 | + $cmh4 = VLDA_CONV_fp32_bf16_dmx_lda_ups_bf_idx_imm $p3, 64 :: (load (<32 x s16>)) |
| 180 | + $ex7 = VCONV_bfp16ebs8_fp32 $dm4, implicit-def $srf2bflags, implicit $crf2bmask, implicit $crrnd |
| 181 | + $dm3 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX $dm3, $ex2, $ex3, $r3, implicit-def $srfpflags, implicit $crfpmask |
| 182 | + $dm2 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX $dm2, $ex2, $ex5, $r3, implicit-def $srfpflags, implicit $crfpmask |
| 183 | + $dm1 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX $dm1, $ex7, $ex3, $r3, implicit-def $srfpflags, implicit $crfpmask |
| 184 | + $dm0 = VMAC_f_vmac_bfp_vmul_bfp_core_EX_EX $dm0, $ex7, $ex5, $r3, implicit-def $srfpflags, implicit $crfpmask |
| 185 | +
|
| 186 | + PseudoLoopEnd <mcsymbol .L_LEnd1>, %bb.2 ; the block operand is this block itself, which is also listed as its own successor |
| 187 | +
|
| 188 | + bb.3 (align 16): |
| 189 | + RET implicit $lr |
| 190 | + DelayedSchedBarrier |
| 191 | +
|
| 192 | +... |
0 commit comments