|
| 1 | +# RUN: llc -mtriple=hexagon -mcpu=hexagonv73 -O2 -mattr=+hvxv73,hvx-length64b \ |
| 2 | +# RUN: -run-pass=pipeliner -debug-only=pipeliner 2>&1 \ |
| 3 | +# RUN: %s -o - | FileCheck %s |
| 4 | +# REQUIRES: asserts |
| 5 | + |
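| 6 | +# Check that the innermost loop (the single-block loop bb.4 in the MIR below) is software pipelined, and that the pipeliner reports a schedule with an initiation interval (II) of 4. |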
| 7 | + |
| 8 | +# CHECK: Schedule Found? 1 (II=4) |
| 9 | + |
| 10 | +--- | |
| 11 | + target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" |
| 12 | + target triple = "hexagon" |
| 13 | + |
| 14 | + ; Function Attrs: nounwind. Reference IR for the MIR function below: the outer loop (bb9..bb57) steps %phi by 2 while it is less than %arg2; the inner loop (bb28) performs four vector loads derived from %arg, add/sub/avg/navg/vsathub intrinsic calls, and two vector stores derived from %arg4 per iteration. |
| 15 | + define void @ham(ptr noalias nocapture readonly %arg, i32 %arg1, i32 %arg2, i32 %arg3, ptr noalias nocapture %arg4, i32 %arg5) #0 { |
| 16 | + bb: |
| 17 | + %ashr = ashr i32 %arg3, 2 |
| 18 | + %ashr6 = ashr i32 %arg3, 1 |
| 19 | + %add = add nsw i32 %ashr6, %ashr |
| 20 | + %icmp = icmp sgt i32 %arg2, 0 |
| 21 | + br i1 %icmp, label %bb7, label %bb61 |
| 22 | + |
| 23 | + bb7: ; preds = %bb -- outer-loop preheader: %sdiv = %arg1 / 64 is the bound tested by the inner loop |
| 24 | + %sdiv = sdiv i32 %arg1, 64 |
| 25 | + br label %bb9 |
| 26 | + |
| 27 | + bb9: ; preds = %bb57, %bb7 -- outer-loop header: the inner loop is entered only when %arg1 > 63, otherwise control goes straight to bb57 |
| 28 | + %phi = phi i32 [ 0, %bb7 ], [ %add58, %bb57 ] |
| 29 | + %0 = icmp sgt i32 %arg1, 63 |
| 30 | + %ashr10 = ashr exact i32 %phi, 1 |
| 31 | + %mul = mul nsw i32 %ashr10, %arg3 |
| 32 | + br i1 %0, label %bb11, label %bb57 |
| 33 | + |
| 34 | + bb11: ; preds = %bb9 -- inner-loop preheader: computes the two %arg4 start pointers (%cgep, %cgep1) and the four %arg start pointers (%cgep2..%cgep5) |
| 35 | + %add12 = add nsw i32 %phi, 1 |
| 36 | + %mul13 = mul nsw i32 %add12, %arg5 |
| 37 | + %mul14 = mul nsw i32 %phi, %arg5 |
| 38 | + %add15 = add i32 %add, %mul |
| 39 | + %add16 = add i32 %mul, %ashr |
| 40 | + %add17 = add i32 %mul, %ashr6 |
| 41 | + %cgep = getelementptr inbounds i8, ptr %arg4, i32 %mul13 |
| 42 | + %cgep1 = getelementptr inbounds i8, ptr %arg4, i32 %mul14 |
| 43 | + %cgep2 = getelementptr inbounds i16, ptr %arg, i32 %add15 |
| 44 | + %cgep3 = getelementptr inbounds i16, ptr %arg, i32 %add16 |
| 45 | + %cgep4 = getelementptr inbounds i16, ptr %arg, i32 %add17 |
| 46 | + %cgep5 = getelementptr inbounds i16, ptr %arg, i32 %mul |
| 47 | + br label %bb28 |
| 48 | + |
| 49 | + bb28: ; preds = %bb28, %bb11 -- single-block inner loop: %phi29 counts iterations up to %sdiv and all six pointers advance by one <16 x i32> per iteration; corresponds to bb.4.bb28 in the MIR |
| 50 | + %phi29 = phi i32 [ 0, %bb11 ], [ %add54, %bb28 ] |
| 51 | + %phi30 = phi ptr [ %cgep5, %bb11 ], [ %cgep6, %bb28 ] |
| 52 | + %phi31 = phi ptr [ %cgep4, %bb11 ], [ %cgep7, %bb28 ] |
| 53 | + %phi32 = phi ptr [ %cgep3, %bb11 ], [ %cgep8, %bb28 ] |
| 54 | + %phi33 = phi ptr [ %cgep2, %bb11 ], [ %cgep9, %bb28 ] |
| 55 | + %phi34 = phi ptr [ %cgep, %bb11 ], [ %cgep11, %bb28 ] |
| 56 | + %phi35 = phi ptr [ %cgep1, %bb11 ], [ %cgep10, %bb28 ] |
| 57 | + %load = load <16 x i32>, ptr %phi30, align 64 |
| 58 | + %load38 = load <16 x i32>, ptr %phi31, align 64 |
| 59 | + %load40 = load <16 x i32>, ptr %phi32, align 64 |
| 60 | + %load42 = load <16 x i32>, ptr %phi33, align 64 |
| 61 | + %call = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %load, <16 x i32> %load38) |
| 62 | + %call43 = tail call <16 x i32> @llvm.hexagon.V6.vsubh(<16 x i32> %load, <16 x i32> %load38) |
| 63 | + %call44 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %load40, <16 x i32> %load42) |
| 64 | + %call45 = tail call <16 x i32> @llvm.hexagon.V6.vsubh(<16 x i32> %load40, <16 x i32> %load42) |
| 65 | + %call46 = tail call <16 x i32> @llvm.hexagon.V6.vavgh(<16 x i32> %call, <16 x i32> %call44) |
| 66 | + %call47 = tail call <16 x i32> @llvm.hexagon.V6.vnavgh(<16 x i32> %call, <16 x i32> %call44) |
| 67 | + %call48 = tail call <16 x i32> @llvm.hexagon.V6.vavgh(<16 x i32> %call43, <16 x i32> %call45) |
| 68 | + %call49 = tail call <16 x i32> @llvm.hexagon.V6.vnavgh(<16 x i32> %call43, <16 x i32> %call45) |
| 69 | + %call50 = tail call <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32> %call47, <16 x i32> %call46) |
| 70 | + %call51 = tail call <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32> %call49, <16 x i32> %call48) |
| 71 | + store <16 x i32> %call50, ptr %phi35, align 64 |
| 72 | + store <16 x i32> %call51, ptr %phi34, align 64 |
| 73 | + %add54 = add nsw i32 %phi29, 1 |
| 74 | + %icmp55 = icmp slt i32 %add54, %sdiv |
| 75 | + %cgep6 = getelementptr inbounds <16 x i32>, ptr %phi30, i32 1 |
| 76 | + %cgep7 = getelementptr inbounds <16 x i32>, ptr %phi31, i32 1 |
| 77 | + %cgep8 = getelementptr inbounds <16 x i32>, ptr %phi32, i32 1 |
| 78 | + %cgep9 = getelementptr inbounds <16 x i32>, ptr %phi33, i32 1 |
| 79 | + %cgep10 = getelementptr inbounds <16 x i32>, ptr %phi35, i32 1 |
| 80 | + %cgep11 = getelementptr inbounds <16 x i32>, ptr %phi34, i32 1 |
| 81 | + br i1 %icmp55, label %bb28, label %bb57 |
| 82 | + |
| 83 | + bb57: ; preds = %bb28, %bb9 -- outer-loop latch: %add58 = %phi + 2, loop back to bb9 while %add58 < %arg2 |
| 84 | + %add58 = add nsw i32 %phi, 2 |
| 85 | + %icmp59 = icmp slt i32 %add58, %arg2 |
| 86 | + br i1 %icmp59, label %bb9, label %bb61 |
| 87 | + |
| 88 | + bb61: ; preds = %bb57, %bb -- common exit |
| 89 | + ret void |
| 90 | + } |
| 91 | + |
| 92 | + ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) -- attribute group #1; produces %call and %call44 in @ham |
| 93 | + declare <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32>, <16 x i32>) #1 |
| 94 | + |
| 95 | + ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) -- attribute group #1; produces %call43 and %call45 in @ham |
| 96 | + declare <16 x i32> @llvm.hexagon.V6.vsubh(<16 x i32>, <16 x i32>) #1 |
| 97 | + |
| 98 | + ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) -- attribute group #1; produces %call46 and %call48 in @ham |
| 99 | + declare <16 x i32> @llvm.hexagon.V6.vavgh(<16 x i32>, <16 x i32>) #1 |
| 100 | + |
| 101 | + ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) -- attribute group #1; produces %call47 and %call49 in @ham |
| 102 | + declare <16 x i32> @llvm.hexagon.V6.vnavgh(<16 x i32>, <16 x i32>) #1 |
| 103 | + |
| 104 | + ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) -- attribute group #1; produces the two stored values %call50 and %call51 in @ham |
| 105 | + declare <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32>, <16 x i32>) #1 |
| 106 | + |
| 107 | + attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv73" "target-features"="+hvxv73,+hvx-length64b" "unsafe-fp-math"="false" "use-soft-float"="false" } |
| 108 | + attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="hexagonv73" "target-features"="+hvxv73,+hvx-length64b" } |
| 109 | + |
| 110 | +... |
| 111 | +--- |
| 112 | +name: ham |
| 113 | +alignment: 16 |
| 114 | +tracksRegLiveness: true |
| 115 | +body: | |
| 116 | + bb.0.bb: ; entry: copies the six argument registers $r0-$r5 into vregs and jumps to the exit block bb.6 unless %29 ($r2) > 0 |
| 117 | + successors: %bb.1(0x50000000), %bb.6(0x30000000) |
| 118 | + liveins: $r0, $r1, $r2, $r3, $r4, $r5 |
| 119 | + |
| 120 | + %32:intregs = COPY $r5 |
| 121 | + %31:intregs = COPY $r4 |
| 122 | + %30:intregs = COPY $r3 |
| 123 | + %29:intregs = COPY $r2 |
| 124 | + %28:intregs = COPY $r1 |
| 125 | + %27:intregs = COPY $r0 |
| 126 | + %33:predregs = C2_cmpgti %29, 0 |
| 127 | + J2_jumpf %33, %bb.6, implicit-def dead $pc |
| 128 | + J2_jump %bb.1, implicit-def dead $pc |
| 129 | + |
| 130 | + bb.1.bb7: ; outer-loop preheader: computes loop-invariant values, then J2_loop1r sets up $lc1/$sa1 for bb.2 with count %65 = (%29 + 1) >> 1 |
| 131 | + successors: %bb.2(0x80000000) |
| 132 | + |
| 133 | + %0:intregs = S2_asr_i_r %30, 2 |
| 134 | + %1:intregs = S2_asr_i_r %30, 1 |
| 135 | + %2:intregs = nsw A2_add %1, %0 |
| 136 | + %36:intregs = S2_asr_i_r %28, 31 |
| 137 | + %37:intregs = S2_lsr_i_r_acc %28, killed %36, 26 |
| 138 | + %3:intregs = S2_asr_i_r killed %37, 6 |
| 139 | + %35:intregs = A2_tfrsi 0 |
| 140 | + %38:predregs = C2_cmpgti %28, 63 |
| 141 | + %63:intregs = A2_addi %29, 1 |
| 142 | + %64:intregs = S2_lsr_i_r %63, 1 |
| 143 | + %65:intregs = COPY %64 |
| 144 | + J2_loop1r %bb.2, %65, implicit-def $lc1, implicit-def $sa1 |
| 145 | + |
| 146 | + bb.2.bb9 (machine-block-address-taken): ; outer-loop header: %4 is the outer induction value; J2_jumpf on %38 (%28 > 63) branches to bb.5, bypassing the inner loop |
| 147 | + successors: %bb.3(0x40000000), %bb.5(0x40000000) |
| 148 | + |
| 149 | + %4:intregs = PHI %35, %bb.1, %26, %bb.5 |
| 150 | + J2_jumpf %38, %bb.5, implicit-def dead $pc |
| 151 | + J2_jump %bb.3, implicit-def dead $pc |
| 152 | + |
| 153 | + bb.3.bb11: ; inner-loop preheader: builds the six start addresses %6-%11 that feed the PHIs in bb.4, then J2_loop0r sets up $lc0/$sa0 with count %62 = %3 |
| 154 | + successors: %bb.4(0x80000000) |
| 155 | + |
| 156 | + %40:intregs = exact S2_asr_i_r %4, 1 |
| 157 | + %5:intregs = nsw M2_mpyi %40, %30 |
| 158 | + %42:intregs = nsw A2_addi %4, 1 |
| 159 | + %43:intregs = A2_add %2, %5 |
| 160 | + %44:intregs = A2_add %5, %0 |
| 161 | + %45:intregs = A2_add %5, %1 |
| 162 | + %6:intregs = M2_maci %31, killed %42, %32 |
| 163 | + %7:intregs = M2_maci %31, %4, %32 |
| 164 | + %8:intregs = S2_addasl_rrri %27, killed %43, 1 |
| 165 | + %9:intregs = S2_addasl_rrri %27, killed %44, 1 |
| 166 | + %10:intregs = S2_addasl_rrri %27, killed %45, 1 |
| 167 | + %11:intregs = S2_addasl_rrri %27, %5, 1 |
| 168 | + %62:intregs = COPY %3 |
| 169 | + J2_loop0r %bb.4, %62, implicit-def $lc0, implicit-def $sa0, implicit-def $usr |
| 170 | + |
| 171 | + bb.4.bb28 (machine-block-address-taken): ; single-block inner loop closed by ENDLOOP0: four V6_vL32b_pi loads, ten HVX ALU ops and two V6_vS32b_pi stores, each address updated by 64 and fed back through a PHI; presumably the loop the CHECK line's schedule refers to |
| 172 | + successors: %bb.4(0x7c000000), %bb.5(0x04000000) |
| 173 | + |
| 174 | + %13:intregs = PHI %11, %bb.3, %20, %bb.4 |
| 175 | + %14:intregs = PHI %10, %bb.3, %21, %bb.4 |
| 176 | + %15:intregs = PHI %9, %bb.3, %22, %bb.4 |
| 177 | + %16:intregs = PHI %8, %bb.3, %23, %bb.4 |
| 178 | + %17:intregs = PHI %6, %bb.3, %25, %bb.4 |
| 179 | + %18:intregs = PHI %7, %bb.3, %24, %bb.4 |
| 180 | + %46:hvxvr, %20:intregs = V6_vL32b_pi %13, 64 :: (load (s512) from %ir.phi30) |
| 181 | + %47:hvxvr, %21:intregs = V6_vL32b_pi %14, 64 :: (load (s512) from %ir.phi31) |
| 182 | + %48:hvxvr, %22:intregs = V6_vL32b_pi %15, 64 :: (load (s512) from %ir.phi32) |
| 183 | + %49:hvxvr, %23:intregs = V6_vL32b_pi %16, 64 :: (load (s512) from %ir.phi33) |
| 184 | + %50:hvxvr = V6_vaddh %46, %47 |
| 185 | + %51:hvxvr = V6_vsubh %46, %47 |
| 186 | + %52:hvxvr = V6_vaddh %48, %49 |
| 187 | + %53:hvxvr = V6_vsubh %48, %49 |
| 188 | + %54:hvxvr = V6_vavgh %50, %52 |
| 189 | + %55:hvxvr = V6_vnavgh %50, %52 |
| 190 | + %56:hvxvr = V6_vavgh %51, %53 |
| 191 | + %57:hvxvr = V6_vnavgh %51, %53 |
| 192 | + %58:hvxvr = V6_vsathub killed %55, killed %54 |
| 193 | + %59:hvxvr = V6_vsathub killed %57, killed %56 |
| 194 | + %24:intregs = V6_vS32b_pi %18, 64, killed %58 :: (store (s512) into %ir.phi35) |
| 195 | + %25:intregs = V6_vS32b_pi %17, 64, killed %59 :: (store (s512) into %ir.phi34) |
| 196 | + ENDLOOP0 %bb.4, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0 |
| 197 | + J2_jump %bb.5, implicit-def dead $pc |
| 198 | + |
| 199 | + bb.5.bb57: ; outer-loop latch: %26 = %4 + 2 feeds the PHI in bb.2; ENDLOOP1 branches back to bb.2, otherwise control falls to the jump to bb.6 |
| 200 | + successors: %bb.2(0x7c000000), %bb.6(0x04000000) |
| 201 | + |
| 202 | + %26:intregs = nsw A2_addi %4, 2 |
| 203 | + ENDLOOP1 %bb.2, implicit-def $pc, implicit-def $lc1, implicit $sa1, implicit $lc1 |
| 204 | + J2_jump %bb.6, implicit-def dead $pc |
| 205 | + |
| 206 | + bb.6.bb61: ; function exit: returns through $r31 |
| 207 | + PS_jmpret $r31, implicit-def dead $pc |
| 208 | +
|
| 209 | +... |
0 commit comments