Skip to content

Commit c8158ac

Browse files
KorovinVladigcbot
authored andcommitted
Cost model: analyze added crit_edges
Critical edges split by the BreakCriticalEdges pass add new blocks that change the loop structure. Extended the ocloc test to cover more cases.
1 parent 0c0eb00 commit c8158ac

File tree

3 files changed

+358
-2
lines changed

3 files changed

+358
-2
lines changed

IGC/VectorCompiler/lib/Utils/GenX/CostInfo.cpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,8 +82,16 @@ LoopCountExpr vc::restoreLCEFromMetadata(const llvm::Loop &L) {
8282
auto *TI = Latch->getTerminator();
8383
IGC_ASSERT(TI);
8484
auto *ExprNode = TI->getMetadata(VCLoopExprMetaKind);
85-
if (!ExprNode)
86-
return LoopCountExpr{};
85+
if (!ExprNode) {
86+
// The latch may be a block inserted on a split critical edge; in that
87+
// case the metadata may have remained on the exiting block's terminator.
88+
if (auto *Exit = Latch->getSinglePredecessor()) {
89+
auto *ExitTI = Exit->getTerminator();
90+
ExprNode = ExitTI->getMetadata(VCLoopExprMetaKind);
91+
}
92+
if (!ExprNode)
93+
return LoopCountExpr{};
94+
}
8795

8896
LoopCountExpr LCE;
8997
LCE.IsUndef = false;
Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
;=========================== begin_copyright_notice ============================
2+
;
3+
; Copyright (C) 2024 Intel Corporation
4+
;
5+
; SPDX-License-Identifier: MIT
6+
;
7+
;============================ end_copyright_notice =============================
8+
9+
; REQUIRES: regkeys, pvc-supported
10+
; RUN: llvm-as %s -o %t.bc
11+
; RUN: ocloc -device pvc -llvm_input -options "-vc-codegen -ze-collect-cost-info -igc_opts 'ShaderDumpEnable=1, DumpToCustomDir=%t'" -output_no_suffix -file %t.bc
12+
; RUN: cat %t/*.zeinfo | FileCheck %s
13+
14+
target datalayout = "e-p:64:64-p3:32:32-p6:32:32-i64:64-n8:16:32:64"
15+
target triple = "genx64-unknown-unknown"
16+
17+
; CHECK: kernels_cost_info:
18+
; CHECK-NEXT: - name: kernel
19+
20+
; CHECK: kcm_args_sym:
21+
; CHECK-NEXT: - argNo: 3
22+
; CHECK-NEXT: byteOffset: 0
23+
; CHECK-NEXT: sizeInBytes: 4
24+
; CHECK-NEXT: isInDirect: false
25+
; CHECK-NEXT: - argNo: 2
26+
; CHECK-NEXT: byteOffset: 0
27+
; CHECK-NEXT: sizeInBytes: 4
28+
; CHECK-NEXT: isInDirect: false
29+
; CHECK-NEXT: - argNo: 1
30+
; CHECK-NEXT: byteOffset: 120
31+
; CHECK-NEXT: sizeInBytes: 8
32+
; CHECK-NEXT: isInDirect: true
33+
34+
; CHECK: kcm_loop_count_exps:
35+
; CHECK-NEXT: - factor: 1
36+
; CHECK-NEXT: argsym_index: 0
37+
; CHECK-NEXT: C: 0
38+
; CHECK-NEXT: - factor: -0.25
39+
; CHECK-NEXT: argsym_index: 1
40+
; CHECK-NEXT: C: 3.75
41+
; CHECK-NEXT: - factor: -1
42+
; CHECK-NEXT: argsym_index: 1
43+
; CHECK-NEXT: C: -20
44+
; CHECK-NEXT: - factor: 1
45+
; CHECK-NEXT: argsym_index: 2
46+
; CHECK-NEXT: C: 0
47+
; CHECK-NEXT: - factor: 0
48+
; CHECK-NEXT: argsym_index: -1
49+
; CHECK-NEXT: C: 127
50+
; CHECK-NEXT: - factor: 0
51+
; CHECK-NEXT: argsym_index: -1
52+
; CHECK-NEXT: C: 0
53+
54+
; COM: The loop costs are estimated by finalizer. Only verify that
55+
; COM: the number of blocks equals the number of loops + 1.
56+
; CHECK: Kcm_loop_costs:
57+
; CHECK-NEXT: - cycle: {{.*}}
58+
; CHECK-NEXT: bytes_loaded: {{.*}}
59+
; CHECK-NEXT: bytes_stored: {{.*}}
60+
; CHECK-NEXT: num_loops: {{.*}}
61+
; CHECK-COUNT-6: - cycle: {{.*}}
62+
63+
; COM: IR represents the following kernel:
64+
; COM: kernel(__global int *A, __global int *B, int C, int D) {
65+
; COM: for (int i = 0; i < D; ++i)
66+
; COM: A[0] = B[0];
67+
; COM: for (int i = 15; i > C; i -= 4)
68+
; COM: A[0] = B[0];
69+
; COM: for (int i = C; i > 2 * C + 20; --i)
70+
; COM: A[0] = B[0];
71+
; COM: for (int i = 0; i < B[30]; ++i)
72+
; COM: A[0] = B[0];
73+
; COM: for (int i = 0; i < 256; i += 2)
74+
; COM: A[0] = B[0];
75+
; COM: for (int i = C; i < D; ++i)
76+
; COM: A[0] = B[0];
77+
; COM: }
78+
79+
define spir_kernel void @kernel(i32 addrspace(1)* "VCArgumentIOKind"="0" %A, i32 addrspace(1)* "VCArgumentIOKind"="0" %B, i32 "VCArgumentIOKind"="0" %C, i32 "VCArgumentIOKind"="0" %D) #0 !spirv.ParameterDecorations !5 !intel_reqd_sub_group_size !8 {
80+
entry:
81+
%cmp31 = icmp sgt i32 %D, 0
82+
br i1 %cmp31, label %for.body, label %for.cond3.preheader
83+
84+
for.cond3.preheader: ; preds = %for.body, %entry
85+
%cmp429 = icmp slt i32 %C, 15
86+
br i1 %cmp429, label %for.body5, label %for.cond11.preheader
87+
88+
for.body: ; preds = %entry, %for.body
89+
%i.032 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
90+
%0 = load i32, i32 addrspace(1)* %B, align 4
91+
store i32 %0, i32 addrspace(1)* %A, align 4
92+
%inc = add nuw nsw i32 %i.032, 1, !spirv.Decorations !14
93+
%exitcond33.not = icmp eq i32 %inc, %D
94+
br i1 %exitcond33.not, label %for.cond3.preheader, label %for.body
95+
96+
for.cond11.preheader: ; preds = %for.body5, %for.cond3.preheader
97+
%mul = shl nsw i32 %C, 1
98+
%add = add nsw i32 %mul, 20, !spirv.Decorations !14
99+
%cmp1227 = icmp slt i32 %add, %C
100+
br i1 %cmp1227, label %for.body13, label %for.cond19.preheader
101+
102+
for.body5: ; preds = %for.cond3.preheader, %for.body5
103+
%i2.030 = phi i32 [ %sub, %for.body5 ], [ 15, %for.cond3.preheader ]
104+
%1 = load i32, i32 addrspace(1)* %B, align 4
105+
store i32 %1, i32 addrspace(1)* %A, align 4
106+
%sub = add nsw i32 %i2.030, -4
107+
%cmp4 = icmp sgt i32 %sub, %C
108+
br i1 %cmp4, label %for.body5, label %for.cond11.preheader
109+
110+
for.cond19.preheader: ; preds = %for.body13, %for.cond11.preheader
111+
%arrayidx20 = getelementptr inbounds i32, i32 addrspace(1)* %B, i64 30
112+
%2 = load i32, i32 addrspace(1)* %arrayidx20, align 4
113+
%cmp2125 = icmp sgt i32 %2, 0
114+
br i1 %cmp2125, label %for.body22, label %for.body31.preheader
115+
116+
for.body31.preheader: ; preds = %for.body22, %for.cond19.preheader
117+
br label %for.body31
118+
119+
for.body13: ; preds = %for.cond11.preheader, %for.body13
120+
%i10.028 = phi i32 [ %dec, %for.body13 ], [ %C, %for.cond11.preheader ]
121+
%3 = load i32, i32 addrspace(1)* %B, align 4
122+
store i32 %3, i32 addrspace(1)* %A, align 4
123+
%dec = add nsw i32 %i10.028, -1, !spirv.Decorations !14
124+
%cmp12 = icmp sgt i32 %dec, %add
125+
br i1 %cmp12, label %for.body13, label %for.cond19.preheader
126+
127+
for.body22: ; preds = %for.cond19.preheader, %for.body22
128+
%i18.026 = phi i32 [ %inc26, %for.body22 ], [ 0, %for.cond19.preheader ]
129+
%4 = load i32, i32 addrspace(1)* %B, align 4
130+
store i32 %4, i32 addrspace(1)* %A, align 4
131+
%inc26 = add nuw nsw i32 %i18.026, 1, !spirv.Decorations !14
132+
%5 = load i32, i32 addrspace(1)* %arrayidx20, align 4
133+
%cmp21 = icmp slt i32 %inc26, %5
134+
br i1 %cmp21, label %for.body22, label %for.body31.preheader
135+
136+
for.cond38.preheader: ; preds = %for.body31
137+
%cmp3922 = icmp slt i32 %C, %D
138+
br i1 %cmp3922, label %for.body40, label %for.end45
139+
140+
for.body31: ; preds = %for.body31.preheader, %for.body31
141+
%i28.024 = phi i32 [ %add35, %for.body31 ], [ 0, %for.body31.preheader ]
142+
%6 = load i32, i32 addrspace(1)* %B, align 4
143+
store i32 %6, i32 addrspace(1)* %A, align 4
144+
%add35 = add nuw nsw i32 %i28.024, 2, !spirv.Decorations !14
145+
%cmp30 = icmp ult i32 %i28.024, 254
146+
br i1 %cmp30, label %for.body31, label %for.cond38.preheader
147+
148+
for.body40: ; preds = %for.cond38.preheader, %for.body40
149+
%i37.023 = phi i32 [ %inc44, %for.body40 ], [ %C, %for.cond38.preheader ]
150+
%7 = load i32, i32 addrspace(1)* %B, align 4
151+
store i32 %7, i32 addrspace(1)* %A, align 4
152+
%inc44 = add nsw i32 %i37.023, 1, !spirv.Decorations !14
153+
%exitcond.not = icmp eq i32 %inc44, %D
154+
br i1 %exitcond.not, label %for.end45, label %for.body40
155+
156+
for.end45: ; preds = %for.body40, %for.cond38.preheader
157+
ret void
158+
}
159+
160+
attributes #0 = { noinline nounwind "VCFunction" "VCNamedBarrierCount"="0" "VCSLMSize"="0" }
161+
162+
!spirv.MemoryModel = !{!0}
163+
!opencl.enable.FP_CONTRACT = !{}
164+
!spirv.Source = !{!1}
165+
!opencl.spir.version = !{!2}
166+
!opencl.ocl.version = !{!1}
167+
!opencl.used.extensions = !{!3}
168+
!opencl.used.optional.core.features = !{!3}
169+
!spirv.Generator = !{!4}
170+
171+
!0 = !{i32 2, i32 2}
172+
!1 = !{i32 0, i32 0}
173+
!2 = !{i32 1, i32 2}
174+
!3 = !{}
175+
!4 = !{i16 6, i16 14}
176+
!5 = !{!6, !6, !6, !6}
177+
!6 = !{!7}
178+
!7 = !{i32 5625, i32 0}
179+
!8 = !{i32 1}
180+
!9 = !{!10}
181+
!10 = !{i32 44, i32 8}
182+
!11 = !{!12}
183+
!12 = !{i32 44, i32 4}
184+
!13 = !{!14}
185+
!14 = !{i32 4469}
Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
;=========================== begin_copyright_notice ============================
2+
;
3+
; Copyright (C) 2024 Intel Corporation
4+
;
5+
; SPDX-License-Identifier: MIT
6+
;
7+
;============================ end_copyright_notice =============================
8+
9+
; REQUIRES: regkeys, pvc-supported
10+
; RUN: llvm-as %s -o %t.bc
11+
; RUN: ocloc -device pvc -llvm_input -options "-vc-codegen -ze-collect-cost-info -igc_opts 'ShaderDumpEnable=1, DumpToCustomDir=%t'" -output_no_suffix -file %t.bc
12+
; RUN: cat %t/*.zeinfo | FileCheck %s
13+
14+
target datalayout = "e-p:64:64-p3:32:32-p6:32:32-i64:64-n8:16:32:64"
15+
target triple = "genx64-unknown-unknown"
16+
17+
; CHECK: kernels_cost_info:
18+
; CHECK-NEXT: - name: kernel
19+
20+
; CHECK: kcm_args_sym:
21+
; CHECK-NEXT: - argNo: 2
22+
; CHECK-NEXT: byteOffset: 0
23+
; CHECK-NEXT: sizeInBytes: 4
24+
; CHECK-NEXT: isInDirect: false
25+
26+
; CHECK: kcm_loop_count_exps:
27+
; CHECK-NEXT: - factor: 0.25
28+
; CHECK-NEXT: argsym_index: 0
29+
; CHECK-NEXT: C: 0
30+
; CHECK-NEXT: - factor: 0.0625
31+
; CHECK-NEXT: argsym_index: 0
32+
; CHECK-NEXT: C: 0
33+
; CHECK-NEXT: - factor: 0
34+
; CHECK-NEXT: argsym_index: -1
35+
; CHECK-NEXT: C: 225
36+
; CHECK-NEXT: - factor: 1
37+
; CHECK-NEXT: argsym_index: 0
38+
; CHECK-NEXT: C: 0
39+
; CHECK-NEXT: - factor: 0
40+
; CHECK-NEXT: argsym_index: -1
41+
; CHECK-NEXT: C: 0
42+
43+
; COM: The loop costs are estimated by finalizer. Only verify that
44+
; COM: the number of blocks equals the number of loops + 1.
45+
; CHECK: Kcm_loop_costs:
46+
; CHECK-NEXT: - cycle: {{.*}}
47+
; CHECK-NEXT: bytes_loaded: {{.*}}
48+
; CHECK-NEXT: bytes_stored: {{.*}}
49+
; CHECK-NEXT: num_loops: {{.*}}
50+
; CHECK-COUNT-5: - cycle: {{.*}}
51+
52+
; COM: IR represents the following kernel:
53+
; COM: kernel(__global unsigned *A, __global unsigned *B, unsigned C) {
54+
; COM: for (unsigned i = 0; i < C; i += 4)
55+
; COM: A[i] = B[i];
56+
; COM: for (unsigned i = 0; i < C; i += 16)
57+
; COM: A[i] = B[i];
58+
; COM: for (unsigned i = 32; i <= 256; ++i)
59+
; COM: A[i] = B[i];
60+
; COM: for (unsigned i = 0; i < C; ++i)
61+
; COM: for (unsigned j = i; j < 32; ++j)
62+
; COM: A[i] = B[j];
63+
; COM: }
64+
65+
define spir_kernel void @kernel(i32 addrspace(1)* "VCArgumentIOKind"="0" %A, i32 addrspace(1)* "VCArgumentIOKind"="0" %B, i32 "VCArgumentIOKind"="0" %C) #0 !spirv.ParameterDecorations !5 !intel_reqd_sub_group_size !8 {
66+
entry:
67+
%cmp31.not = icmp eq i32 %C, 0
68+
br i1 %cmp31.not, label %for.body17.preheader, label %for.body
69+
70+
for.body17.preheader: ; preds = %for.body6, %entry
71+
br label %for.body17
72+
73+
for.body: ; preds = %entry, %for.body
74+
%i.032 = phi i32 [ %add, %for.body ], [ 0, %entry ]
75+
%idxprom = zext i32 %i.032 to i64
76+
%arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %B, i64 %idxprom
77+
%0 = load i32, i32 addrspace(1)* %arrayidx, align 4
78+
%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %A, i64 %idxprom
79+
store i32 %0, i32 addrspace(1)* %arrayidx2, align 4
80+
%add = add i32 %i.032, 4
81+
%cmp = icmp ult i32 %add, %C
82+
br i1 %cmp, label %for.body, label %for.body6
83+
84+
for.body6: ; preds = %for.body, %for.body6
85+
%i3.030 = phi i32 [ %add12, %for.body6 ], [ 0, %for.body ]
86+
%idxprom7 = zext i32 %i3.030 to i64
87+
%arrayidx8 = getelementptr inbounds i32, i32 addrspace(1)* %B, i64 %idxprom7
88+
%1 = load i32, i32 addrspace(1)* %arrayidx8, align 4
89+
%arrayidx10 = getelementptr inbounds i32, i32 addrspace(1)* %A, i64 %idxprom7
90+
store i32 %1, i32 addrspace(1)* %arrayidx10, align 4
91+
%add12 = add i32 %i3.030, 16
92+
%cmp5 = icmp ult i32 %add12, %C
93+
br i1 %cmp5, label %for.body6, label %for.body17.preheader
94+
95+
for.cond25.preheader: ; preds = %for.body17
96+
br i1 %cmp31.not, label %for.end40, label %for.cond28.preheader.lr.ph
97+
98+
for.cond28.preheader.lr.ph: ; preds = %for.cond25.preheader
99+
%wide.trip.count = zext i32 %C to i64
100+
br label %for.cond28.preheader
101+
102+
for.body17: ; preds = %for.body17.preheader, %for.body17
103+
%indvars.iv36 = phi i64 [ %indvars.iv.next37, %for.body17 ], [ 32, %for.body17.preheader ]
104+
%arrayidx19 = getelementptr inbounds i32, i32 addrspace(1)* %B, i64 %indvars.iv36
105+
%2 = load i32, i32 addrspace(1)* %arrayidx19, align 4
106+
%arrayidx21 = getelementptr inbounds i32, i32 addrspace(1)* %A, i64 %indvars.iv36
107+
store i32 %2, i32 addrspace(1)* %arrayidx21, align 4
108+
%indvars.iv.next37 = add nuw nsw i64 %indvars.iv36, 1
109+
%exitcond38.not = icmp eq i64 %indvars.iv.next37, 257
110+
br i1 %exitcond38.not, label %for.cond25.preheader, label %for.body17
111+
112+
for.cond28.preheader: ; preds = %for.cond28.preheader.lr.ph, %for.inc38
113+
%indvars.iv = phi i64 [ 0, %for.cond28.preheader.lr.ph ], [ %indvars.iv.next, %for.inc38 ]
114+
%cmp2923 = icmp ult i64 %indvars.iv, 32
115+
br i1 %cmp2923, label %for.body30.lr.ph, label %for.inc38
116+
117+
for.body30.lr.ph: ; preds = %for.cond28.preheader
118+
%arrayidx34 = getelementptr inbounds i32, i32 addrspace(1)* %A, i64 %indvars.iv
119+
br label %for.body30
120+
121+
for.body30: ; preds = %for.body30.lr.ph, %for.body30
122+
%indvars.iv33 = phi i64 [ %indvars.iv, %for.body30.lr.ph ], [ %indvars.iv.next34, %for.body30 ]
123+
%arrayidx32 = getelementptr inbounds i32, i32 addrspace(1)* %B, i64 %indvars.iv33
124+
%3 = load i32, i32 addrspace(1)* %arrayidx32, align 4
125+
store i32 %3, i32 addrspace(1)* %arrayidx34, align 4
126+
%indvars.iv.next34 = add nuw nsw i64 %indvars.iv33, 1
127+
%lftr.wideiv1 = trunc i64 %indvars.iv.next34 to i32
128+
%exitcond = icmp eq i32 %lftr.wideiv1, 32
129+
br i1 %exitcond, label %for.inc38, label %for.body30
130+
131+
for.inc38: ; preds = %for.body30, %for.cond28.preheader
132+
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
133+
%exitcond35.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
134+
br i1 %exitcond35.not, label %for.end40, label %for.cond28.preheader
135+
136+
for.end40: ; preds = %for.inc38, %for.cond25.preheader
137+
ret void
138+
}
139+
140+
attributes #0 = { noinline nounwind "VCFunction" "VCNamedBarrierCount"="0" "VCSLMSize"="0" }
141+
142+
!spirv.MemoryModel = !{!0}
143+
!opencl.enable.FP_CONTRACT = !{}
144+
!spirv.Source = !{!1}
145+
!opencl.spir.version = !{!2}
146+
!opencl.ocl.version = !{!1}
147+
!opencl.used.extensions = !{!3}
148+
!opencl.used.optional.core.features = !{!3}
149+
!spirv.Generator = !{!4}
150+
151+
!0 = !{i32 2, i32 2}
152+
!1 = !{i32 0, i32 0}
153+
!2 = !{i32 1, i32 2}
154+
!3 = !{}
155+
!4 = !{i16 6, i16 14}
156+
!5 = !{!6, !6, !6}
157+
!6 = !{!7}
158+
!7 = !{i32 5625, i32 0}
159+
!8 = !{i32 1}
160+
!9 = !{!10}
161+
!10 = !{i32 44, i32 8}
162+
!11 = !{!12}
163+
!12 = !{i32 44, i32 4}

0 commit comments

Comments
 (0)