Skip to content

Commit 2f315fb

Browse files
karolzwolakigcbot
authored andcommitted
PrivateMemoryResolution: Do not inline non-stack locations on O2
With optimizations enabled (O2) there's no guarantee that the offsets in registers will live throughout the entire variable lifetime. Therefore we cannot inline them. That means we shouldn't attach "StorageOffset" metadata to the instructions in this case.
1 parent 08398d5 commit 2f315fb

File tree

3 files changed

+216
-8
lines changed

3 files changed

+216
-8
lines changed

IGC/Compiler/Optimizer/OpenCLPasses/PrivateMemory/PrivateMemoryResolution.cpp

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -998,6 +998,9 @@ bool PrivateMemoryResolution::resolveAllocaInstructions(bool privateOnStack) {
998998
Value *stackAlloca = builder.CreateCall(stackAllocaFunc, totalOffset, VALUE_NAME("stackAlloca"));
999999
privateBuffer =
10001000
builder.CreatePointerCast(stackAlloca, pAI->getType(), VALUE_NAME(pAI->getName() + ".privateBuffer"));
1001+
1002+
// Attaching this metadata is crucial both to properly interpret this location as stack based and to inline it.
1003+
// Because these are stack locations we can safely inline them even with optimizations disabled (O0).
10011004
auto DbgUses = llvm::FindDbgAddrUses(pAI);
10021005
for (auto Use : DbgUses) {
10031006
if (auto DbgDcl = dyn_cast_or_null<DbgDeclareInst>(Use)) {
@@ -1297,14 +1300,19 @@ bool PrivateMemoryResolution::resolveAllocaInstructions(bool privateOnStack) {
12971300
Value *privateBuffer =
12981301
builder.CreatePointerCast(bufferBase, pAI->getType(), VALUE_NAME(pAI->getName() + ".privateBuffer"));
12991302

1300-
auto DbgUses = llvm::FindDbgAddrUses(pAI);
1301-
for (auto Use : DbgUses) {
1302-
if (auto DbgDcl = dyn_cast_or_null<DbgDeclareInst>(Use)) {
1303-
// Attach metadata to instruction containing offset of storage
1304-
unsigned int scalarBufferOffset = m_ModAllocaInfo->getBufferOffset(pAI);
1305-
auto OffsetMD =
1306-
MDNode::get(builder.getContext(), ConstantAsMetadata::get(builder.getInt32(scalarBufferOffset)));
1307-
DbgDcl->setMetadata("StorageOffset", OffsetMD);
1303+
// Attaching this metadata will make this location be inlined.
1304+
// We can only safely inline such locations with optimizations disabled.
1305+
// On O2 we have no guarantee the offsets in registers will remain valid throughout the entire variable lifetime.
1306+
if (modMD->compOpt.OptDisable) {
1307+
auto DbgUses = llvm::FindDbgAddrUses(pAI);
1308+
for (auto Use : DbgUses) {
1309+
if (auto DbgDcl = dyn_cast_or_null<DbgDeclareInst>(Use)) {
1310+
// Attach metadata to instruction containing offset of storage
1311+
unsigned int scalarBufferOffset = m_ModAllocaInfo->getBufferOffset(pAI);
1312+
auto OffsetMD =
1313+
MDNode::get(builder.getContext(), ConstantAsMetadata::get(builder.getInt32(scalarBufferOffset)));
1314+
DbgDcl->setMetadata("StorageOffset", OffsetMD);
1315+
}
13081316
}
13091317
}
13101318

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
;=========================== begin_copyright_notice ============================
2+
;
3+
; Copyright (C) 2024-2025 Intel Corporation
4+
;
5+
; SPDX-License-Identifier: MIT
6+
;
7+
;============================ end_copyright_notice =============================
8+
9+
; Tests that non-stack variables don't have StorageOffset metadata attached that would cause them to get inlined.
10+
; With optimizations enabled there's no guarantee that the offsets in registers will live throughout the entire variable lifetime.
11+
12+
; REQUIRES: regkeys, llvm-16-plus
13+
14+
; LLVM with opaque pointers:
15+
; RUN: igc_opt --opaque-pointers --igc-private-mem-resolution --platformdg2 --regkey EnableOpaquePointersBackend=1 -S %s | FileCheck %s
16+
17+
; CHECK: call void @llvm.dbg.declare(metadata ptr %{{[.a-zA-Z0-9]+}}, metadata {{!?[0-9]*}}, metadata !DIExpression(DW_OP_constu, 4, DW_OP_swap, DW_OP_xderef)), !dbg !{{[0-9]+}}
18+
; CHECK-NOT: !StorageOffset
19+
20+
source_filename = "dont_inline_non_stack_vars_with_optimizations.ll"
21+
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-n8:16:32"
22+
target triple = "spir64-unknown-unknown"
23+
24+
; Function Attrs: convergent
25+
define spir_kernel void @foo(i64 %0, <8 x i32> %r0, <3 x i32> %globalOffset, <3 x i32> %enqueuedLocalSize, i16 %localIdX, i16 %localIdY, i16 %localIdZ, ptr %privateBase) #0 {
26+
%globalOffset.scalar = extractelement <3 x i32> %globalOffset, i64 0
27+
%enqueuedLocalSize.scalar = extractelement <3 x i32> %enqueuedLocalSize, i64 0
28+
%r0.scalar4 = extractelement <8 x i32> %r0, i64 1
29+
%2 = alloca [81 x float], align 4, !user_as_priv !483
30+
call void @llvm.dbg.declare(metadata ptr %2, metadata !484, metadata !DIExpression(DW_OP_constu, 4, DW_OP_swap, DW_OP_xderef)), !dbg !492
31+
%3 = mul i32 %enqueuedLocalSize.scalar, %r0.scalar4, !dbg !493
32+
%4 = zext i16 %localIdX to i32, !dbg !493
33+
%5 = add i32 %3, %4, !dbg !493
34+
%6 = add i32 %5, %globalOffset.scalar, !dbg !493
35+
%7 = icmp sgt i32 %6, 0
36+
br i1 %7, label %9, label %8
37+
38+
8: ; preds = %1
39+
store i8 0, ptr %2, align 4, !user_as_priv !494
40+
%.phi.trans.insert = getelementptr [81 x float], ptr %2, i64 0, i64 %0, !user_as_priv !483
41+
%.pre = load float, ptr %.phi.trans.insert, align 4, !user_as_priv !494
42+
store float %.pre, ptr addrspace(1) null, align 4294967296
43+
br label %.loopexit2
44+
45+
.loopexit2: ; preds = %.loopexit2, %8
46+
br label %.loopexit2
47+
48+
9: ; preds = %1
49+
ret void
50+
}
51+
52+
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
53+
declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
54+
55+
; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none)
56+
declare spir_func i32 @__builtin_IB_get_group_id(i32 noundef) local_unnamed_addr #2
57+
58+
; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none)
59+
declare spir_func i32 @__builtin_IB_get_enqueued_local_size(i32 noundef) local_unnamed_addr #2
60+
61+
; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none)
62+
declare spir_func i32 @__builtin_IB_get_local_id_x() local_unnamed_addr #2
63+
64+
; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none)
65+
declare spir_func i32 @__builtin_IB_get_global_offset(i32 noundef) local_unnamed_addr #2
66+
67+
declare i32 @printf(ptr addrspace(2), ...)
68+
69+
attributes #0 = { convergent }
70+
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
71+
attributes #2 = { convergent mustprogress nofree nounwind willreturn memory(none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
72+
73+
!llvm.module.flags = !{!0, !1}
74+
!llvm.dbg.cu = !{!2}
75+
!igc.functions = !{!12}
76+
!IGCMetadata = !{!23}
77+
78+
!0 = !{i32 2, !"Debug Info Version", i32 3}
79+
!1 = !{i32 1, !"wchar_size", i32 4}
80+
!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang based Intel(R) oneAPI DPC++/C++ Compiler 2026.0.0 (2026.x.0.20250920)", isOptimized: false, flags: " --driver-mode=g++ --intel -c -O2 -g -fiopenmp --offload-targets=spir64 -fno-exceptions -D GPU_OFFLOAD_NOEH -D GPU_OFFLOAD_WA_CLANG -D GPU_OFFLOAD_WA_DT convolution.cpp -fveclib=SVML -shared -fPIC", runtimeVersion: 0, emissionKind: FullDebug, globals: !4, imports: !11)
81+
!3 = !DIFile(filename: "convolution.cpp", directory: "/netbatch/alTC98419_00/exp/ompo_kernels_small_gpuCpp~2-2/opt_speed_debug_igen")
82+
!4 = !{!5, !9}
83+
!5 = !DIGlobalVariableExpression(var: !6, expr: !DIExpression())
84+
!6 = distinct !DIGlobalVariable(name: "src_tile_width", scope: !2, file: !3, line: 464, type: !7, isLocal: true, isDefinition: true)
85+
!7 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !8)
86+
!8 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
87+
!9 = !DIGlobalVariableExpression(var: !10, expr: !DIExpression())
88+
!10 = distinct !DIGlobalVariable(name: "src_tile_h_offset", scope: !2, file: !3, line: 463, type: !7, isLocal: true, isDefinition: true)
89+
!11 = !{}
90+
!12 = !{ptr @foo, !13}
91+
!13 = !{!14, !15}
92+
!14 = !{!"function_type", i32 0}
93+
!15 = !{!"implicit_arg_desc", !16, !17, !18, !19, !20, !21, !22}
94+
!16 = !{i32 0}
95+
!17 = !{i32 2}
96+
!18 = !{i32 7}
97+
!19 = !{i32 8}
98+
!20 = !{i32 9}
99+
!21 = !{i32 10}
100+
!22 = !{i32 13}
101+
!23 = !{!"ModuleMD", !25}
102+
!25 = !{!"compOpt", !29}
103+
!29 = !{!"OptDisable", i1 false}
104+
!483 = !{!""}
105+
!484 = !DILocalVariable(name: "test_filter_tile", scope: !485, file: !3, line: 454, type: !490)
106+
!485 = distinct !DILexicalBlock(scope: !486, file: !3, line: 452, column: 196)
107+
!486 = distinct !DILexicalBlock(scope: !487, file: !3, line: 447, column: 196)
108+
!487 = distinct !DISubprogram(name: "_ZN19ConvolutionLocalsAN15execute_offloadEPfi.extracted", scope: null, file: !3, line: 447, type: !488, scopeLine: 447, flags: DIFlagArtificial, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized | DISPFlagMainSubprogram, unit: !2, templateParams: !11, retainedNodes: !11)
109+
!488 = !DISubroutineType(types: !489)
110+
!489 = !{null}
111+
!490 = !DICompositeType(tag: DW_TAG_array_type, baseType: !491, size: 2592, elements: !11)
112+
!491 = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float)
113+
!492 = !DILocation(line: 454, column: 15, scope: !485)
114+
!493 = !DILocation(line: 452, column: 1, scope: !485)
115+
!494 = !{!"CannotUseSOALayout"}
116+
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
;=========================== begin_copyright_notice ============================
2+
;
3+
; Copyright (C) 2024-2025 Intel Corporation
4+
;
5+
; SPDX-License-Identifier: MIT
6+
;
7+
;============================ end_copyright_notice =============================
8+
9+
; Tests that non stack variables on O2 aren't inlined.
10+
; With optimizations enabled there's no guarantee that the offsets in registers will live throughout the entire variable lifetime.
11+
12+
; REQUIRES: regkeys, oneapi-readelf, llvm-16-plus
13+
14+
; LLVM with opaque pointers:
15+
; RUN: llvm-as -opaque-pointers=1 %s -o %t
16+
; RUN: ocloc compile -llvm_input -file %t -device dg2 -options "-g -igc_opts 'EnableOpaquePointersBackend=1, ElfDumpEnable=1, DumpUseShorterName=0, DebugDumpNamePrefix=%t_'"
17+
; RUN: oneapi-readelf --debug-dump %t_OCL_simd32_foo.elf | FileCheck %s
18+
19+
; CHECK: DW_AT_name : test_filter_tile
20+
; CHECK-NEXT: DW_AT_decl_file : 1
21+
; CHECK-NEXT: DW_AT_decl_line : 454
22+
; CHECK-NEXT: DW_AT_type : {{.*}}
23+
; CHECK-NEXT: DW_AT_location : 0 (location list)
24+
25+
; CHECK: Contents of the .debug_loc section:
26+
; CHECK: {{0+}} {{[0-9A-Fa-f]+}} {{[0-9A-Fa-f]+}} (DW_OP_INTEL_push_simd_lane; DW_OP_lit16; DW_OP_ge; DW_OP_bra: 16; DW_OP_INTEL_push_simd_lane; DW_OP_lit2; DW_OP_shr; DW_OP_plus_uconst: {{[0-9]+}}; DW_OP_INTEL_push_simd_lane; DW_OP_lit3; DW_OP_and; DW_OP_const1u: 64; DW_OP_mul; DW_OP_INTEL_regval_bits: 64; DW_OP_skip: 15; DW_OP_INTEL_push_simd_lane; DW_OP_lit16; DW_OP_minus; DW_OP_lit2; DW_OP_shr; DW_OP_plus_uconst: {{[0-9]+}}; DW_OP_INTEL_push_simd_lane; DW_OP_lit3; DW_OP_and; DW_OP_const1u: 64; DW_OP_mul; DW_OP_INTEL_regval_bits: 64)
27+
28+
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
29+
target triple = "spir64-unknown-unknown"
30+
31+
declare spir_func i64 @_Z13get_global_idj(i32)
32+
33+
define spir_kernel void @foo(i64 %0) {
34+
%2 = alloca [81 x float], align 4
35+
call void @llvm.dbg.declare(metadata ptr %2, metadata !11, metadata !DIExpression(DW_OP_constu, 4, DW_OP_swap, DW_OP_xderef)), !dbg !19
36+
%3 = call spir_func i64 @_Z13get_global_idj(i32 0), !dbg !20
37+
%4 = trunc i64 %3 to i32
38+
%5 = icmp sgt i32 %4, 0
39+
br i1 %5, label %7, label %6
40+
41+
6: ; preds = %1
42+
store i8 0, ptr %2, align 4
43+
%.phi.trans.insert = getelementptr [81 x float], ptr %2, i64 0, i64 %0
44+
%.pre = load float, ptr %.phi.trans.insert, align 4
45+
br label %.loopexit2
46+
47+
.loopexit2: ; preds = %.loopexit2, %6
48+
store float %.pre, ptr addrspace(1) null, align 4294967296
49+
br label %.loopexit2
50+
51+
7: ; preds = %1
52+
ret void
53+
}
54+
55+
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
56+
declare void @llvm.dbg.declare(metadata, metadata, metadata) #0
57+
58+
attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
59+
60+
!llvm.module.flags = !{!0}
61+
!llvm.dbg.cu = !{!1}
62+
63+
!0 = !{i32 2, !"Debug Info Version", i32 3}
64+
!1 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !2, producer: "clang based Intel(R) oneAPI DPC++/C++ Compiler 2026.0.0 (2026.x.0.20250920)", isOptimized: false, flags: " --driver-mode=g++ --intel -c -O2 -g -fiopenmp --offload-targets=spir64 -fno-exceptions -D GPU_OFFLOAD_NOEH -D GPU_OFFLOAD_WA_CLANG -D GPU_OFFLOAD_WA_DT convolution.cpp -fveclib=SVML -shared -fPIC", runtimeVersion: 0, emissionKind: FullDebug, globals: !3, imports: !10)
65+
!2 = !DIFile(filename: "convolution.cpp", directory: "/netbatch/alTC98419_00/exp/ompo_kernels_small_gpuCpp~2-2/opt_speed_debug_igen")
66+
!3 = !{!4, !8}
67+
!4 = !DIGlobalVariableExpression(var: !5, expr: !DIExpression())
68+
!5 = distinct !DIGlobalVariable(name: "src_tile_width", scope: !1, file: !2, line: 464, type: !6, isLocal: true, isDefinition: true)
69+
!6 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !7)
70+
!7 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
71+
!8 = !DIGlobalVariableExpression(var: !9, expr: !DIExpression())
72+
!9 = distinct !DIGlobalVariable(name: "src_tile_h_offset", scope: !1, file: !2, line: 463, type: !6, isLocal: true, isDefinition: true)
73+
!10 = !{}
74+
!11 = !DILocalVariable(name: "test_filter_tile", scope: !12, file: !2, line: 454, type: !17)
75+
!12 = distinct !DILexicalBlock(scope: !13, file: !2, line: 452, column: 196)
76+
!13 = distinct !DILexicalBlock(scope: !14, file: !2, line: 447, column: 196)
77+
!14 = distinct !DISubprogram(name: "_ZN19ConvolutionLocalsAN15execute_offloadEPfi.extracted", scope: null, file: !2, line: 447, type: !15, scopeLine: 447, flags: DIFlagArtificial, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized | DISPFlagMainSubprogram, unit: !1, templateParams: !10, retainedNodes: !10)
78+
!15 = !DISubroutineType(types: !16)
79+
!16 = !{null}
80+
!17 = !DICompositeType(tag: DW_TAG_array_type, baseType: !18, size: 2592, elements: !10)
81+
!18 = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float)
82+
!19 = !DILocation(line: 454, column: 15, scope: !12)
83+
!20 = !DILocation(line: 452, column: 1, scope: !12)
84+

0 commit comments

Comments
 (0)