Skip to content

Commit dc84d72

Browse files
authored
Fix recursive zero replacement bottoming out in scalars. (microsoft#6516)
During memcpy replacement, the scalarrepl-param-hlsl pass will replace certain uses of a zero-initialized global variable. The pass uses a recursive algorithm to replace uses of that global with a zero value. The current code expects the recursion to bottom out in an aggregate type, but it can also bottom out in a scalar integer or scalar float. This patch fixes the scalar cases. Lit-based pass tests are included.
1 parent b065a0d commit dc84d72

File tree

3 files changed

+272
-1
lines changed

3 files changed

+272
-1
lines changed

lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3693,7 +3693,9 @@ static bool ReplaceUseOfZeroInit(Instruction *I, Value *V, DominatorTree &DT,
36933693
if (ReplaceUseOfZeroInit(I, UI, DT, Reachable))
36943694
continue;
36953695
} else if (LoadInst *LI = dyn_cast<LoadInst>(UI)) {
3696-
LI->replaceAllUsesWith(ConstantAggregateZero::get(LI->getType()));
3696+
// Replace uses of the load with a constant zero.
3697+
Constant *replacement = Constant::getNullValue(LI->getType());
3698+
LI->replaceAllUsesWith(replacement);
36973699
LI->eraseFromParent();
36983700
continue;
36993701
}
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
; RUN: %dxopt %s -hlsl-passes-resume -scalarrepl-param-hlsl -S | FileCheck %s
2+
3+
; The pass replaces a memcpy from a zero-initialized global that does not have an intervening store.
4+
; When tracing through geps and bitcasts of uses of that global, the algorithm might
5+
; bottom out at replacing a load of a scalar float. Verify this works.
6+
7+
; In the following code, %2 should be replaced by float 0.0
8+
; %2 = load float, float* %src_in_g,...
9+
; It only has one use: being stored to one of the elements of @g_1
10+
11+
; CHECK: for.body.i:
12+
; CHECK: [[DEST:%[a-z0-9\.]+]] = getelementptr inbounds [10 x float], [10 x float]* @g_1, i32 0
13+
; CHECK: store float 0.000000e+00, float* [[DEST]]
14+
; CHECK: end.block:
15+
16+
17+
target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
18+
target triple = "dxil-ms-dx"
19+
20+
%struct.ByteAddressBuffer = type { i32 }
21+
%ConstantBuffer = type opaque
22+
%struct.PSOut = type { <4 x float> }
23+
24+
@"\01?g_2@@3UByteAddressBuffer@@A" = external global %struct.ByteAddressBuffer, align 4
25+
@g = internal global [10 x float] zeroinitializer, align 4
26+
@g_1 = internal global [10 x float] zeroinitializer, align 4
27+
@"$Globals" = external constant %ConstantBuffer
28+
29+
; Function Attrs: nounwind
30+
define void @frag_main(%struct.PSOut* noalias sret %agg.result) #0 {
31+
entry:
32+
%i.i = alloca i32, align 4
33+
%copy.i = alloca [10 x float], align 4
34+
%wrapper_result = alloca %struct.PSOut, align 4
35+
store i32 0, i32* %i.i, align 4, !dbg !23, !tbaa !29 ; line:10 col:12
36+
br label %for.cond.i, !dbg !33 ; line:10 col:8
37+
38+
for.cond.i: ; preds = %for.body.i, %entry
39+
%0 = load i32, i32* %i.i, align 4, !dbg !34, !tbaa !29 ; line:10 col:19
40+
%cmp.i = icmp slt i32 %0, 10, !dbg !35 ; line:10 col:21
41+
br i1 %cmp.i, label %for.body.i, label %end.block, !dbg !36 ; line:10 col:3
42+
43+
for.body.i: ; preds = %for.cond.i
44+
%1 = load i32, i32* %i.i, align 4, !dbg !37, !tbaa !29 ; line:11 col:16
45+
%src_in_g = getelementptr inbounds [10 x float], [10 x float]* @g, i32 0, i32 %1, !dbg !38 ; line:11 col:14
46+
%2 = load float, float* %src_in_g, align 4, !dbg !38, !tbaa !39 ; line:11 col:14
47+
%3 = load i32, i32* %i.i, align 4, !dbg !41, !tbaa !29 ; line:11 col:9
48+
%dest = getelementptr inbounds [10 x float], [10 x float]* @g_1, i32 0, i32 %3, !dbg !42 ; line:11 col:5
49+
store float %2, float* %dest, align 4, !dbg !43, !tbaa !39 ; line:11 col:12
50+
%4 = load i32, i32* %i.i, align 4, !dbg !44, !tbaa !29 ; line:10 col:28
51+
%inc.i = add nsw i32 %4, 1, !dbg !44 ; line:10 col:28
52+
store i32 %inc.i, i32* %i.i, align 4, !dbg !44, !tbaa !29 ; line:10 col:28
53+
br label %for.cond.i, !dbg !36 ; line:10 col:3
54+
55+
end.block: ; preds = %for.cond.i
56+
%5 = bitcast [10 x float]* %copy.i to i8*, !dbg !45 ; line:13 col:20
57+
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %5, i8* bitcast ([10 x float]* @g to i8*), i64 40, i32 1, i1 false) #0, !dbg !45 ; line:13 col:20
58+
%6 = bitcast [10 x float]* %copy.i to i8*, !dbg !46 ; line:14 col:7
59+
call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast ([10 x float]* @g to i8*), i8* %6, i64 40, i32 1, i1 false) #0, !dbg !46 ; line:14 col:7
60+
%value = getelementptr inbounds %struct.PSOut, %struct.PSOut* %wrapper_result, i32 0, i32 0, !dbg !47 ; line:20 col:18
61+
store <4 x float> zeroinitializer, <4 x float>* %value, align 4, !dbg !48, !tbaa !49 ; line:20 col:24
62+
%7 = bitcast %struct.PSOut* %agg.result to i8*, !dbg !50 ; line:21 col:10
63+
%8 = bitcast %struct.PSOut* %wrapper_result to i8*, !dbg !50 ; line:21 col:10
64+
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %7, i8* %8, i64 16, i32 1, i1 false), !dbg !50 ; line:21 col:10
65+
ret void, !dbg !51 ; line:21 col:3
66+
}
67+
68+
; Function Attrs: nounwind
69+
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #0
70+
71+
attributes #0 = { nounwind }
72+
73+
!llvm.module.flags = !{!0}
74+
!pauseresume = !{!1}
75+
!llvm.ident = !{!2}
76+
!dx.version = !{!3}
77+
!dx.valver = !{!4}
78+
!dx.shaderModel = !{!5}
79+
!dx.typeAnnotations = !{!6, !9}
80+
!dx.entryPoints = !{!14}
81+
!dx.fnprops = !{!20}
82+
!dx.options = !{!21, !22}
83+
84+
!0 = !{i32 2, !"Debug Info Version", i32 3}
85+
!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"}
86+
!2 = !{!"dxc(private) 1.8.0.14549 (main, 0781ded87-dirty)"}
87+
!3 = !{i32 1, i32 0}
88+
!4 = !{i32 1, i32 8}
89+
!5 = !{!"ps", i32 6, i32 0}
90+
!6 = !{i32 0, %struct.PSOut undef, !7}
91+
!7 = !{i32 16, !8}
92+
!8 = !{i32 6, !"value", i32 3, i32 0, i32 4, !"SV_Target0", i32 7, i32 9}
93+
!9 = !{i32 1, void (%struct.PSOut*)* @frag_main, !10}
94+
!10 = !{!11, !13}
95+
!11 = !{i32 0, !12, !12}
96+
!12 = !{}
97+
!13 = !{i32 1, !12, !12}
98+
!14 = !{void (%struct.PSOut*)* @frag_main, !"frag_main", null, !15, null}
99+
!15 = !{!16, null, !18, null}
100+
!16 = !{!17}
101+
!17 = !{i32 0, %struct.ByteAddressBuffer* @"\01?g_2@@3UByteAddressBuffer@@A", !"g_2", i32 0, i32 0, i32 1, i32 11, i32 0, null}
102+
!18 = !{!19}
103+
!19 = !{i32 0, %ConstantBuffer* @"$Globals", !"$Globals", i32 0, i32 -1, i32 1, i32 0, null}
104+
!20 = !{void (%struct.PSOut*)* @frag_main, i32 0, i1 false}
105+
!21 = !{i32 144}
106+
!22 = !{i32 -1}
107+
!23 = !DILocation(line: 10, column: 12, scope: !24, inlinedAt: !27)
108+
!24 = !DISubprogram(name: "inner", scope: !25, file: !25, line: 9, type: !26, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: false)
109+
!25 = !DIFile(filename: "float.hlsl", directory: "")
110+
!26 = !DISubroutineType(types: !12)
111+
!27 = distinct !DILocation(line: 20, column: 26, scope: !28)
112+
!28 = !DISubprogram(name: "frag_main", scope: !25, file: !25, line: 18, type: !26, isLocal: false, isDefinition: true, scopeLine: 18, flags: DIFlagPrototyped, isOptimized: false, function: void (%struct.PSOut*)* @frag_main)
113+
!29 = !{!30, !30, i64 0}
114+
!30 = !{!"int", !31, i64 0}
115+
!31 = !{!"omnipotent char", !32, i64 0}
116+
!32 = !{!"Simple C/C++ TBAA"}
117+
!33 = !DILocation(line: 10, column: 8, scope: !24, inlinedAt: !27)
118+
!34 = !DILocation(line: 10, column: 19, scope: !24, inlinedAt: !27)
119+
!35 = !DILocation(line: 10, column: 21, scope: !24, inlinedAt: !27)
120+
!36 = !DILocation(line: 10, column: 3, scope: !24, inlinedAt: !27)
121+
!37 = !DILocation(line: 11, column: 16, scope: !24, inlinedAt: !27)
122+
!38 = !DILocation(line: 11, column: 14, scope: !24, inlinedAt: !27)
123+
!39 = !{!40, !40, i64 0}
124+
!40 = !{!"float", !31, i64 0}
125+
!41 = !DILocation(line: 11, column: 9, scope: !24, inlinedAt: !27)
126+
!42 = !DILocation(line: 11, column: 5, scope: !24, inlinedAt: !27)
127+
!43 = !DILocation(line: 11, column: 12, scope: !24, inlinedAt: !27)
128+
!44 = !DILocation(line: 10, column: 28, scope: !24, inlinedAt: !27)
129+
!45 = !DILocation(line: 13, column: 20, scope: !24, inlinedAt: !27)
130+
!46 = !DILocation(line: 14, column: 7, scope: !24, inlinedAt: !27)
131+
!47 = !DILocation(line: 20, column: 18, scope: !28)
132+
!48 = !DILocation(line: 20, column: 24, scope: !28)
133+
!49 = !{!31, !31, i64 0}
134+
!50 = !DILocation(line: 21, column: 10, scope: !28)
135+
!51 = !DILocation(line: 21, column: 3, scope: !28)
Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
; RUN: %dxopt %s -hlsl-passes-resume -scalarrepl-param-hlsl -S | FileCheck %s
2+
3+
4+
; The pass replaces a memcpy from a zero-initialized global that does not have an intervening store.
5+
; When tracing through geps and bitcasts of uses of that global, the algorithm might
6+
; bottom out at replacing a load of a scalar integer. Verify this works.
7+
8+
; In the following code, %2 should be replaced by i32 0
9+
;   %2 = load i32, i32* %src_in_g,...
10+
; It only has one use: being stored to one of the elements of @g_1
11+
12+
; CHECK: for.body.i:
13+
; CHECK: [[DEST:%[a-z0-9\.]+]] = getelementptr inbounds [10 x i32], [10 x i32]* @g_1, i32 0
14+
; CHECK: store i32 0, i32* [[DEST]]
15+
; CHECK: end.block:
16+
17+
18+
target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
19+
target triple = "dxil-ms-dx"
20+
21+
%struct.ByteAddressBuffer = type { i32 }
22+
%ConstantBuffer = type opaque
23+
%struct.PSOut = type { <4 x float> }
24+
25+
@"\01?g_2@@3UByteAddressBuffer@@A" = external global %struct.ByteAddressBuffer, align 4
26+
@g = internal global [10 x i32] zeroinitializer, align 4
27+
@g_1 = internal global [10 x i32] zeroinitializer, align 4
28+
@"$Globals" = external constant %ConstantBuffer
29+
30+
; Function Attrs: nounwind
31+
define void @frag_main(%struct.PSOut* noalias sret %agg.result) #0 {
32+
entry:
33+
%i.i = alloca i32, align 4
34+
%copy.i = alloca [10 x i32], align 4
35+
%wrapper_result = alloca %struct.PSOut, align 4
36+
store i32 0, i32* %i.i, align 4, !dbg !23, !tbaa !29 ; line:10 col:12
37+
br label %for.cond.i, !dbg !33 ; line:10 col:8
38+
39+
for.cond.i: ; preds = %for.body.i, %entry
40+
%0 = load i32, i32* %i.i, align 4, !dbg !34, !tbaa !29 ; line:10 col:19
41+
%cmp.i = icmp slt i32 %0, 10, !dbg !35 ; line:10 col:21
42+
br i1 %cmp.i, label %for.body.i, label %end.block, !dbg !36 ; line:10 col:3
43+
44+
for.body.i: ; preds = %for.cond.i
45+
%1 = load i32, i32* %i.i, align 4, !dbg !37, !tbaa !29 ; line:11 col:16
46+
%src_in_g = getelementptr inbounds [10 x i32], [10 x i32]* @g, i32 0, i32 %1, !dbg !38 ; line:11 col:14
47+
%2 = load i32, i32* %src_in_g, align 4, !dbg !38, !tbaa !29 ; line:11 col:14
48+
%3 = load i32, i32* %i.i, align 4, !dbg !39, !tbaa !29 ; line:11 col:9
49+
%dest = getelementptr inbounds [10 x i32], [10 x i32]* @g_1, i32 0, i32 %3, !dbg !40 ; line:11 col:5
50+
store i32 %2, i32* %dest, align 4, !dbg !41, !tbaa !29 ; line:11 col:12
51+
%4 = load i32, i32* %i.i, align 4, !dbg !42, !tbaa !29 ; line:10 col:28
52+
%inc.i = add nsw i32 %4, 1, !dbg !42 ; line:10 col:28
53+
store i32 %inc.i, i32* %i.i, align 4, !dbg !42, !tbaa !29 ; line:10 col:28
54+
br label %for.cond.i, !dbg !36 ; line:10 col:3
55+
56+
end.block: ; preds = %for.cond.i
57+
%5 = bitcast [10 x i32]* %copy.i to i8*, !dbg !43 ; line:13 col:18
58+
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %5, i8* bitcast ([10 x i32]* @g to i8*), i64 40, i32 1, i1 false) #0, !dbg !43 ; line:13 col:18
59+
%6 = bitcast [10 x i32]* %copy.i to i8*, !dbg !44 ; line:14 col:7
60+
call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast ([10 x i32]* @g to i8*), i8* %6, i64 40, i32 1, i1 false) #0, !dbg !44 ; line:14 col:7
61+
%value = getelementptr inbounds %struct.PSOut, %struct.PSOut* %wrapper_result, i32 0, i32 0, !dbg !45 ; line:20 col:18
62+
store <4 x float> zeroinitializer, <4 x float>* %value, align 4, !dbg !46, !tbaa !47 ; line:20 col:24
63+
%7 = bitcast %struct.PSOut* %agg.result to i8*, !dbg !48 ; line:21 col:10
64+
%8 = bitcast %struct.PSOut* %wrapper_result to i8*, !dbg !48 ; line:21 col:10
65+
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %7, i8* %8, i64 16, i32 1, i1 false), !dbg !48 ; line:21 col:10
66+
ret void, !dbg !49 ; line:21 col:3
67+
}
68+
69+
; Function Attrs: nounwind
70+
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #0
71+
72+
attributes #0 = { nounwind }
73+
74+
!llvm.module.flags = !{!0}
75+
!pauseresume = !{!1}
76+
!llvm.ident = !{!2}
77+
!dx.version = !{!3}
78+
!dx.valver = !{!4}
79+
!dx.shaderModel = !{!5}
80+
!dx.typeAnnotations = !{!6, !9}
81+
!dx.entryPoints = !{!14}
82+
!dx.fnprops = !{!20}
83+
!dx.options = !{!21, !22}
84+
85+
!0 = !{i32 2, !"Debug Info Version", i32 3}
86+
!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"}
87+
!2 = !{!"dxc(private) 1.8.0.14549 (main, 0781ded87-dirty)"}
88+
!3 = !{i32 1, i32 0}
89+
!4 = !{i32 1, i32 8}
90+
!5 = !{!"ps", i32 6, i32 0}
91+
!6 = !{i32 0, %struct.PSOut undef, !7}
92+
!7 = !{i32 16, !8}
93+
!8 = !{i32 6, !"value", i32 3, i32 0, i32 4, !"SV_Target0", i32 7, i32 9}
94+
!9 = !{i32 1, void (%struct.PSOut*)* @frag_main, !10}
95+
!10 = !{!11, !13}
96+
!11 = !{i32 0, !12, !12}
97+
!12 = !{}
98+
!13 = !{i32 1, !12, !12}
99+
!14 = !{void (%struct.PSOut*)* @frag_main, !"frag_main", null, !15, null}
100+
!15 = !{!16, null, !18, null}
101+
!16 = !{!17}
102+
!17 = !{i32 0, %struct.ByteAddressBuffer* @"\01?g_2@@3UByteAddressBuffer@@A", !"g_2", i32 0, i32 0, i32 1, i32 11, i32 0, null}
103+
!18 = !{!19}
104+
!19 = !{i32 0, %ConstantBuffer* @"$Globals", !"$Globals", i32 0, i32 -1, i32 1, i32 0, null}
105+
!20 = !{void (%struct.PSOut*)* @frag_main, i32 0, i1 false}
106+
!21 = !{i32 144}
107+
!22 = !{i32 -1}
108+
!23 = !DILocation(line: 10, column: 12, scope: !24, inlinedAt: !27)
109+
!24 = !DISubprogram(name: "inner", scope: !25, file: !25, line: 9, type: !26, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: false)
110+
!25 = !DIFile(filename: "int.hlsl", directory: "")
111+
!26 = !DISubroutineType(types: !12)
112+
!27 = distinct !DILocation(line: 20, column: 26, scope: !28)
113+
!28 = !DISubprogram(name: "frag_main", scope: !25, file: !25, line: 18, type: !26, isLocal: false, isDefinition: true, scopeLine: 18, flags: DIFlagPrototyped, isOptimized: false, function: void (%struct.PSOut*)* @frag_main)
114+
!29 = !{!30, !30, i64 0}
115+
!30 = !{!"int", !31, i64 0}
116+
!31 = !{!"omnipotent char", !32, i64 0}
117+
!32 = !{!"Simple C/C++ TBAA"}
118+
!33 = !DILocation(line: 10, column: 8, scope: !24, inlinedAt: !27)
119+
!34 = !DILocation(line: 10, column: 19, scope: !24, inlinedAt: !27)
120+
!35 = !DILocation(line: 10, column: 21, scope: !24, inlinedAt: !27)
121+
!36 = !DILocation(line: 10, column: 3, scope: !24, inlinedAt: !27)
122+
!37 = !DILocation(line: 11, column: 16, scope: !24, inlinedAt: !27)
123+
!38 = !DILocation(line: 11, column: 14, scope: !24, inlinedAt: !27)
124+
!39 = !DILocation(line: 11, column: 9, scope: !24, inlinedAt: !27)
125+
!40 = !DILocation(line: 11, column: 5, scope: !24, inlinedAt: !27)
126+
!41 = !DILocation(line: 11, column: 12, scope: !24, inlinedAt: !27)
127+
!42 = !DILocation(line: 10, column: 28, scope: !24, inlinedAt: !27)
128+
!43 = !DILocation(line: 13, column: 18, scope: !24, inlinedAt: !27)
129+
!44 = !DILocation(line: 14, column: 7, scope: !24, inlinedAt: !27)
130+
!45 = !DILocation(line: 20, column: 18, scope: !28)
131+
!46 = !DILocation(line: 20, column: 24, scope: !28)
132+
!47 = !{!31, !31, i64 0}
133+
!48 = !DILocation(line: 21, column: 10, scope: !28)
134+
!49 = !DILocation(line: 21, column: 3, scope: !28)

0 commit comments

Comments
 (0)