Skip to content

Commit 9107715

Browse files
committed
[AMDGPU] Change default loop alignment
Align small loops aggressively to 32 bytes and larger loops to 16 bytes
1 parent 3134e69 commit 9107715

File tree

153 files changed

+8882
-6465
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

153 files changed

+8882
-6465
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 38 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -17501,26 +17501,18 @@ Align SITargetLowering::computeKnownAlignForTargetInstr(
1750117501
Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
1750217502
const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
1750317503
const Align CacheLineAlign = Align(64);
17504-
17505-
// Pre-GFX10 target did not benefit from loop alignment
17506-
if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
17507-
getSubtarget()->hasInstFwdPrefetchBug())
17504+
if (!ML || DisableLoopAlignment)
1750817505
return PrefAlign;
17509-
17510-
// On GFX10 I$ is 4 x 64 bytes cache lines.
17511-
// By default prefetcher keeps one cache line behind and reads two ahead.
17512-
// We can modify it with S_INST_PREFETCH for larger loops to have two lines
17513-
// behind and one ahead.
17514-
// Therefore we can benefit from aligning loop headers if loop fits 192 bytes.
17515-
// If loop fits 64 bytes it always spans no more than two cache lines and
17516-
// does not need an alignment.
17517-
// Else if loop is less or equal 128 bytes we do not need to modify prefetch,
17518-
// Else if loop is less or equal 192 bytes we need two lines behind.
17519-
1752017506
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
1752117507
const MachineBasicBlock *Header = ML->getHeader();
1752217508
if (Header->getAlignment() != PrefAlign)
1752317509
return Header->getAlignment(); // Already processed.
17510+
const MachineFunction *MF = Header->getParent();
17511+
const Function &Fn = MF->getFunction();
17512+
for (auto &BB : Fn)
17513+
for (auto &I : BB)
17514+
if (isa<llvm::UnreachableInst>(&I))
17515+
return PrefAlign;
1752417516

1752517517
unsigned LoopSize = 0;
1752617518
for (const MachineBasicBlock *MBB : ML->blocks()) {
@@ -17531,13 +17523,41 @@ Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
1753117523

1753217524
for (const MachineInstr &MI : *MBB) {
1753317525
LoopSize += TII->getInstSizeInBytes(MI);
17534-
if (LoopSize > 192)
17535-
return PrefAlign;
1753617526
}
17527+
if (LoopSize > 192)
17528+
break;
17529+
}
17530+
17531+
if (!getSubtarget()->hasInstPrefetch() ||
17532+
getSubtarget()->hasInstFwdPrefetchBug()) {
17533+
// Align loops <= 32 bytes aggressively
17534+
if (LoopSize <= 32)
17535+
return Align(32);
17536+
// Align larger loops less aggressively
17537+
if (!ML->isInnermost())
17538+
return PrefAlign;
17539+
return Align(16);
17540+
}
17541+
17542+
// On GFX10 I$ is 4 x 64 bytes cache lines.
17543+
// By default prefetcher keeps one cache line behind and reads two ahead.
17544+
// We can modify it with S_INST_PREFETCH for larger loops to have two lines
17545+
// behind and one ahead.
17546+
// Therefore we can benefit from aligning loop headers if loop fits 192 bytes.
17547+
// If loop fits 64 bytes it always spans no more than two cache lines and
17548+
// does not need an alignment driven by prefetch considerations.
17549+
// Else if loop is less or equal 128 bytes we do not need to modify prefetch,
17550+
// Else if loop is less or equal 192 bytes we need two lines behind.
17551+
17552+
// Align larger loops less aggressively
17553+
if (LoopSize > 192) {
17554+
if (!ML->isInnermost())
17555+
return PrefAlign;
17556+
return Align(16);
1753717557
}
1753817558

1753917559
if (LoopSize <= 64)
17540-
return PrefAlign;
17560+
return Align(32);
1754117561

1754217562
if (LoopSize <= 128)
1754317563
return CacheLineAlign;

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll

Lines changed: 48 additions & 0 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll

Lines changed: 48 additions & 0 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ define void @divergent_i1_phi_used_inside_loop(float %val, ptr %addr) {
105105
; GFX10-NEXT: s_mov_b32 s5, 1
106106
; GFX10-NEXT: s_mov_b32 s6, 0
107107
; GFX10-NEXT: ; implicit-def: $sgpr7
108+
; GFX10-NEXT: .p2align
108109
; GFX10-NEXT: .LBB2_1: ; %loop
109110
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
110111
; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s6
@@ -154,6 +155,7 @@ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, floa
154155
; GFX10-NEXT: s_mov_b32 s6, 0
155156
; GFX10-NEXT: ; implicit-def: $sgpr7
156157
; GFX10-NEXT: s_branch .LBB3_2
158+
; GFX10-NEXT: .p2align
157159
; GFX10-NEXT: .LBB3_1: ; %loop_body
158160
; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1
159161
; GFX10-NEXT: v_cvt_f32_u32_e32 v8, s6
@@ -247,6 +249,7 @@ define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3
247249
; GFX10-NEXT: s_cbranch_vccnz .LBB4_4
248250
; GFX10-NEXT: ; %bb.1: ; %.preheader.preheader
249251
; GFX10-NEXT: s_mov_b32 s2, 0
252+
; GFX10-NEXT: .p2align
250253
; GFX10-NEXT: .LBB4_2: ; %.preheader
251254
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
252255
; GFX10-NEXT: v_mov_b32_e32 v3, s12

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ define void @divergent_i1_phi_used_outside_loop(float %val, float %pre.cond.val,
1717
; GFX10-NEXT: s_mov_b32 s4, 0
1818
; GFX10-NEXT: s_mov_b32 s6, 0
1919
; GFX10-NEXT: ; implicit-def: $sgpr7
20+
; GFX10-NEXT: .p2align
2021
; GFX10-NEXT: .LBB0_1: ; %loop
2122
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
2223
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s6
@@ -66,6 +67,7 @@ define void @divergent_i1_phi_used_outside_loop_larger_loop_body(float %val, ptr
6667
; GFX10-NEXT: s_or_b32 s7, s5, s6
6768
; GFX10-NEXT: ; implicit-def: $sgpr5
6869
; GFX10-NEXT: s_branch .LBB1_2
70+
; GFX10-NEXT: .p2align
6971
; GFX10-NEXT: .LBB1_1: ; %loop.cond
7072
; GFX10-NEXT: ; in Loop: Header=BB1_2 Depth=1
7173
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7
@@ -137,6 +139,7 @@ define void @divergent_i1_xor_used_outside_loop(float %val, float %pre.cond.val,
137139
; GFX10-NEXT: s_mov_b32 s4, 0
138140
; GFX10-NEXT: s_mov_b32 s6, 0
139141
; GFX10-NEXT: ; implicit-def: $sgpr7
142+
; GFX10-NEXT: .p2align
140143
; GFX10-NEXT: .LBB2_1: ; %loop
141144
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
142145
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s6
@@ -183,6 +186,7 @@ define void @divergent_i1_xor_used_outside_loop_twice(float %val, float %pre.con
183186
; GFX10-NEXT: s_mov_b32 s4, 0
184187
; GFX10-NEXT: s_mov_b32 s7, 0
185188
; GFX10-NEXT: ; implicit-def: $sgpr6
189+
; GFX10-NEXT: .p2align
186190
; GFX10-NEXT: .LBB3_1: ; %loop
187191
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
188192
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s7
@@ -249,6 +253,7 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
249253
; GFX10-NEXT: ; implicit-def: $sgpr11
250254
; GFX10-NEXT: ; implicit-def: $sgpr9
251255
; GFX10-NEXT: s_branch .LBB4_3
256+
; GFX10-NEXT: .p2align
252257
; GFX10-NEXT: .LBB4_2: ; %Flow
253258
; GFX10-NEXT: ; in Loop: Header=BB4_3 Depth=1
254259
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
@@ -349,6 +354,7 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
349354
; GFX10-NEXT: s_mov_b32 s4, 0
350355
; GFX10-NEXT: ; implicit-def: $sgpr7
351356
; GFX10-NEXT: s_branch .LBB5_2
357+
; GFX10-NEXT: .p2align
352358
; GFX10-NEXT: .LBB5_1: ; %Flow
353359
; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
354360
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
@@ -462,6 +468,7 @@ define amdgpu_ps void @divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspa
462468
; GFX10-NEXT: ; implicit-def: $sgpr4
463469
; GFX10-NEXT: ; implicit-def: $sgpr3
464470
; GFX10-NEXT: s_branch .LBB6_2
471+
; GFX10-NEXT: .p2align
465472
; GFX10-NEXT: .LBB6_1: ; %loop.cond
466473
; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1
467474
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
@@ -540,6 +547,7 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a
540547
; GFX10-NEXT: ; implicit-def: $sgpr7
541548
; GFX10-NEXT: ; implicit-def: $sgpr5
542549
; GFX10-NEXT: s_branch .LBB7_2
550+
; GFX10-NEXT: .p2align
543551
; GFX10-NEXT: .LBB7_1: ; %Flow
544552
; GFX10-NEXT: ; in Loop: Header=BB7_2 Depth=1
545553
; GFX10-NEXT: s_waitcnt_depctr 0xffe3

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a
110110
; GFX10-NEXT: s_mov_b32 s0, 0
111111
; GFX10-NEXT: ; implicit-def: $sgpr5
112112
; GFX10-NEXT: s_branch .LBB2_2
113+
; GFX10-NEXT: .p2align
113114
; GFX10-NEXT: .LBB2_1: ; %Flow
114115
; GFX10-NEXT: ; in Loop: Header=BB2_2 Depth=1
115116
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -184,6 +185,7 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) %
184185
; GFX10-NEXT: s_mov_b32 s0, 0
185186
; GFX10-NEXT: ; implicit-def: $sgpr5
186187
; GFX10-NEXT: s_branch .LBB3_3
188+
; GFX10-NEXT: .p2align
187189
; GFX10-NEXT: .LBB3_1: ; %Flow3
188190
; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1
189191
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -282,6 +284,7 @@ define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) %
282284
; GFX10-NEXT: s_mov_b32 s0, 0
283285
; GFX10-NEXT: ; implicit-def: $sgpr5
284286
; GFX10-NEXT: s_branch .LBB4_4
287+
; GFX10-NEXT: .p2align
285288
; GFX10-NEXT: .LBB4_1: ; %Flow5
286289
; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1
287290
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -410,6 +413,7 @@ define amdgpu_cs void @loop_with_div_break_with_body(ptr addrspace(1) %x, ptr ad
410413
; GFX10-NEXT: ; implicit-def: $sgpr7
411414
; GFX10-NEXT: ; implicit-def: $sgpr5
412415
; GFX10-NEXT: s_branch .LBB5_2
416+
; GFX10-NEXT: .p2align
413417
; GFX10-NEXT: .LBB5_1: ; %Flow
414418
; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
415419
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -572,6 +576,7 @@ define amdgpu_ps i32 @irreducible_cfg(i32 %x, i32 %y, i32 %a0, i32 %a1, i32 %a2,
572576
; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1
573577
; GFX10-NEXT: v_cmp_le_i32_e64 s0, v4, v0
574578
; GFX10-NEXT: s_mov_b32 s4, 0
579+
; GFX10-NEXT: .p2align
575580
; GFX10-NEXT: .LBB6_6: ; %.inner_loop
576581
; GFX10-NEXT: ; Parent Loop BB6_2 Depth=1
577582
; GFX10-NEXT: ; => This Inner Loop Header: Depth=2

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ define void @temporal_divergent_i1_phi(float %val, ptr %addr) {
99
; GFX10-NEXT: s_mov_b32 s5, 1
1010
; GFX10-NEXT: s_mov_b32 s6, 0
1111
; GFX10-NEXT: ; implicit-def: $sgpr7
12+
; GFX10-NEXT: .p2align
1213
; GFX10-NEXT: .LBB0_1: ; %loop
1314
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
1415
; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s6
@@ -56,6 +57,7 @@ define void @temporal_divergent_i1_non_phi(float %val, ptr %addr) {
5657
; GFX10-NEXT: s_mov_b32 s5, 1
5758
; GFX10-NEXT: s_mov_b32 s6, 0
5859
; GFX10-NEXT: ; implicit-def: $sgpr7
60+
; GFX10-NEXT: .p2align
5961
; GFX10-NEXT: .LBB1_1: ; %loop
6062
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
6163
; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s6
@@ -106,6 +108,7 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, i32 %x.size, ptr ad
106108
; GFX10-NEXT: ; implicit-def: $sgpr10
107109
; GFX10-NEXT: ; implicit-def: $sgpr9
108110
; GFX10-NEXT: s_branch .LBB2_3
111+
; GFX10-NEXT: .p2align
109112
; GFX10-NEXT: .LBB2_1: ; %loop.body
110113
; GFX10-NEXT: ; in Loop: Header=BB2_3 Depth=1
111114
; GFX10-NEXT: v_mov_b32_e32 v4, s6
@@ -214,6 +217,7 @@ define void @nested_loops_temporal_divergence_inner(float %pre.cond.val, i32 %n.
214217
; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v2, v6
215218
; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v3, v7, vcc_lo
216219
; GFX10-NEXT: flat_load_dword v0, v[6:7]
220+
; GFX10-NEXT: .p2align
217221
; GFX10-NEXT: .LBB3_2: ; %InnerHeader
218222
; GFX10-NEXT: ; Parent Loop BB3_1 Depth=1
219223
; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
@@ -305,6 +309,7 @@ define void @nested_loops_temporal_divergence_outer(float %pre.cond.val, i32 %n.
305309
; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v2, v6
306310
; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v3, v7, vcc_lo
307311
; GFX10-NEXT: flat_load_dword v0, v[6:7]
312+
; GFX10-NEXT: .p2align
308313
; GFX10-NEXT: .LBB4_2: ; %InnerHeader
309314
; GFX10-NEXT: ; Parent Loop BB4_1 Depth=1
310315
; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
@@ -396,6 +401,7 @@ define void @nested_loops_temporal_divergence_both(float %pre.cond.val, i32 %n.i
396401
; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v2, v8
397402
; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v3, v9, vcc_lo
398403
; GFX10-NEXT: flat_load_dword v0, v[8:9]
404+
; GFX10-NEXT: .p2align
399405
; GFX10-NEXT: .LBB5_2: ; %InnerHeader
400406
; GFX10-NEXT: ; Parent Loop BB5_1 Depth=1
401407
; GFX10-NEXT: ; => This Inner Loop Header: Depth=2

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ define void @temporal_divergent_i32(float %val, ptr %addr) {
77
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
88
; GFX10-NEXT: s_mov_b32 s5, -1
99
; GFX10-NEXT: s_mov_b32 s4, 0
10+
; GFX10-NEXT: .p2align
1011
; GFX10-NEXT: .LBB0_1: ; %loop
1112
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
1213
; GFX10-NEXT: s_add_i32 s5, s5, 1
@@ -42,6 +43,7 @@ define void @temporal_divergent_i32_multiple_use(float %val, ptr %addr, ptr %add
4243
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4344
; GFX10-NEXT: s_mov_b32 s5, -1
4445
; GFX10-NEXT: s_mov_b32 s4, 0
46+
; GFX10-NEXT: .p2align
4547
; GFX10-NEXT: .LBB1_1: ; %loop
4648
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
4749
; GFX10-NEXT: s_add_i32 s5, s5, 1

llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,7 @@ define amdgpu_kernel void @break_loop(i32 %arg) {
205205
; CHECK-NEXT: v_subrev_u32_e32 v0, s0, v0
206206
; CHECK-NEXT: s_mov_b64 s[0:1], 0
207207
; CHECK-NEXT: s_branch .LBB5_2
208+
; CHECK-NEXT: .p2align
208209
; CHECK-NEXT: .LBB5_1: ; %Flow
209210
; CHECK-NEXT: ; in Loop: Header=BB5_2 Depth=1
210211
; CHECK-NEXT: s_and_b64 s[4:5], exec, s[2:3]

llvm/test/CodeGen/AMDGPU/GlobalISel/is-safe-to-sink-bug.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ define amdgpu_ps void @_amdgpu_ps_main(i1 %arg) {
3838
; CHECK-NEXT: s_mov_b32 s1, 0
3939
; CHECK-NEXT: v_mov_b32_e32 v0, s1
4040
; CHECK-NEXT: s_branch .LBB0_4
41-
; CHECK-NEXT: .p2align 6
41+
; CHECK-NOT: .p2align 6
4242
; CHECK-NEXT: .LBB0_3: ; %bb6
4343
; CHECK-NEXT: ; in Loop: Header=BB0_4 Depth=1
4444
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s3

0 commit comments

Comments
 (0)