Skip to content

Commit 2697c8c

Browse files
authored
[LowerMemIntrinsics] Factor control flow generation out of the memcpy lowering (#169039)
So far, memcpy with known size, memcpy with unknown size, memmove with known size, and memmove with unknown size have individual optimized loop lowering implementations, while memset and memset.pattern use an unoptimized loop lowering.

This patch extracts the parts of the memcpy lowerings (for known and unknown sizes) that generate the control flow for the loop expansion into an `insertLoopExpansion` function. The `createMemCpyLoop(Unk|K)nownSize` functions then only collect the necessary arguments for `insertLoopExpansion`, call it, and fill the generated loop basic blocks.

The immediate benefit of this is that logic from the two memcpy lowerings is deduplicated. Moreover, it enables follow-up patches that will use `insertLoopExpansion` to optimize the memset and memset.pattern implementations similarly to memcpy, since they can use the exact same control flow patterns.

The test changes are due to more consistent and useful basic block names in the loop expansion and an improvement in basic block ordering: previously, the basic block that determines if the residual loop is executed would be put at the end of the function; now it is put before the residual loop body. Otherwise, the generated code should be equivalent.

This patch doesn't affect memmove; deduplicating its logic would also be nice, but to extract all control-flow (CF) generation from the memmove lowering, `insertLoopExpansion` would need to be able to also create code that iterates backwards over the argument buffers. That would make `insertLoopExpansion` a lot more complex for a code path that's only used for memmove, so it's probably not worth refactoring.

For SWDEV-543208.
1 parent 8feb676 commit 2697c8c

File tree

10 files changed

+733
-656
lines changed

10 files changed

+733
-656
lines changed

llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp

Lines changed: 310 additions & 208 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ define amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src)
1212
; LOOP-NEXT: s_mov_b32 s3, 0xf000
1313
; LOOP-NEXT: v_mov_b32_e32 v5, s1
1414
; LOOP-NEXT: v_mov_b32_e32 v4, s0
15-
; LOOP-NEXT: .LBB0_1: ; %load-store-loop
15+
; LOOP-NEXT: .LBB0_1: ; %static-memcpy-expansion-main-body
1616
; LOOP-NEXT: ; =>This Inner Loop Header: Depth=1
1717
; LOOP-NEXT: v_add_i32_e32 v6, vcc, v2, v4
1818
; LOOP-NEXT: v_addc_u32_e32 v7, vcc, v3, v5, vcc
@@ -177,7 +177,7 @@ define amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src)
177177
; LOOP-NEXT: buffer_store_byte v30, v[6:7], s[0:3], 0 addr64 offset:30
178178
; LOOP-NEXT: buffer_store_byte v13, v[6:7], s[0:3], 0 addr64 offset:31
179179
; LOOP-NEXT: s_cbranch_vccnz .LBB0_1
180-
; LOOP-NEXT: ; %bb.2: ; %memcpy-split
180+
; LOOP-NEXT: ; %bb.2: ; %static-memcpy-post-expansion
181181
; LOOP-NEXT: s_mov_b32 s2, 0
182182
; LOOP-NEXT: s_mov_b32 s3, 0xf000
183183
; LOOP-NEXT: s_mov_b64 s[0:1], 0

llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,7 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
255255
; SDAG-GFX942-NEXT: s_mov_b32 s17, s10
256256
; SDAG-GFX942-NEXT: s_mov_b32 s2, s9
257257
; SDAG-GFX942-NEXT: s_or_b64 s[12:13], s[2:3], s[16:17]
258-
; SDAG-GFX942-NEXT: .LBB0_1: ; %load-store-loop
258+
; SDAG-GFX942-NEXT: .LBB0_1: ; %static-memcpy-expansion-main-body
259259
; SDAG-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
260260
; SDAG-GFX942-NEXT: s_add_i32 s1, s0, s16
261261
; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s1
@@ -312,7 +312,7 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
312312
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
313313
; SDAG-GFX942-NEXT: buffer_store_dwordx4 a[0:3], v0, s[12:15], 0 offen offset:240
314314
; SDAG-GFX942-NEXT: s_cbranch_scc1 .LBB0_1
315-
; SDAG-GFX942-NEXT: ; %bb.2: ; %memcpy-split
315+
; SDAG-GFX942-NEXT: ; %bb.2: ; %static-memcpy-post-expansion
316316
; SDAG-GFX942-NEXT: s_endpgm
317317
;
318318
; SDAG-GFX1100-LABEL: memcpy_known:
@@ -341,7 +341,7 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
341341
; SDAG-GFX1100-NEXT: s_mov_b32 s3, s16
342342
; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
343343
; SDAG-GFX1100-NEXT: s_or_b64 s[12:13], s[2:3], s[16:17]
344-
; SDAG-GFX1100-NEXT: .LBB0_1: ; %load-store-loop
344+
; SDAG-GFX1100-NEXT: .LBB0_1: ; %static-memcpy-expansion-main-body
345345
; SDAG-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1
346346
; SDAG-GFX1100-NEXT: s_add_i32 s1, s0, s16
347347
; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -400,7 +400,7 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
400400
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0)
401401
; SDAG-GFX1100-NEXT: buffer_store_b128 v[60:63], v64, s[12:15], 0 offen offset:240
402402
; SDAG-GFX1100-NEXT: s_cbranch_scc1 .LBB0_1
403-
; SDAG-GFX1100-NEXT: ; %bb.2: ; %memcpy-split
403+
; SDAG-GFX1100-NEXT: ; %bb.2: ; %static-memcpy-post-expansion
404404
; SDAG-GFX1100-NEXT: s_endpgm
405405
;
406406
; GISEL-GFX942-LABEL: memcpy_known:
@@ -419,7 +419,7 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
419419
; GISEL-GFX942-NEXT: s_mov_b32 s5, s14
420420
; GISEL-GFX942-NEXT: s_mov_b32 s6, s15
421421
; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, s16
422-
; GISEL-GFX942-NEXT: .LBB0_1: ; %load-store-loop
422+
; GISEL-GFX942-NEXT: .LBB0_1: ; %static-memcpy-expansion-main-body
423423
; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
424424
; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s0, v1
425425
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v62, s[8:11], 0 offen
@@ -477,7 +477,7 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
477477
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
478478
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240
479479
; GISEL-GFX942-NEXT: s_cbranch_vccnz .LBB0_1
480-
; GISEL-GFX942-NEXT: ; %bb.2: ; %memcpy-split
480+
; GISEL-GFX942-NEXT: ; %bb.2: ; %static-memcpy-post-expansion
481481
; GISEL-GFX942-NEXT: s_endpgm
482482
;
483483
; GISEL-GFX1100-LABEL: memcpy_known:
@@ -497,7 +497,7 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
497497
; GISEL-GFX1100-NEXT: s_mov_b32 s12, s9
498498
; GISEL-GFX1100-NEXT: s_mov_b32 s13, s10
499499
; GISEL-GFX1100-NEXT: s_mov_b32 s14, s11
500-
; GISEL-GFX1100-NEXT: .LBB0_1: ; %load-store-loop
500+
; GISEL-GFX1100-NEXT: .LBB0_1: ; %static-memcpy-expansion-main-body
501501
; GISEL-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1
502502
; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v61, s0, v0
503503
; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v65, s8, v0
@@ -553,7 +553,7 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
553553
; GISEL-GFX1100-NEXT: buffer_store_b128 v[61:64], v65, s[12:15], 0 offen offset:240
554554
; GISEL-GFX1100-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x2000, v0
555555
; GISEL-GFX1100-NEXT: s_cbranch_vccnz .LBB0_1
556-
; GISEL-GFX1100-NEXT: ; %bb.2: ; %memcpy-split
556+
; GISEL-GFX1100-NEXT: ; %bb.2: ; %static-memcpy-post-expansion
557557
; GISEL-GFX1100-NEXT: s_endpgm
558558
call void @llvm.memcpy.p7.p7.i32(ptr addrspace(7) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 8192, i1 false)
559559
ret void
@@ -787,7 +787,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
787787
; SDAG-GFX942-NEXT: s_mov_b32 s17, s10
788788
; SDAG-GFX942-NEXT: s_mov_b32 s2, s9
789789
; SDAG-GFX942-NEXT: s_or_b64 s[12:13], s[2:3], s[16:17]
790-
; SDAG-GFX942-NEXT: .LBB1_1: ; %load-store-loop
790+
; SDAG-GFX942-NEXT: .LBB1_1: ; %static-memcpy-expansion-main-body
791791
; SDAG-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
792792
; SDAG-GFX942-NEXT: s_add_i32 s1, s0, s16
793793
; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s1
@@ -844,7 +844,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
844844
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
845845
; SDAG-GFX942-NEXT: buffer_store_dwordx4 a[0:3], v0, s[12:15], 0 offen offset:240
846846
; SDAG-GFX942-NEXT: s_cbranch_scc1 .LBB1_1
847-
; SDAG-GFX942-NEXT: ; %bb.2: ; %memcpy-split
847+
; SDAG-GFX942-NEXT: ; %bb.2: ; %static-memcpy-post-expansion
848848
; SDAG-GFX942-NEXT: s_endpgm
849849
;
850850
; SDAG-GFX1100-LABEL: memcpy_known_medium:
@@ -873,7 +873,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
873873
; SDAG-GFX1100-NEXT: s_mov_b32 s3, s16
874874
; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
875875
; SDAG-GFX1100-NEXT: s_or_b64 s[12:13], s[2:3], s[16:17]
876-
; SDAG-GFX1100-NEXT: .LBB1_1: ; %load-store-loop
876+
; SDAG-GFX1100-NEXT: .LBB1_1: ; %static-memcpy-expansion-main-body
877877
; SDAG-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1
878878
; SDAG-GFX1100-NEXT: s_add_i32 s1, s0, s16
879879
; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -932,7 +932,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
932932
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0)
933933
; SDAG-GFX1100-NEXT: buffer_store_b128 v[60:63], v64, s[12:15], 0 offen offset:240
934934
; SDAG-GFX1100-NEXT: s_cbranch_scc1 .LBB1_1
935-
; SDAG-GFX1100-NEXT: ; %bb.2: ; %memcpy-split
935+
; SDAG-GFX1100-NEXT: ; %bb.2: ; %static-memcpy-post-expansion
936936
; SDAG-GFX1100-NEXT: s_endpgm
937937
;
938938
; GISEL-GFX942-LABEL: memcpy_known_medium:
@@ -951,7 +951,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
951951
; GISEL-GFX942-NEXT: s_mov_b32 s5, s14
952952
; GISEL-GFX942-NEXT: s_mov_b32 s6, s15
953953
; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, s16
954-
; GISEL-GFX942-NEXT: .LBB1_1: ; %load-store-loop
954+
; GISEL-GFX942-NEXT: .LBB1_1: ; %static-memcpy-expansion-main-body
955955
; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
956956
; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s0, v1
957957
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v62, s[8:11], 0 offen
@@ -1009,7 +1009,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
10091009
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
10101010
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240
10111011
; GISEL-GFX942-NEXT: s_cbranch_vccnz .LBB1_1
1012-
; GISEL-GFX942-NEXT: ; %bb.2: ; %memcpy-split
1012+
; GISEL-GFX942-NEXT: ; %bb.2: ; %static-memcpy-post-expansion
10131013
; GISEL-GFX942-NEXT: s_endpgm
10141014
;
10151015
; GISEL-GFX1100-LABEL: memcpy_known_medium:
@@ -1029,7 +1029,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
10291029
; GISEL-GFX1100-NEXT: s_mov_b32 s12, s9
10301030
; GISEL-GFX1100-NEXT: s_mov_b32 s13, s10
10311031
; GISEL-GFX1100-NEXT: s_mov_b32 s14, s11
1032-
; GISEL-GFX1100-NEXT: .LBB1_1: ; %load-store-loop
1032+
; GISEL-GFX1100-NEXT: .LBB1_1: ; %static-memcpy-expansion-main-body
10331033
; GISEL-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1
10341034
; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v61, s0, v0
10351035
; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v65, s8, v0
@@ -1085,7 +1085,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
10851085
; GISEL-GFX1100-NEXT: buffer_store_b128 v[61:64], v65, s[12:15], 0 offen offset:240
10861086
; GISEL-GFX1100-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x100, v0
10871087
; GISEL-GFX1100-NEXT: s_cbranch_vccnz .LBB1_1
1088-
; GISEL-GFX1100-NEXT: ; %bb.2: ; %memcpy-split
1088+
; GISEL-GFX1100-NEXT: ; %bb.2: ; %static-memcpy-post-expansion
10891089
; GISEL-GFX1100-NEXT: s_endpgm
10901090
call void @llvm.memcpy.p7.p7.i32(ptr addrspace(7) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 256, i1 false)
10911091
ret void

0 commit comments

Comments
 (0)