Skip to content

Commit 9af4a85

Browse files
authored
AMDGPU: Add test which shows unnecessary register alignment (#158168)
The b96 tr loads are a special case that does not require even-aligned VGPRs.
1 parent ba3b3e3 commit 9af4a85

File tree

2 files changed

+120
-0
lines changed

2 files changed

+120
-0
lines changed

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,3 +158,69 @@ entry:
158158
store <4 x bfloat> %val, ptr addrspace(1) %use
159159
ret void
160160
}
161+
162+
; This is a special case that does not require aligned VGPRs. Make
163+
; sure no copies are required for the unaligned ABI return value.
; FIXME: The checks below still contain v_mov_b32 copies after the load.
164+
; Calls the ds.read.tr6.b96 intrinsic on %ptr advanced by four i64 elements
; and returns the <3 x i32> result as field 1 of the struct, with field 0 set
; to the constant 0. In the recorded output for both check prefixes the load
; writes v[2:4], v0 is set to 0, and three v_mov_b32 instructions copy
; v2, v3, v4 into v1, v2, v3 before the return.
define { i32, <3 x i32> } @ds_read_b96_tr_b6_no_align2_requirement(ptr addrspace(3) %ptr) {
165+
; GFX950-SDAG-LABEL: ds_read_b96_tr_b6_no_align2_requirement:
166+
; GFX950-SDAG: ; %bb.0:
167+
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
168+
; GFX950-SDAG-NEXT: ds_read_b96_tr_b6 v[2:4], v0 offset:32
169+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0
170+
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
171+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v2
172+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v3
173+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v4
174+
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
175+
;
176+
; GFX950-GISEL-LABEL: ds_read_b96_tr_b6_no_align2_requirement:
177+
; GFX950-GISEL: ; %bb.0:
178+
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
179+
; GFX950-GISEL-NEXT: ds_read_b96_tr_b6 v[2:4], v0 offset:32
180+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
181+
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
182+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v2
183+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, v3
184+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, v4
185+
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
186+
; Index 4 over an i64 element type; the expected load carries offset:32.
%gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
187+
%val = call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32.p3(ptr addrspace(3) %gep)
188+
; Build the return struct: field 0 is the constant 0, field 1 is the loaded
; vector.
%insert0 = insertvalue { i32, <3 x i32> } poison, i32 0, 0
189+
%insert1 = insertvalue { i32, <3 x i32> } %insert0, <3 x i32> %val, 1
190+
ret { i32, <3 x i32> } %insert1
191+
}
192+
193+
; Calls the ds.read.tr6.b96 intrinsic on %ptr advanced by four i64 elements,
; then passes the three i32 elements of the result to an inline asm statement
; whose operands are constrained to a1, a2 and a3. In the recorded output for
; both check prefixes the load writes v[0:2] and three v_accvgpr_write_b32
; instructions copy v0, v1, v2 into a1, a2, a3 ahead of the asm block.
define void @ds_read_b96_tr_b6_no_align2_requirement_agpr(ptr addrspace(3) %ptr) {
194+
; GFX950-SDAG-LABEL: ds_read_b96_tr_b6_no_align2_requirement_agpr:
195+
; GFX950-SDAG: ; %bb.0:
196+
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
197+
; GFX950-SDAG-NEXT: ds_read_b96_tr_b6 v[0:2], v0 offset:32
198+
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
199+
; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, v0
200+
; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, v1
201+
; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, v2
202+
; GFX950-SDAG-NEXT: ;;#ASMSTART
203+
; GFX950-SDAG-NEXT: ; use a1 a2 a3
204+
; GFX950-SDAG-NEXT: ;;#ASMEND
205+
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
206+
;
207+
; GFX950-GISEL-LABEL: ds_read_b96_tr_b6_no_align2_requirement_agpr:
208+
; GFX950-GISEL: ; %bb.0:
209+
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
210+
; GFX950-GISEL-NEXT: ds_read_b96_tr_b6 v[0:2], v0 offset:32
211+
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
212+
; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, v0
213+
; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, v1
214+
; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, v2
215+
; GFX950-GISEL-NEXT: ;;#ASMSTART
216+
; GFX950-GISEL-NEXT: ; use a1 a2 a3
217+
; GFX950-GISEL-NEXT: ;;#ASMEND
218+
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
219+
; Index 4 over an i64 element type; the expected load carries offset:32.
%gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
220+
%val = call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32.p3(ptr addrspace(3) %gep)
221+
; Split the vector into scalars so each one can be bound to its own asm
; operand.
%val0 = extractelement <3 x i32> %val, i32 0
222+
%val1 = extractelement <3 x i32> %val, i32 1
223+
%val2 = extractelement <3 x i32> %val, i32 2
224+
; The constraint string pins the three operands to a1, a2 and a3, a register
; range that starts at an odd index.
call void asm sideeffect "; use $0 $1 $2", "{a1},{a2},{a3}"(i32 %val0, i32 %val1, i32 %val2)
225+
ret void
226+
}

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,3 +320,57 @@ entry:
320320
store <8 x bfloat> %val, ptr addrspace(1) %use
321321
ret void
322322
}
323+
324+
; This is a special case that does not require aligned VGPRs. Make
325+
; sure no copies are required for the unaligned ABI return value.
; FIXME: The checks below still contain v_dual_mov_b32 copies after the load.
326+
; Calls the global.load.tr6.b96 intrinsic on %addr advanced by four i64
; elements and returns the <3 x i32> result as field 1 of the struct, with
; field 0 set to the constant 0. The recorded output addresses the load
; through v[0:1], writes v[2:4], and then uses two v_dual_mov_b32
; instructions to set v0 to 0 and copy v2, v3, v4 into v1, v2, v3.
; NOTE(review): %use is not referenced anywhere in the body.
define { i32, <3 x i32> } @global_load_tr6_b96_vaddr_no_align2_requirement(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
327+
; GFX1250-LABEL: global_load_tr6_b96_vaddr_no_align2_requirement:
328+
; GFX1250: ; %bb.0:
329+
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
330+
; GFX1250-NEXT: s_wait_kmcnt 0x0
331+
; GFX1250-NEXT: global_load_tr6_b96 v[2:4], v[0:1], off offset:32
332+
; GFX1250-NEXT: s_wait_loadcnt 0x0
333+
; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, v2
334+
; GFX1250-NEXT: v_dual_mov_b32 v2, v3 :: v_dual_mov_b32 v3, v4
335+
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
336+
; Index 4 over an i64 element type; the expected load carries offset:32.
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
337+
%val = call <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32.p1(ptr addrspace(1) %gep)
338+
; Build the return struct: field 0 is the constant 0, field 1 is the loaded
; vector.
%insert0 = insertvalue { i32, <3 x i32> } poison, i32 0, 0
339+
%insert1 = insertvalue { i32, <3 x i32> } %insert0, <3 x i32> %val, 1
340+
ret { i32, <3 x i32> } %insert1
341+
}
342+
343+
; Variant of the global.load.tr6.b96 test in which %addr carries the inreg
; attribute. The recorded output addresses the load through s[0:1] with v0
; first set to 0, writes v[2:4], and then uses two v_dual_mov_b32
; instructions to set v0 to 0 again and copy v2, v3, v4 into v1, v2, v3.
; NOTE(review): %use is not referenced anywhere in the body.
define { i32, <3 x i32> } @global_load_tr6_b96_saddr_no_align2_requirement(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
344+
; GFX1250-LABEL: global_load_tr6_b96_saddr_no_align2_requirement:
345+
; GFX1250: ; %bb.0:
346+
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
347+
; GFX1250-NEXT: s_wait_kmcnt 0x0
348+
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
349+
; GFX1250-NEXT: global_load_tr6_b96 v[2:4], v0, s[0:1] offset:32
350+
; GFX1250-NEXT: s_wait_loadcnt 0x0
351+
; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, v2
352+
; GFX1250-NEXT: v_dual_mov_b32 v2, v3 :: v_dual_mov_b32 v3, v4
353+
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
354+
; Index 4 over an i64 element type; the expected load carries offset:32.
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
355+
%val = call <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32.p1(ptr addrspace(1) %gep)
356+
; Build the return struct: field 0 is the constant 0, field 1 is the loaded
; vector.
%insert0 = insertvalue { i32, <3 x i32> } poison, i32 0, 0
357+
%insert1 = insertvalue { i32, <3 x i32> } %insert0, <3 x i32> %val, 1
358+
ret { i32, <3 x i32> } %insert1
359+
}
360+
361+
; Calls the ds.load.tr6.b96 intrinsic on the addrspace(3) pointer %addr
; advanced by four i64 elements and returns the <3 x i32> result as field 1 of
; the struct, with field 0 set to the constant 0. The recorded output writes
; v[2:4] from the load and then uses two v_dual_mov_b32 instructions to set v0
; to 0 and copy v2, v3, v4 into v1, v2, v3.
; NOTE(review): %use is not referenced anywhere in the body.
define { i32, <3 x i32> } @ds_load_tr6_b96_no_align2_requirement(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
362+
; GFX1250-LABEL: ds_load_tr6_b96_no_align2_requirement:
363+
; GFX1250: ; %bb.0:
364+
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
365+
; GFX1250-NEXT: s_wait_kmcnt 0x0
366+
; GFX1250-NEXT: ds_load_tr6_b96 v[2:4], v0 offset:32
367+
; GFX1250-NEXT: s_wait_dscnt 0x0
368+
; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, v2
369+
; GFX1250-NEXT: v_dual_mov_b32 v2, v3 :: v_dual_mov_b32 v3, v4
370+
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
371+
; Index 4 over an i64 element type; the expected load carries offset:32.
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
372+
%val = call <3 x i32> @llvm.amdgcn.ds.load.tr6.b96.v3i32.p3(ptr addrspace(3) %gep)
373+
; Build the return struct: field 0 is the constant 0, field 1 is the loaded
; vector.
%insert0 = insertvalue { i32, <3 x i32> } poison, i32 0, 0
374+
%insert1 = insertvalue { i32, <3 x i32> } %insert0, <3 x i32> %val, 1
375+
ret { i32, <3 x i32> } %insert1
376+
}

0 commit comments

Comments
 (0)