@@ -158,3 +158,69 @@ entry:
158158 store <4 x bfloat> %val , ptr addrspace (1 ) %use
159159 ret void
160160}
161+
162+ ; This is a special case that does not require aligned VGPRs. Make
163+ ; sure no copies are required for the unaligned ABI return value.
; Returns the result of @llvm.amdgcn.ds.read.tr6.b96.v3i32.p3 as field 1 of a
; { i32, <3 x i32> } aggregate whose field 0 is the constant 0. In the checked
; output the constant is materialized in v0 and the three loaded dwords are
; returned in v1-v3, i.e. the vector starts at an odd-numbered VGPR.
;
; NOTE(review): the checked output for both check prefixes currently reads
; into v[2:4] and then moves the data down to v1-v3 with three v_mov_b32
; instructions, so the "no copies" goal stated for this test is not what the
; assertions record today -- confirm whether this is an intentional baseline.
164+ define { i32 , <3 x i32 > } @ds_read_b96_tr_b6_no_align2_requirement (ptr addrspace (3 ) %ptr ) {
165+ ; GFX950-SDAG-LABEL: ds_read_b96_tr_b6_no_align2_requirement:
166+ ; GFX950-SDAG: ; %bb.0:
167+ ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
168+ ; GFX950-SDAG-NEXT: ds_read_b96_tr_b6 v[2:4], v0 offset:32
169+ ; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0
170+ ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
171+ ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v2
172+ ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v3
173+ ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v4
174+ ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
175+ ;
176+ ; GFX950-GISEL-LABEL: ds_read_b96_tr_b6_no_align2_requirement:
177+ ; GFX950-GISEL: ; %bb.0:
178+ ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
179+ ; GFX950-GISEL-NEXT: ds_read_b96_tr_b6 v[2:4], v0 offset:32
180+ ; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
181+ ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
182+ ; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v2
183+ ; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, v3
184+ ; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, v4
185+ ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
; Index 4 of i64 elements from %ptr; the checked instruction carries this as
; the immediate "offset:32".
186+ %gep = getelementptr i64 , ptr addrspace (3 ) %ptr , i32 4
; The intrinsic under test, producing a <3 x i32> value.
187+ %val = call <3 x i32 > @llvm.amdgcn.ds.read.tr6.b96.v3i32.p3 (ptr addrspace (3 ) %gep )
; Build the return aggregate: a leading i32 0, then the loaded vector. The
; leading scalar is what shifts the vector off register 0 in the return value.
188+ %insert0 = insertvalue { i32 , <3 x i32 > } poison, i32 0 , 0
189+ %insert1 = insertvalue { i32 , <3 x i32 > } %insert0 , <3 x i32 > %val , 1
190+ ret { i32 , <3 x i32 > } %insert1
191+ }
192+
; Variant of the test above that consumes the @llvm.amdgcn.ds.read.tr6.b96
; result through inline asm operands pinned to the registers a1, a2 and a3
; instead of returning it. The checked output for both check prefixes reads
; into v[0:2] and then transfers each dword with v_accvgpr_write_b32 into
; a1-a3 before the inline asm.
193+ define void @ds_read_b96_tr_b6_no_align2_requirement_agpr (ptr addrspace (3 ) %ptr ) {
194+ ; GFX950-SDAG-LABEL: ds_read_b96_tr_b6_no_align2_requirement_agpr:
195+ ; GFX950-SDAG: ; %bb.0:
196+ ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
197+ ; GFX950-SDAG-NEXT: ds_read_b96_tr_b6 v[0:2], v0 offset:32
198+ ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
199+ ; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, v0
200+ ; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, v1
201+ ; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, v2
202+ ; GFX950-SDAG-NEXT: ;;#ASMSTART
203+ ; GFX950-SDAG-NEXT: ; use a1 a2 a3
204+ ; GFX950-SDAG-NEXT: ;;#ASMEND
205+ ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
206+ ;
207+ ; GFX950-GISEL-LABEL: ds_read_b96_tr_b6_no_align2_requirement_agpr:
208+ ; GFX950-GISEL: ; %bb.0:
209+ ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
210+ ; GFX950-GISEL-NEXT: ds_read_b96_tr_b6 v[0:2], v0 offset:32
211+ ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
212+ ; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, v0
213+ ; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, v1
214+ ; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, v2
215+ ; GFX950-GISEL-NEXT: ;;#ASMSTART
216+ ; GFX950-GISEL-NEXT: ; use a1 a2 a3
217+ ; GFX950-GISEL-NEXT: ;;#ASMEND
218+ ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
; Index 4 of i64 elements from %ptr; the checked instruction carries this as
; the immediate "offset:32".
219+ %gep = getelementptr i64 , ptr addrspace (3 ) %ptr , i32 4
; The intrinsic under test, producing a <3 x i32> value.
220+ %val = call <3 x i32 > @llvm.amdgcn.ds.read.tr6.b96.v3i32.p3 (ptr addrspace (3 ) %gep )
; Split the vector into scalars so each lane can be bound to its own
; physical-register constraint below.
221+ %val0 = extractelement <3 x i32 > %val , i32 0
222+ %val1 = extractelement <3 x i32 > %val , i32 1
223+ %val2 = extractelement <3 x i32 > %val , i32 2
; The constraints {a1},{a2},{a3} name a three-register run starting at an
; odd index; "sideeffect" keeps the otherwise result-less asm from being
; dropped.
224+ call void asm sideeffect "; use $0 $1 $2" , "{a1},{a2},{a3}" (i32 %val0 , i32 %val1 , i32 %val2 )
225+ ret void
226+ }
0 commit comments