Skip to content

Commit b7714b7

Browse files
authored
[AMDGPU][GFX13] Add Soffset to loadMBUFScalarOperandsFromVGPR (#3359)
GFX13: The Soffset operand of buffer_load_format is not loaded from a VGPR. In the example below, s8 is invalid; v8 should first be loaded into an SGPR:
```
global_load_b32 v8, v[0:1], off offset:16 ;
global_load_b128 v[4:7], v[0:1], off ;
v_readfirstlane_b32 s4, v4 ; 002708: 7e080504
v_readfirstlane_b32 s5, v5 ; 00270c: 7e0a0505
v_readfirstlane_b32 s6, v6 ; 002710: 7e0c0506
v_readfirstlane_b32 s7, v7 ; 002714: 7e0e0507
buffer_load_format_x v9, v0, s[4:7], s8 idxen ; 002734: c4000008 80800809 00000000
```
1 parent 5978c08 commit b7714b7

File tree

2 files changed

+63
-13
lines changed

2 files changed

+63
-13
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7310,6 +7310,17 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
73107310
return CreatedBB;
73117311
}
73127312

7313+
bool isSoffsetLegal = true;
7314+
int SoffsetIdx =
7315+
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
7316+
if (SoffsetIdx != -1) {
7317+
MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
7318+
if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7319+
!RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
7320+
isSoffsetLegal = false;
7321+
}
7322+
}
7323+
73137324
// Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
73147325
//
73157326
// Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
@@ -7321,8 +7332,15 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
73217332
? AMDGPU::OpName::rsrc
73227333
: AMDGPU::OpName::srsrc;
73237334
MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
7324-
if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
7325-
CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
7335+
if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
7336+
if (isSoffsetLegal) {
7337+
CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
7338+
} else {
7339+
MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7340+
CreatedBB =
7341+
loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc, Soffset}, MDT);
7342+
}
7343+
}
73267344

73277345
AMDGPU::OpName SampOpName =
73287346
isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
@@ -7530,17 +7548,6 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
75307548
}
75317549

75327550
// Legalize MUBUF instructions.
7533-
bool isSoffsetLegal = true;
7534-
int SoffsetIdx =
7535-
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
7536-
if (SoffsetIdx != -1) {
7537-
MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
7538-
if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7539-
!RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
7540-
isSoffsetLegal = false;
7541-
}
7542-
}
7543-
75447551
bool isRsrcLegal = true;
75457552
int RsrcIdx =
75467553
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -738,6 +738,48 @@ main_body:
738738
ret float %fdata
739739
}
740740

741+
define amdgpu_ps i32 @buffer_load_x_i32__vgpr_rsrc__vgpr_soffset(ptr addrspace(1) %in) {
742+
; GFX13-LABEL: buffer_load_x_i32__vgpr_rsrc__vgpr_soffset:
743+
; GFX13: ; %bb.0:
744+
; GFX13-NEXT: s_clause 0x1
745+
; GFX13-NEXT: global_load_b32 v4, v[0:1], off offset:16
746+
; GFX13-NEXT: global_load_b128 v[0:3], v[0:1], off
747+
; GFX13-NEXT: v_mov_b32_e32 v5, 0
748+
; GFX13-NEXT: s_mov_b32 s2, exec_lo
749+
; GFX13-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
750+
; GFX13-NEXT: s_wait_loadcnt 0x0
751+
; GFX13-NEXT: v_readfirstlane_b32 s4, v0
752+
; GFX13-NEXT: v_readfirstlane_b32 s5, v1
753+
; GFX13-NEXT: v_readfirstlane_b32 s6, v2
754+
; GFX13-NEXT: v_readfirstlane_b32 s7, v3
755+
; GFX13-NEXT: v_readfirstlane_b32 s3, v4
756+
; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
757+
; GFX13-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
758+
; GFX13-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
759+
; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
760+
; GFX13-NEXT: v_cmp_eq_u32_e64 s1, s3, v4
761+
; GFX13-NEXT: s_and_b32 s0, vcc_lo, s0
762+
; GFX13-NEXT: s_and_b32 s0, s0, s1
763+
; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
764+
; GFX13-NEXT: s_and_saveexec_b32 s0, s0
765+
; GFX13-NEXT: buffer_load_format_x v6, v5, s[4:7], s3 idxen
766+
; GFX13-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
767+
; GFX13-NEXT: ; implicit-def: $vgpr4
768+
; GFX13-NEXT: ; implicit-def: $vgpr5
769+
; GFX13-NEXT: s_xor_b32 exec_lo, exec_lo, s0
770+
; GFX13-NEXT: s_cbranch_execnz .LBB0_1
771+
; GFX13-NEXT: ; %bb.2:
772+
; GFX13-NEXT: s_mov_b32 exec_lo, s2
773+
; GFX13-NEXT: s_wait_loadcnt 0x0
774+
; GFX13-NEXT: v_readfirstlane_b32 s0, v6
775+
; GFX13-NEXT: ; return to shader part epilog
776+
%vgpr = load <5 x i32>, ptr addrspace(1) %in, align 4
777+
%offset = extractelement <5 x i32> %vgpr, i64 4
778+
%rsrc = shufflevector <5 x i32> %vgpr, <5 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
779+
%val = call i32 @llvm.amdgcn.struct.buffer.load.format.i32.v4i32(<4 x i32> %rsrc, i32 0, i32 0, i32 %offset, i32 0)
780+
ret i32 %val
781+
}
782+
741783
define amdgpu_ps <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) {
742784
; GFX6-LABEL: buffer_load_xy:
743785
; GFX6: ; %bb.0: ; %main_body
@@ -1349,6 +1391,7 @@ declare float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32>, i32, i32, i3
13491391
declare <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32>, i32, i32, i32, i32) #0
13501392
declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32, i32) #0
13511393
declare i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32>, i32, i32, i32, i32) #0
1394+
declare i32 @llvm.amdgcn.struct.buffer.load.format.i32.v4i32(<4 x i32>, i32, i32, i32, i32 immarg) #0
13521395
declare { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4i32i32s(<4 x i32>, i32, i32, i32, i32 immarg) #0
13531396
declare { <4 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4f32i32s(<4 x i32>, i32, i32, i32, i32 immarg) #0
13541397
declare { <3 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v3i32i32s(<4 x i32>, i32, i32, i32, i32 immarg) #0

0 commit comments

Comments
 (0)