Skip to content

Commit d0f8358

Browse files
committed
[AMDGPU][ISel][GFX12] Restrict scalar subword loads to PAL
On gfx12, s_buffer_load_(i/u)(8/16) have a hardware bug that is triggered when:
* the stride is not a multiple of 4, or
* the stride is 0 and the num-records is not a multiple of 4.

At the moment, these instructions are only generated for PAL, and in that case it is guaranteed that the buffer's stride/num-records are aligned to 4. This patch restricts the emission of scalar subword loads to PAL, where the bug can never be triggered, and avoids emitting them for HSA (where the bug could be triggered, but the instructions are not currently used). Solves SWDEV-498239.
1 parent 91bad35 commit d0f8358

File tree

2 files changed

+26
-118
lines changed

2 files changed

+26
-118
lines changed

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6803,13 +6803,38 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
68036803
unsigned Size = Ty.getSizeInBits();
68046804
MachineFunction &MF = B.getMF();
68056805
unsigned Opc = 0;
6806+
6807+
const unsigned MemSize = (Size + 7) / 8;
6808+
const Align MemAlign = B.getDataLayout().getABITypeAlign(
6809+
getTypeForLLT(Ty, MF.getFunction().getContext()));
6810+
6811+
// FIXME: When intrinsic definition is fixed, this should have an MMO already.
6812+
MachineMemOperand *MMO = MF.getMachineMemOperand(
6813+
MachinePointerInfo(),
6814+
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6815+
MachineMemOperand::MOInvariant,
6816+
MemSize, MemAlign);
6817+
68066818
if (Size < 32 && ST.hasScalarSubwordLoads()) {
68076819
assert(Size == 8 || Size == 16);
6820+
if (!ST.hasScalarSubwordBufferLoads()) {
6821+
// fallback to S_BUFFER_LOAD_UBYTE/USHORT
6822+
MI.getOperand(1).setIntrinsicID(Intrinsic::amdgcn_raw_buffer_load);
6823+
6824+
Register Zero = B.buildConstant(S32, 0).getReg(0);
6825+
MI.insert(MI.operands_begin() + 4,
6826+
MachineOperand::CreateReg(Zero, false));
6827+
6828+
MI.addMemOperand(MF, MMO);
6829+
Observer.changedInstr(MI);
6830+
return true;
6831+
}
6832+
68086833
Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
68096834
: AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
68106835
// The 8-bit and 16-bit scalar buffer load instructions have 32-bit
68116836
// destination register.
6812-
Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
6837+
Dst = B.getMRI()->createGenericVirtualRegister(S32);
68136838
} else {
68146839
Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
68156840
Dst = OrigDst;
@@ -6834,15 +6859,6 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
68346859
MI.setDesc(B.getTII().get(Opc));
68356860
MI.removeOperand(1); // Remove intrinsic ID
68366861

6837-
// FIXME: When intrinsic definition is fixed, this should have an MMO already.
6838-
const unsigned MemSize = (Size + 7) / 8;
6839-
const Align MemAlign = B.getDataLayout().getABITypeAlign(
6840-
getTypeForLLT(Ty, MF.getFunction().getContext()));
6841-
MachineMemOperand *MMO = MF.getMachineMemOperand(
6842-
MachinePointerInfo(),
6843-
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6844-
MachineMemOperand::MOInvariant,
6845-
MemSize, MemAlign);
68466862
MI.addMemOperand(MF, MMO);
68476863
if (Dst != OrigDst) {
68486864
MI.getOperand(0).setReg(Dst);

llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll

Lines changed: 0 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -428,14 +428,6 @@ define amdgpu_ps void @s_buffer_load_byte_imm_offset(<4 x i32> inreg %src, ptr a
428428
; DAG-DEFAULT-NEXT: global_store_b32 v[0:1], v2, off
429429
; DAG-DEFAULT-NEXT: s_endpgm
430430
;
431-
; GISEL-LABEL: s_buffer_load_byte_imm_offset:
432-
; GISEL: ; %bb.0: ; %main_body
433-
; GISEL-NEXT: s_buffer_load_i8 s0, s[0:3], 0x4
434-
; GISEL-NEXT: s_wait_kmcnt 0x0
435-
; GISEL-NEXT: v_mov_b32_e32 v2, s0
436-
; GISEL-NEXT: global_store_b32 v[0:1], v2, off
437-
; GISEL-NEXT: s_endpgm
438-
;
439431
; DAG-PAL-LABEL: s_buffer_load_byte_imm_offset:
440432
; DAG-PAL: ; %bb.0: ; %main_body
441433
; DAG-PAL-NEXT: s_buffer_load_i8 s0, s[0:3], 0x4
@@ -459,14 +451,6 @@ define amdgpu_ps void @s_buffer_load_byte_sgpr(<4 x i32> inreg %src, ptr addrspa
459451
; DAG-DEFAULT-NEXT: global_store_b32 v[0:1], v2, off
460452
; DAG-DEFAULT-NEXT: s_endpgm
461453
;
462-
; GISEL-LABEL: s_buffer_load_byte_sgpr:
463-
; GISEL: ; %bb.0: ; %main_body
464-
; GISEL-NEXT: s_buffer_load_i8 s0, s[0:3], s4 offset:0x0
465-
; GISEL-NEXT: s_wait_kmcnt 0x0
466-
; GISEL-NEXT: v_mov_b32_e32 v2, s0
467-
; GISEL-NEXT: global_store_b32 v[0:1], v2, off
468-
; GISEL-NEXT: s_endpgm
469-
;
470454
; DAG-PAL-LABEL: s_buffer_load_byte_sgpr:
471455
; DAG-PAL: ; %bb.0: ; %main_body
472456
; DAG-PAL-NEXT: s_buffer_load_i8 s0, s[0:3], s4 offset:0x0
@@ -490,14 +474,6 @@ define amdgpu_ps void @s_buffer_load_byte_sgpr_or_imm_offset(<4 x i32> inreg %sr
490474
; DAG-DEFAULT-NEXT: global_store_b32 v[0:1], v2, off
491475
; DAG-DEFAULT-NEXT: s_endpgm
492476
;
493-
; GISEL-LABEL: s_buffer_load_byte_sgpr_or_imm_offset:
494-
; GISEL: ; %bb.0: ; %main_body
495-
; GISEL-NEXT: s_buffer_load_i8 s0, s[0:3], s4 offset:0x64
496-
; GISEL-NEXT: s_wait_kmcnt 0x0
497-
; GISEL-NEXT: v_mov_b32_e32 v2, s0
498-
; GISEL-NEXT: global_store_b32 v[0:1], v2, off
499-
; GISEL-NEXT: s_endpgm
500-
;
501477
; DAG-PAL-LABEL: s_buffer_load_byte_sgpr_or_imm_offset:
502478
; DAG-PAL: ; %bb.0: ; %main_body
503479
; DAG-PAL-NEXT: s_buffer_load_i8 s0, s[0:3], s4 offset:0x64
@@ -535,16 +511,6 @@ define amdgpu_ps void @s_buffer_load_ubyte_imm_offset(<4 x i32> inreg %src, ptr
535511
; DAG-DEFAULT-NEXT: global_store_b32 v[0:1], v2, off
536512
; DAG-DEFAULT-NEXT: s_endpgm
537513
;
538-
; GISEL-LABEL: s_buffer_load_ubyte_imm_offset:
539-
; GISEL: ; %bb.0: ; %main_body
540-
; GISEL-NEXT: s_buffer_load_u8 s0, s[0:3], 0x4
541-
; GISEL-NEXT: s_wait_kmcnt 0x0
542-
; GISEL-NEXT: s_and_b32 s0, s0, 0xff
543-
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
544-
; GISEL-NEXT: v_mov_b32_e32 v2, s0
545-
; GISEL-NEXT: global_store_b32 v[0:1], v2, off
546-
; GISEL-NEXT: s_endpgm
547-
;
548514
; DAG-PAL-LABEL: s_buffer_load_ubyte_imm_offset:
549515
; DAG-PAL: ; %bb.0: ; %main_body
550516
; DAG-PAL-NEXT: s_buffer_load_u8 s0, s[0:3], 0x4
@@ -570,16 +536,6 @@ define amdgpu_ps void @s_buffer_load_ubyte_sgpr(<4 x i32> inreg %src, ptr addrsp
570536
; DAG-DEFAULT-NEXT: global_store_b32 v[0:1], v2, off
571537
; DAG-DEFAULT-NEXT: s_endpgm
572538
;
573-
; GISEL-LABEL: s_buffer_load_ubyte_sgpr:
574-
; GISEL: ; %bb.0: ; %main_body
575-
; GISEL-NEXT: s_buffer_load_u8 s0, s[0:3], s4 offset:0x0
576-
; GISEL-NEXT: s_wait_kmcnt 0x0
577-
; GISEL-NEXT: s_and_b32 s0, s0, 0xff
578-
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
579-
; GISEL-NEXT: v_mov_b32_e32 v2, s0
580-
; GISEL-NEXT: global_store_b32 v[0:1], v2, off
581-
; GISEL-NEXT: s_endpgm
582-
;
583539
; DAG-PAL-LABEL: s_buffer_load_ubyte_sgpr:
584540
; DAG-PAL: ; %bb.0: ; %main_body
585541
; DAG-PAL-NEXT: s_buffer_load_u8 s0, s[0:3], s4 offset:0x0
@@ -605,16 +561,6 @@ define amdgpu_ps void @s_buffer_load_ubyte_sgpr_or_imm_offset(<4 x i32> inreg %s
605561
; DAG-DEFAULT-NEXT: global_store_b32 v[0:1], v2, off
606562
; DAG-DEFAULT-NEXT: s_endpgm
607563
;
608-
; GISEL-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset:
609-
; GISEL: ; %bb.0: ; %main_body
610-
; GISEL-NEXT: s_buffer_load_u8 s0, s[0:3], s4 offset:0x64
611-
; GISEL-NEXT: s_wait_kmcnt 0x0
612-
; GISEL-NEXT: s_and_b32 s0, s0, 0xff
613-
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
614-
; GISEL-NEXT: v_mov_b32_e32 v2, s0
615-
; GISEL-NEXT: global_store_b32 v[0:1], v2, off
616-
; GISEL-NEXT: s_endpgm
617-
;
618564
; DAG-PAL-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset:
619565
; DAG-PAL: ; %bb.0: ; %main_body
620566
; DAG-PAL-NEXT: s_buffer_load_u8 s0, s[0:3], s4 offset:0x64
@@ -654,14 +600,6 @@ define amdgpu_ps void @s_buffer_load_short_imm_offset(<4 x i32> inreg %src, ptr
654600
; DAG-DEFAULT-NEXT: global_store_b32 v[0:1], v2, off
655601
; DAG-DEFAULT-NEXT: s_endpgm
656602
;
657-
; GISEL-LABEL: s_buffer_load_short_imm_offset:
658-
; GISEL: ; %bb.0: ; %main_body
659-
; GISEL-NEXT: s_buffer_load_i16 s0, s[0:3], 0x4
660-
; GISEL-NEXT: s_wait_kmcnt 0x0
661-
; GISEL-NEXT: v_mov_b32_e32 v2, s0
662-
; GISEL-NEXT: global_store_b32 v[0:1], v2, off
663-
; GISEL-NEXT: s_endpgm
664-
;
665603
; DAG-PAL-LABEL: s_buffer_load_short_imm_offset:
666604
; DAG-PAL: ; %bb.0: ; %main_body
667605
; DAG-PAL-NEXT: s_buffer_load_i16 s0, s[0:3], 0x4
@@ -685,14 +623,6 @@ define amdgpu_ps void @s_buffer_load_short_sgpr(<4 x i32> inreg %src, ptr addrsp
685623
; DAG-DEFAULT-NEXT: global_store_b32 v[0:1], v2, off
686624
; DAG-DEFAULT-NEXT: s_endpgm
687625
;
688-
; GISEL-LABEL: s_buffer_load_short_sgpr:
689-
; GISEL: ; %bb.0: ; %main_body
690-
; GISEL-NEXT: s_buffer_load_i16 s0, s[0:3], s4 offset:0x0
691-
; GISEL-NEXT: s_wait_kmcnt 0x0
692-
; GISEL-NEXT: v_mov_b32_e32 v2, s0
693-
; GISEL-NEXT: global_store_b32 v[0:1], v2, off
694-
; GISEL-NEXT: s_endpgm
695-
;
696626
; DAG-PAL-LABEL: s_buffer_load_short_sgpr:
697627
; DAG-PAL: ; %bb.0: ; %main_body
698628
; DAG-PAL-NEXT: s_buffer_load_i16 s0, s[0:3], s4 offset:0x0
@@ -716,14 +646,6 @@ define amdgpu_ps void @s_buffer_load_short_sgpr_or_imm_offset(<4 x i32> inreg %s
716646
; DAG-DEFAULT-NEXT: global_store_b32 v[0:1], v2, off
717647
; DAG-DEFAULT-NEXT: s_endpgm
718648
;
719-
; GISEL-LABEL: s_buffer_load_short_sgpr_or_imm_offset:
720-
; GISEL: ; %bb.0: ; %main_body
721-
; GISEL-NEXT: s_buffer_load_i16 s0, s[0:3], s4 offset:0x64
722-
; GISEL-NEXT: s_wait_kmcnt 0x0
723-
; GISEL-NEXT: v_mov_b32_e32 v2, s0
724-
; GISEL-NEXT: global_store_b32 v[0:1], v2, off
725-
; GISEL-NEXT: s_endpgm
726-
;
727649
; DAG-PAL-LABEL: s_buffer_load_short_sgpr_or_imm_offset:
728650
; DAG-PAL: ; %bb.0: ; %main_body
729651
; DAG-PAL-NEXT: s_buffer_load_i16 s0, s[0:3], s4 offset:0x64
@@ -761,16 +683,6 @@ define amdgpu_ps void @s_buffer_load_ushort_imm_offset(<4 x i32> inreg %src, ptr
761683
; DAG-DEFAULT-NEXT: global_store_b32 v[0:1], v2, off
762684
; DAG-DEFAULT-NEXT: s_endpgm
763685
;
764-
; GISEL-LABEL: s_buffer_load_ushort_imm_offset:
765-
; GISEL: ; %bb.0: ; %main_body
766-
; GISEL-NEXT: s_buffer_load_u16 s0, s[0:3], 0x4
767-
; GISEL-NEXT: s_wait_kmcnt 0x0
768-
; GISEL-NEXT: s_and_b32 s0, s0, 0xffff
769-
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
770-
; GISEL-NEXT: v_mov_b32_e32 v2, s0
771-
; GISEL-NEXT: global_store_b32 v[0:1], v2, off
772-
; GISEL-NEXT: s_endpgm
773-
;
774686
; DAG-PAL-LABEL: s_buffer_load_ushort_imm_offset:
775687
; DAG-PAL: ; %bb.0: ; %main_body
776688
; DAG-PAL-NEXT: s_buffer_load_u16 s0, s[0:3], 0x4
@@ -796,16 +708,6 @@ define amdgpu_ps void @s_buffer_load_ushort_sgpr(<4 x i32> inreg %src, ptr addrs
796708
; DAG-DEFAULT-NEXT: global_store_b32 v[0:1], v2, off
797709
; DAG-DEFAULT-NEXT: s_endpgm
798710
;
799-
; GISEL-LABEL: s_buffer_load_ushort_sgpr:
800-
; GISEL: ; %bb.0: ; %main_body
801-
; GISEL-NEXT: s_buffer_load_u16 s0, s[0:3], s4 offset:0x0
802-
; GISEL-NEXT: s_wait_kmcnt 0x0
803-
; GISEL-NEXT: s_and_b32 s0, s0, 0xffff
804-
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
805-
; GISEL-NEXT: v_mov_b32_e32 v2, s0
806-
; GISEL-NEXT: global_store_b32 v[0:1], v2, off
807-
; GISEL-NEXT: s_endpgm
808-
;
809711
; DAG-PAL-LABEL: s_buffer_load_ushort_sgpr:
810712
; DAG-PAL: ; %bb.0: ; %main_body
811713
; DAG-PAL-NEXT: s_buffer_load_u16 s0, s[0:3], s4 offset:0x0
@@ -831,16 +733,6 @@ define amdgpu_ps void @s_buffer_load_ushort_sgpr_or_imm_offset(<4 x i32> inreg %
831733
; DAG-DEFAULT-NEXT: global_store_b32 v[0:1], v2, off
832734
; DAG-DEFAULT-NEXT: s_endpgm
833735
;
834-
; GISEL-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset:
835-
; GISEL: ; %bb.0: ; %main_body
836-
; GISEL-NEXT: s_buffer_load_u16 s0, s[0:3], s4 offset:0x64
837-
; GISEL-NEXT: s_wait_kmcnt 0x0
838-
; GISEL-NEXT: s_and_b32 s0, s0, 0xffff
839-
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
840-
; GISEL-NEXT: v_mov_b32_e32 v2, s0
841-
; GISEL-NEXT: global_store_b32 v[0:1], v2, off
842-
; GISEL-NEXT: s_endpgm
843-
;
844736
; DAG-PAL-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset:
845737
; DAG-PAL: ; %bb.0: ; %main_body
846738
; DAG-PAL-NEXT: s_buffer_load_u16 s0, s[0:3], s4 offset:0x64

0 commit comments

Comments
 (0)