Skip to content

Commit b0ee92b

Browse files
authored
[AMDGPU] Restrict scale operands of WMMA to low 256 VGPRs (#157526)
These cannot accept high registers.
1 parent 6d032c4 commit b0ee92b

File tree

5 files changed

+56
-32
lines changed

5 files changed

+56
-32
lines changed

llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -627,13 +627,21 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) {
627627
TRI = ST.getRegisterInfo();
628628
TII = ST.getInstrInfo();
629629

630+
// Instructions to re-legalize after changing register classes
631+
SmallVector<MachineInstr *, 8> Relegalize;
632+
630633
for (MachineBasicBlock &MBB : MF) {
631634
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
632635
++I) {
633636
MachineInstr &MI = *I;
634637

635638
switch (MI.getOpcode()) {
636639
default:
640+
// scale_src has a register class restricted to low 256 VGPRs, changing
641+
// registers to VGPR may not take it into account.
642+
if (TII->isWMMA(MI) &&
643+
AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::scale_src0))
644+
Relegalize.push_back(&MI);
637645
continue;
638646
case AMDGPU::COPY: {
639647
const TargetRegisterClass *SrcRC, *DstRC;
@@ -791,6 +799,9 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) {
791799
for (auto *MI : PHINodes) {
792800
processPHINode(*MI);
793801
}
802+
while (!Relegalize.empty())
803+
TII->legalizeOperands(*Relegalize.pop_back_val(), MDT);
804+
794805
if (MF.getTarget().getOptLevel() > CodeGenOptLevel::None && EnableM0Merge)
795806
hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII);
796807

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6574,6 +6574,21 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
65746574
!RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
65756575
legalizeOpWithMove(MI, VOP3Idx[2]);
65766576

6577+
if (isWMMA(MI)) {
6578+
// scale_src has a register class restricted to low 256 VGPRs, we may need
6579+
// to insert a copy to the restricted VGPR class.
6580+
int ScaleSrc0Idx =
6581+
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::scale_src0);
6582+
if (ScaleSrc0Idx != -1) {
6583+
int ScaleSrc1Idx =
6584+
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::scale_src1);
6585+
if (!isOperandLegal(MI, ScaleSrc0Idx))
6586+
legalizeOpWithMove(MI, ScaleSrc0Idx);
6587+
if (!isOperandLegal(MI, ScaleSrc1Idx))
6588+
legalizeOpWithMove(MI, ScaleSrc1Idx);
6589+
}
6590+
}
6591+
65776592
// Fix the register class of packed FP32 instructions on gfx12+. See
65786593
// SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
65796594
if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(ST)) {

llvm/lib/Target/AMDGPU/VOP3PInstructions.td

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1518,8 +1518,8 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
15181518
dag MatrixFMT = !if(HasMatrixFMT, (ins MatrixAFMT:$matrix_a_fmt, MatrixBFMT:$matrix_b_fmt),
15191519
(ins));
15201520
dag MatrixScaleSrc = !if(HasMatrixScale,
1521-
!if(Scale16, (ins VCSrc_b64:$scale_src0, VCSrc_b64:$scale_src1),
1522-
(ins VCSrc_b32:$scale_src0, VCSrc_b32:$scale_src1)),
1521+
!if(Scale16, (ins VCSrc_b64_Lo256:$scale_src0, VCSrc_b64_Lo256:$scale_src1),
1522+
(ins VCSrc_b32_Lo256:$scale_src0, VCSrc_b32_Lo256:$scale_src1)),
15231523
(ins));
15241524
dag MatrixScale = !if(HasMatrixScale, (ins MatrixAScale:$matrix_a_scale, MatrixBScale:$matrix_b_scale,
15251525
MatrixAScaleFmt:$matrix_a_scale_fmt, MatrixBScaleFmt:$matrix_b_scale_fmt),

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -901,9 +901,9 @@ bb:
901901
define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_si_scale(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 inreg %scale_src0, ptr addrspace(1) %out) {
902902
; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_si_scale:
903903
; GFX1250: ; %bb.0: ; %bb
904-
; GFX1250-NEXT: s_movk_i32 s1, 0x64
905-
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
906-
; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s0, s1 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
904+
; GFX1250-NEXT: v_mov_b32_e32 v42, 0x64
905+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
906+
; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s0, v42 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
907907
; GFX1250-NEXT: s_clause 0x1
908908
; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
909909
; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off
@@ -1499,9 +1499,9 @@ bb:
14991499
define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_si_scale(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 inreg %scale_src0, ptr addrspace(1) %out) {
15001500
; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_si_scale:
15011501
; GFX1250: ; %bb.0: ; %bb
1502-
; GFX1250-NEXT: s_mov_b64 s[2:3], 0x64
1503-
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1504-
; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s[0:1], s[2:3] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
1502+
; GFX1250-NEXT: v_mov_b64_e32 v[42:43], 0x64
1503+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
1504+
; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s[0:1], v[42:43] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
15051505
; GFX1250-NEXT: s_clause 0x1
15061506
; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
15071507
; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off
@@ -2291,9 +2291,9 @@ bb:
22912291
define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_si_scale(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i32 inreg %scale_src0, ptr addrspace(1) %out) {
22922292
; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_si_scale:
22932293
; GFX1250: ; %bb.0: ; %bb
2294-
; GFX1250-NEXT: s_movk_i32 s1, 0x64
2295-
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2296-
; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s0, s1 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
2294+
; GFX1250-NEXT: v_mov_b32_e32 v42, 0x64
2295+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
2296+
; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s0, v42 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
22972297
; GFX1250-NEXT: s_clause 0x3
22982298
; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48
22992299
; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32
@@ -2373,9 +2373,9 @@ bb:
23732373
define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_si_scale(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i64 inreg %scale_src0, ptr addrspace(1) %out) {
23742374
; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_si_scale:
23752375
; GFX1250: ; %bb.0: ; %bb
2376-
; GFX1250-NEXT: s_mov_b64 s[2:3], 0x64
2377-
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2378-
; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s[0:1], s[2:3] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
2376+
; GFX1250-NEXT: v_mov_b64_e32 v[42:43], 0x64
2377+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
2378+
; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s[0:1], v[42:43] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
23792379
; GFX1250-NEXT: s_clause 0x3
23802380
; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48
23812381
; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll

Lines changed: 16 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1512,14 +1512,13 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_non_inlineable(<16 x
15121512
; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_non_inlineable:
15131513
; GFX1250: ; %bb.0: ; %bb
15141514
; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000
1515-
; GFX1250-NEXT: s_movk_i32 s0, 0x65
1516-
; GFX1250-NEXT: s_movk_i32 s1, 0x64
1517-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
1515+
; GFX1250-NEXT: v_mov_b32_e32 v43, 0x64
1516+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
1517+
; GFX1250-NEXT: v_dual_mov_b32 v42, 0x65 :: v_dual_mov_b32 v41, v34
15181518
; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
15191519
; GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34
15201520
; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34
1521-
; GFX1250-NEXT: v_mov_b32_e32 v41, v34
1522-
; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], s1, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
1521+
; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], v43, v42 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
15231522
; GFX1250-NEXT: s_clause 0x1
15241523
; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
15251524
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
@@ -1619,14 +1618,14 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_non_inlineable(<16
16191618
; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_non_inlineable:
16201619
; GFX1250: ; %bb.0: ; %bb
16211620
; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000
1622-
; GFX1250-NEXT: s_mov_b64 s[0:1], 0x65
1623-
; GFX1250-NEXT: s_mov_b64 s[2:3], 0x64
1624-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
1621+
; GFX1250-NEXT: v_mov_b64_e32 v[42:43], 0x65
1622+
; GFX1250-NEXT: v_mov_b64_e32 v[44:45], 0x64
1623+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
16251624
; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
16261625
; GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34
16271626
; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34
16281627
; GFX1250-NEXT: v_mov_b32_e32 v41, v34
1629-
; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], s[2:3], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
1628+
; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], v[44:45], v[42:43] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
16301629
; GFX1250-NEXT: s_clause 0x1
16311630
; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
16321631
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
@@ -2621,19 +2620,18 @@ define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_non_inlineable(<16 x i32
26212620
; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_non_inlineable:
26222621
; GFX1250: ; %bb.0: ; %bb
26232622
; GFX1250-NEXT: v_mov_b32_e32 v26, 0x40400000
2624-
; GFX1250-NEXT: s_movk_i32 s0, 0x65
2625-
; GFX1250-NEXT: s_movk_i32 s1, 0x64
2626-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
2623+
; GFX1250-NEXT: v_mov_b32_e32 v43, 0x64
2624+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
2625+
; GFX1250-NEXT: v_dual_mov_b32 v42, 0x65 :: v_dual_mov_b32 v41, v26
26272626
; GFX1250-NEXT: v_dual_mov_b32 v27, v26 :: v_dual_mov_b32 v28, v26
26282627
; GFX1250-NEXT: v_dual_mov_b32 v29, v26 :: v_dual_mov_b32 v30, v26
26292628
; GFX1250-NEXT: v_dual_mov_b32 v31, v26 :: v_dual_mov_b32 v32, v26
26302629
; GFX1250-NEXT: v_dual_mov_b32 v33, v26 :: v_dual_mov_b32 v34, v26
26312630
; GFX1250-NEXT: v_dual_mov_b32 v35, v26 :: v_dual_mov_b32 v36, v26
26322631
; GFX1250-NEXT: v_dual_mov_b32 v37, v26 :: v_dual_mov_b32 v38, v26
26332632
; GFX1250-NEXT: v_dual_mov_b32 v39, v26 :: v_dual_mov_b32 v40, v26
2634-
; GFX1250-NEXT: v_mov_b32_e32 v41, v26
26352633
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
2636-
; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], s1, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
2634+
; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], v43, v42 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
26372635
; GFX1250-NEXT: s_clause 0x3
26382636
; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
26392637
; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
@@ -2774,9 +2772,9 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_inlineable(<16 x i
27742772
; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_non_inlineable:
27752773
; GFX1250: ; %bb.0: ; %bb
27762774
; GFX1250-NEXT: v_mov_b32_e32 v26, 0x40400000
2777-
; GFX1250-NEXT: s_mov_b64 s[0:1], 0x65
2778-
; GFX1250-NEXT: s_mov_b64 s[2:3], 0x64
2779-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
2775+
; GFX1250-NEXT: v_mov_b64_e32 v[42:43], 0x65
2776+
; GFX1250-NEXT: v_mov_b64_e32 v[44:45], 0x64
2777+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3)
27802778
; GFX1250-NEXT: v_dual_mov_b32 v27, v26 :: v_dual_mov_b32 v28, v26
27812779
; GFX1250-NEXT: v_dual_mov_b32 v29, v26 :: v_dual_mov_b32 v30, v26
27822780
; GFX1250-NEXT: v_dual_mov_b32 v31, v26 :: v_dual_mov_b32 v32, v26
@@ -2786,7 +2784,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_inlineable(<16 x i
27862784
; GFX1250-NEXT: v_dual_mov_b32 v39, v26 :: v_dual_mov_b32 v40, v26
27872785
; GFX1250-NEXT: v_mov_b32_e32 v41, v26
27882786
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
2789-
; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], s[2:3], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
2787+
; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], v[44:45], v[42:43] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
27902788
; GFX1250-NEXT: s_clause 0x3
27912789
; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
27922790
; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32

0 commit comments

Comments
 (0)