[AMDGPU] Restrict scale operands of WMMA to low 256 VGPRs (#157526)

rampitec · web-flow · commit b0ee92be9474 · 2025-09-08T15:44:51.000-07:00
These cannot accept high registers.
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -627,13 +627,21 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) {
   TRI = ST.getRegisterInfo();
   TII = ST.getInstrInfo();
 
+  // Instructions to re-legalize after changing register classes
+  SmallVector<MachineInstr *, 8> Relegalize;
+
   for (MachineBasicBlock &MBB : MF) {
     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
          ++I) {
       MachineInstr &MI = *I;
 
       switch (MI.getOpcode()) {
       default:
+        // scale_src has a register class restricted to low 256 VGPRs, changing
+        // registers to VGPR may not take it into acount.
+        if (TII->isWMMA(MI) &&
+            AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::scale_src0))
+          Relegalize.push_back(&MI);
         continue;
       case AMDGPU::COPY: {
         const TargetRegisterClass *SrcRC, *DstRC;
@@ -791,6 +799,9 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) {
   for (auto *MI : PHINodes) {
     processPHINode(*MI);
   }
+  while (!Relegalize.empty())
+    TII->legalizeOperands(*Relegalize.pop_back_val(), MDT);
+
   if (MF.getTarget().getOptLevel() > CodeGenOptLevel::None && EnableM0Merge)
     hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII);
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6574,6 +6574,21 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
       !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
     legalizeOpWithMove(MI, VOP3Idx[2]);
 
+  if (isWMMA(MI)) {
+    // scale_src has a register class restricted to low 256 VGPRs, we may need
+    // to insert a copy to the restricted VGPR class.
+    int ScaleSrc0Idx =
+        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::scale_src0);
+    if (ScaleSrc0Idx != -1) {
+      int ScaleSrc1Idx =
+          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::scale_src1);
+      if (!isOperandLegal(MI, ScaleSrc0Idx))
+        legalizeOpWithMove(MI, ScaleSrc0Idx);
+      if (!isOperandLegal(MI, ScaleSrc1Idx))
+        legalizeOpWithMove(MI, ScaleSrc1Idx);
+    }
+  }
+
   // Fix the register class of packed FP32 instructions on gfx12+. See
   // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
   if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(ST)) {
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -1518,8 +1518,8 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
   dag MatrixFMT = !if(HasMatrixFMT, (ins MatrixAFMT:$matrix_a_fmt, MatrixBFMT:$matrix_b_fmt),
                                    (ins));
   dag MatrixScaleSrc = !if(HasMatrixScale,
-                           !if(Scale16, (ins VCSrc_b64:$scale_src0, VCSrc_b64:$scale_src1),
-                                        (ins VCSrc_b32:$scale_src0, VCSrc_b32:$scale_src1)),
+                           !if(Scale16, (ins VCSrc_b64_Lo256:$scale_src0, VCSrc_b64_Lo256:$scale_src1),
+                                        (ins VCSrc_b32_Lo256:$scale_src0, VCSrc_b32_Lo256:$scale_src1)),
                            (ins));
   dag MatrixScale = !if(HasMatrixScale, (ins MatrixAScale:$matrix_a_scale, MatrixBScale:$matrix_b_scale,
                                              MatrixAScaleFmt:$matrix_a_scale_fmt, MatrixBScaleFmt:$matrix_b_scale_fmt),
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll
@@ -901,9 +901,9 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_si_scale(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 inreg %scale_src0, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_si_scale:
 ; GFX1250:       ; %bb.0: ; %bb
-; GFX1250-NEXT:    s_movk_i32 s1, 0x64
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s0, s1 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
+; GFX1250-NEXT:    v_mov_b32_e32 v42, 0x64
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s0, v42 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:16
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[32:35], off
@@ -1499,9 +1499,9 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_si_scale(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 inreg %scale_src0, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_si_scale:
 ; GFX1250:       ; %bb.0: ; %bb
-; GFX1250-NEXT:    s_mov_b64 s[2:3], 0x64
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s[0:1], s[2:3] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
+; GFX1250-NEXT:    v_mov_b64_e32 v[42:43], 0x64
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s[0:1], v[42:43] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:16
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[32:35], off
@@ -2291,9 +2291,9 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_si_scale(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i32 inreg %scale_src0, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_si_scale:
 ; GFX1250:       ; %bb.0: ; %bb
-; GFX1250-NEXT:    s_movk_i32 s1, 0x64
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s0, s1 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
+; GFX1250-NEXT:    v_mov_b32_e32 v42, 0x64
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s0, v42 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
 ; GFX1250-NEXT:    s_clause 0x3
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:48
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[32:35], off offset:32
@@ -2373,9 +2373,9 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_si_scale(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i64 inreg %scale_src0, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_si_scale:
 ; GFX1250:       ; %bb.0: ; %bb
-; GFX1250-NEXT:    s_mov_b64 s[2:3], 0x64
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s[0:1], s[2:3] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
+; GFX1250-NEXT:    v_mov_b64_e32 v[42:43], 0x64
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s[0:1], v[42:43] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
 ; GFX1250-NEXT:    s_clause 0x3
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:48
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[32:35], off offset:32
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll
@@ -1512,14 +1512,13 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_non_inlineable(<16 x
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_non_inlineable:
 ; GFX1250:       ; %bb.0: ; %bb
 ; GFX1250-NEXT:    v_mov_b32_e32 v34, 0x40400000
-; GFX1250-NEXT:    s_movk_i32 s0, 0x65
-; GFX1250-NEXT:    s_movk_i32 s1, 0x64
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_mov_b32_e32 v43, 0x64
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_dual_mov_b32 v42, 0x65 :: v_dual_mov_b32 v41, v34
 ; GFX1250-NEXT:    v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
 ; GFX1250-NEXT:    v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34
 ; GFX1250-NEXT:    v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34
-; GFX1250-NEXT:    v_mov_b32_e32 v41, v34
-; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], s1, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
+; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], v43, v42 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[32:33], v[38:41], off offset:16
 ; GFX1250-NEXT:    global_store_b128 v[32:33], v[34:37], off
@@ -1619,14 +1618,14 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_non_inlineable(<16
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_non_inlineable:
 ; GFX1250:       ; %bb.0: ; %bb
 ; GFX1250-NEXT:    v_mov_b32_e32 v34, 0x40400000
-; GFX1250-NEXT:    s_mov_b64 s[0:1], 0x65
-; GFX1250-NEXT:    s_mov_b64 s[2:3], 0x64
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_mov_b64_e32 v[42:43], 0x65
+; GFX1250-NEXT:    v_mov_b64_e32 v[44:45], 0x64
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
 ; GFX1250-NEXT:    v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34
 ; GFX1250-NEXT:    v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34
 ; GFX1250-NEXT:    v_mov_b32_e32 v41, v34
-; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], s[2:3], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
+; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], v[44:45], v[42:43] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[32:33], v[38:41], off offset:16
 ; GFX1250-NEXT:    global_store_b128 v[32:33], v[34:37], off
@@ -2621,19 +2620,18 @@ define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_non_inlineable(<16 x i32
 ; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_non_inlineable:
 ; GFX1250:       ; %bb.0: ; %bb
 ; GFX1250-NEXT:    v_mov_b32_e32 v26, 0x40400000
-; GFX1250-NEXT:    s_movk_i32 s0, 0x65
-; GFX1250-NEXT:    s_movk_i32 s1, 0x64
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_mov_b32_e32 v43, 0x64
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-NEXT:    v_dual_mov_b32 v42, 0x65 :: v_dual_mov_b32 v41, v26
 ; GFX1250-NEXT:    v_dual_mov_b32 v27, v26 :: v_dual_mov_b32 v28, v26
 ; GFX1250-NEXT:    v_dual_mov_b32 v29, v26 :: v_dual_mov_b32 v30, v26
 ; GFX1250-NEXT:    v_dual_mov_b32 v31, v26 :: v_dual_mov_b32 v32, v26
 ; GFX1250-NEXT:    v_dual_mov_b32 v33, v26 :: v_dual_mov_b32 v34, v26
 ; GFX1250-NEXT:    v_dual_mov_b32 v35, v26 :: v_dual_mov_b32 v36, v26
 ; GFX1250-NEXT:    v_dual_mov_b32 v37, v26 :: v_dual_mov_b32 v38, v26
 ; GFX1250-NEXT:    v_dual_mov_b32 v39, v26 :: v_dual_mov_b32 v40, v26
-; GFX1250-NEXT:    v_mov_b32_e32 v41, v26
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], s1, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
+; GFX1250-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], v43, v42 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
 ; GFX1250-NEXT:    s_clause 0x3
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[38:41], off offset:48
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[34:37], off offset:32
@@ -2774,9 +2772,9 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_inlineable(<16 x i
 ; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_non_inlineable:
 ; GFX1250:       ; %bb.0: ; %bb
 ; GFX1250-NEXT:    v_mov_b32_e32 v26, 0x40400000
-; GFX1250-NEXT:    s_mov_b64 s[0:1], 0x65
-; GFX1250-NEXT:    s_mov_b64 s[2:3], 0x64
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_mov_b64_e32 v[42:43], 0x65
+; GFX1250-NEXT:    v_mov_b64_e32 v[44:45], 0x64
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX1250-NEXT:    v_dual_mov_b32 v27, v26 :: v_dual_mov_b32 v28, v26
 ; GFX1250-NEXT:    v_dual_mov_b32 v29, v26 :: v_dual_mov_b32 v30, v26
 ; GFX1250-NEXT:    v_dual_mov_b32 v31, v26 :: v_dual_mov_b32 v32, v26
@@ -2786,7 +2784,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_inlineable(<16 x i
 ; GFX1250-NEXT:    v_dual_mov_b32 v39, v26 :: v_dual_mov_b32 v40, v26
 ; GFX1250-NEXT:    v_mov_b32_e32 v41, v26
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], s[2:3], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
+; GFX1250-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], v[44:45], v[42:43] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
 ; GFX1250-NEXT:    s_clause 0x3
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[38:41], off offset:48
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[34:37], off offset:32