Skip to content

Commit b09f4db

Browse files
VigneshwarJ and arsenm committed
[AMDGPU] Bugfix for scaled MFMA parsing FP literals (llvm#142493)
Fix parsing of FP literals used as scale values in scaled MFMA instructions. Because the operand order in the MCInst differs from the order of the parsed operands, the FP literal immediates for the scale values were not parsed correctly. --------- Co-authored-by: Matt Arsenault <[email protected]>
1 parent a06c2c8 commit b09f4db

File tree

5 files changed

+462
-8
lines changed

5 files changed

+462
-8
lines changed

llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8808,6 +8808,7 @@ void AMDGPUAsmParser::cvtScaledMFMA(MCInst &Inst,
88088808
OptionalImmIndexMap OptionalIdx;
88098809
unsigned Opc = Inst.getOpcode();
88108810
unsigned I = 1;
8811+
int CbszOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::cbsz);
88118812

88128813
const MCInstrDesc &Desc = MII.get(Opc);
88138814

@@ -8816,8 +8817,15 @@ void AMDGPUAsmParser::cvtScaledMFMA(MCInst &Inst,
88168817

88178818
for (unsigned E = Operands.size(); I != E; ++I) {
88188819
AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands[I]);
8819-
8820-
if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
8820+
int NumOperands = Inst.getNumOperands();
8821+
// The order of operands in MCInst and parsed operands are different.
8822+
// Adding dummy cbsz and blgp operands at corresponding MCInst operand
8823+
// indices for parsing scale values correctly.
8824+
if (NumOperands == CbszOpIdx) {
8825+
Inst.addOperand(MCOperand::createImm(0));
8826+
Inst.addOperand(MCOperand::createImm(0));
8827+
}
8828+
if (isRegOrImmWithInputMods(Desc, NumOperands)) {
88218829
Op.addRegOrImmWithFPInputModsOperands(Inst, 2);
88228830
} else if (Op.isImmModifier()) {
88238831
OptionalIdx[Op.getImmTy()] = I;
@@ -8827,12 +8835,18 @@ void AMDGPUAsmParser::cvtScaledMFMA(MCInst &Inst,
88278835
}
88288836

88298837
// Insert CBSZ and BLGP operands for F8F6F4 variants
8830-
int InsertPos = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::cbsz);
8831-
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCBSZ,
8832-
0, InsertPos);
8833-
InsertPos = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::blgp);
8834-
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyBLGP,
8835-
0, InsertPos);
8838+
auto CbszIdx = OptionalIdx.find(AMDGPUOperand::ImmTyCBSZ);
8839+
if (CbszIdx != OptionalIdx.end()) {
8840+
int CbszVal = ((AMDGPUOperand &)*Operands[CbszIdx->second]).getImm();
8841+
Inst.getOperand(CbszOpIdx).setImm(CbszVal);
8842+
}
8843+
8844+
int BlgpOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::blgp);
8845+
auto BlgpIdx = OptionalIdx.find(AMDGPUOperand::ImmTyBLGP);
8846+
if (BlgpIdx != OptionalIdx.end()) {
8847+
int BlgpVal = ((AMDGPUOperand &)*Operands[BlgpIdx->second]).getImm();
8848+
Inst.getOperand(BlgpOpIdx).setImm(BlgpVal);
8849+
}
88368850

88378851
// Add dummy src_modifiers
88388852
Inst.addOperand(MCOperand::createImm(0));

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll

Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2030,6 +2030,205 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
20302030
ret void
20312031
}
20322032

2033+
define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, ptr addrspace(1) %ptr) #0 {
2034+
; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal:
2035+
; SDAG: ; %bb.0:
2036+
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2037+
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2038+
; SDAG-NEXT: s_movk_i32 s6, 0x41
2039+
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2040+
; SDAG-NEXT: v_mov_b32_e32 v16, 0
2041+
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2042+
; SDAG-NEXT: v_mov_b32_e32 v0, s8
2043+
; SDAG-NEXT: v_mov_b32_e32 v1, s9
2044+
; SDAG-NEXT: v_mov_b32_e32 v2, s10
2045+
; SDAG-NEXT: v_mov_b32_e32 v3, s11
2046+
; SDAG-NEXT: v_mov_b32_e32 v4, s12
2047+
; SDAG-NEXT: v_mov_b32_e32 v5, s13
2048+
; SDAG-NEXT: v_mov_b32_e32 v6, s14
2049+
; SDAG-NEXT: v_mov_b32_e32 v7, s15
2050+
; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
2051+
; SDAG-NEXT: v_mov_b32_e32 v8, s16
2052+
; SDAG-NEXT: v_mov_b32_e32 v9, s17
2053+
; SDAG-NEXT: v_mov_b32_e32 v10, s18
2054+
; SDAG-NEXT: v_mov_b32_e32 v11, s19
2055+
; SDAG-NEXT: v_mov_b32_e32 v12, s20
2056+
; SDAG-NEXT: v_mov_b32_e32 v13, s21
2057+
; SDAG-NEXT: v_mov_b32_e32 v14, s22
2058+
; SDAG-NEXT: v_mov_b32_e32 v15, s23
2059+
; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
2060+
; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
2061+
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
2062+
; SDAG-NEXT: s_nop 1
2063+
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2064+
; SDAG-NEXT: s_nop 7
2065+
; SDAG-NEXT: s_nop 3
2066+
; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
2067+
; SDAG-NEXT: s_endpgm
2068+
;
2069+
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal:
2070+
; GISEL: ; %bb.0:
2071+
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2072+
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2073+
; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
2074+
; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2075+
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2076+
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
2077+
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
2078+
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
2079+
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
2080+
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
2081+
; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
2082+
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
2083+
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
2084+
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
2085+
; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
2086+
; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
2087+
; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
2088+
; GISEL-NEXT: s_nop 1
2089+
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2090+
; GISEL-NEXT: v_mov_b32_e32 v0, 0
2091+
; GISEL-NEXT: s_nop 7
2092+
; GISEL-NEXT: s_nop 2
2093+
; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
2094+
; GISEL-NEXT: s_endpgm
2095+
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 1065353216)
2096+
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
2097+
ret void
2098+
}
2099+
2100+
define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, ptr addrspace(1) %ptr) #0 {
2101+
; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm:
2102+
; SDAG: ; %bb.0:
2103+
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2104+
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2105+
; SDAG-NEXT: v_mov_b32_e32 v16, 0
2106+
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2107+
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2108+
; SDAG-NEXT: v_mov_b32_e32 v0, s8
2109+
; SDAG-NEXT: v_mov_b32_e32 v1, s9
2110+
; SDAG-NEXT: v_mov_b32_e32 v2, s10
2111+
; SDAG-NEXT: v_mov_b32_e32 v3, s11
2112+
; SDAG-NEXT: v_mov_b32_e32 v4, s12
2113+
; SDAG-NEXT: v_mov_b32_e32 v5, s13
2114+
; SDAG-NEXT: v_mov_b32_e32 v6, s14
2115+
; SDAG-NEXT: v_mov_b32_e32 v7, s15
2116+
; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
2117+
; SDAG-NEXT: v_mov_b32_e32 v8, s16
2118+
; SDAG-NEXT: v_mov_b32_e32 v9, s17
2119+
; SDAG-NEXT: v_mov_b32_e32 v10, s18
2120+
; SDAG-NEXT: v_mov_b32_e32 v11, s19
2121+
; SDAG-NEXT: v_mov_b32_e32 v12, s20
2122+
; SDAG-NEXT: v_mov_b32_e32 v13, s21
2123+
; SDAG-NEXT: v_mov_b32_e32 v14, s22
2124+
; SDAG-NEXT: v_mov_b32_e32 v15, s23
2125+
; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
2126+
; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
2127+
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
2128+
; SDAG-NEXT: s_nop 1
2129+
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2130+
; SDAG-NEXT: s_nop 7
2131+
; SDAG-NEXT: s_nop 3
2132+
; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
2133+
; SDAG-NEXT: s_endpgm
2134+
;
2135+
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm:
2136+
; GISEL: ; %bb.0:
2137+
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2138+
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2139+
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2140+
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
2141+
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
2142+
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
2143+
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
2144+
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
2145+
; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
2146+
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
2147+
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
2148+
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
2149+
; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
2150+
; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
2151+
; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
2152+
; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2153+
; GISEL-NEXT: s_nop 0
2154+
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2155+
; GISEL-NEXT: v_mov_b32_e32 v0, 0
2156+
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2157+
; GISEL-NEXT: s_nop 7
2158+
; GISEL-NEXT: s_nop 1
2159+
; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
2160+
; GISEL-NEXT: s_endpgm
2161+
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 -2)
2162+
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
2163+
ret void
2164+
}
2165+
2166+
define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, ptr addrspace(1) %ptr) #0 {
2167+
; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal:
2168+
; SDAG: ; %bb.0:
2169+
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2170+
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2171+
; SDAG-NEXT: v_mov_b32_e32 v16, 0
2172+
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2173+
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2174+
; SDAG-NEXT: v_mov_b32_e32 v0, s8
2175+
; SDAG-NEXT: v_mov_b32_e32 v1, s9
2176+
; SDAG-NEXT: v_mov_b32_e32 v2, s10
2177+
; SDAG-NEXT: v_mov_b32_e32 v3, s11
2178+
; SDAG-NEXT: v_mov_b32_e32 v4, s12
2179+
; SDAG-NEXT: v_mov_b32_e32 v5, s13
2180+
; SDAG-NEXT: v_mov_b32_e32 v6, s14
2181+
; SDAG-NEXT: v_mov_b32_e32 v7, s15
2182+
; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
2183+
; SDAG-NEXT: v_mov_b32_e32 v8, s16
2184+
; SDAG-NEXT: v_mov_b32_e32 v9, s17
2185+
; SDAG-NEXT: v_mov_b32_e32 v10, s18
2186+
; SDAG-NEXT: v_mov_b32_e32 v11, s19
2187+
; SDAG-NEXT: v_mov_b32_e32 v12, s20
2188+
; SDAG-NEXT: v_mov_b32_e32 v13, s21
2189+
; SDAG-NEXT: v_mov_b32_e32 v14, s22
2190+
; SDAG-NEXT: v_mov_b32_e32 v15, s23
2191+
; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
2192+
; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
2193+
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
2194+
; SDAG-NEXT: s_nop 1
2195+
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2196+
; SDAG-NEXT: s_nop 7
2197+
; SDAG-NEXT: s_nop 3
2198+
; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
2199+
; SDAG-NEXT: s_endpgm
2200+
;
2201+
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal:
2202+
; GISEL: ; %bb.0:
2203+
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2204+
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2205+
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2206+
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
2207+
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
2208+
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
2209+
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
2210+
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
2211+
; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
2212+
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
2213+
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
2214+
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
2215+
; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
2216+
; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
2217+
; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
2218+
; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2219+
; GISEL-NEXT: s_nop 0
2220+
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2221+
; GISEL-NEXT: v_mov_b32_e32 v0, 0
2222+
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2223+
; GISEL-NEXT: s_nop 7
2224+
; GISEL-NEXT: s_nop 1
2225+
; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
2226+
; GISEL-NEXT: s_endpgm
2227+
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 1042479491)
2228+
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
2229+
ret void
2230+
}
2231+
20332232
; This should be optimized to avoid the scale
20342233
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
20352234
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a:

0 commit comments

Comments
 (0)