@@ -2024,6 +2024,205 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
20242024 ret void
20252025}
20262026
; Scaled MFMA where both scale operands are compile-time constants of
; different kinds. Per the test name, scale A is the integer 65 (0x41) and
; scale B is 1065353216 (0x3F800000, the bit pattern of float 1.0).
; In the expected output below, 0x41 is first moved into a register (scalar
; register s6 in the SelectionDAG checks, vector register v16 in the GlobalISel
; checks), while the second constant is printed directly as the `1.0` operand
; of the v_mfma_scale instruction.
2027+ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal (<8 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , ptr addrspace (1 ) %ptr ) #0 {
2028+ ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal:
2029+ ; SDAG: ; %bb.0:
2030+ ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2031+ ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2032+ ; SDAG-NEXT: s_movk_i32 s6, 0x41
2033+ ; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2034+ ; SDAG-NEXT: v_mov_b32_e32 v16, 0
2035+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2036+ ; SDAG-NEXT: v_mov_b32_e32 v0, s8
2037+ ; SDAG-NEXT: v_mov_b32_e32 v1, s9
2038+ ; SDAG-NEXT: v_mov_b32_e32 v2, s10
2039+ ; SDAG-NEXT: v_mov_b32_e32 v3, s11
2040+ ; SDAG-NEXT: v_mov_b32_e32 v4, s12
2041+ ; SDAG-NEXT: v_mov_b32_e32 v5, s13
2042+ ; SDAG-NEXT: v_mov_b32_e32 v6, s14
2043+ ; SDAG-NEXT: v_mov_b32_e32 v7, s15
2044+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
2045+ ; SDAG-NEXT: v_mov_b32_e32 v8, s16
2046+ ; SDAG-NEXT: v_mov_b32_e32 v9, s17
2047+ ; SDAG-NEXT: v_mov_b32_e32 v10, s18
2048+ ; SDAG-NEXT: v_mov_b32_e32 v11, s19
2049+ ; SDAG-NEXT: v_mov_b32_e32 v12, s20
2050+ ; SDAG-NEXT: v_mov_b32_e32 v13, s21
2051+ ; SDAG-NEXT: v_mov_b32_e32 v14, s22
2052+ ; SDAG-NEXT: v_mov_b32_e32 v15, s23
2053+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
2054+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
2055+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
2056+ ; SDAG-NEXT: s_nop 1
2057+ ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2058+ ; SDAG-NEXT: s_nop 7
2059+ ; SDAG-NEXT: s_nop 3
2060+ ; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
2061+ ; SDAG-NEXT: s_endpgm
2062+ ;
2063+ ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal:
2064+ ; GISEL: ; %bb.0:
2065+ ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2066+ ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2067+ ; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
2068+ ; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2069+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2070+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
2071+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
2072+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
2073+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
2074+ ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
2075+ ; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
2076+ ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
2077+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
2078+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
2079+ ; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
2080+ ; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
2081+ ; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
2082+ ; GISEL-NEXT: s_nop 1
2083+ ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2084+ ; GISEL-NEXT: v_mov_b32_e32 v0, 0
2085+ ; GISEL-NEXT: s_nop 7
2086+ ; GISEL-NEXT: s_nop 2
2087+ ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
2088+ ; GISEL-NEXT: s_endpgm
2089+ %result = call <4 x float > @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32 (<8 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 0 , i32 0 , i32 3 , i32 65 , i32 1 , i32 1065353216 )
2090+ store <4 x float > %result , ptr addrspace (1 ) %ptr , align 16
2091+ ret void
2092+ }
2093+
; Scaled MFMA where, per the test name, scale A is 1065353216 (0x3F800000,
; the bit pattern of float 1.0) and scale B is the small integer -2.
; In the expected output below both constants appear directly as operands of
; the v_mfma_scale instruction (`1.0, -2`); no separate register move is
; expected for either scale value.
2094+ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm (<8 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , ptr addrspace (1 ) %ptr ) #0 {
2095+ ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm:
2096+ ; SDAG: ; %bb.0:
2097+ ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2098+ ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2099+ ; SDAG-NEXT: v_mov_b32_e32 v16, 0
2100+ ; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2101+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2102+ ; SDAG-NEXT: v_mov_b32_e32 v0, s8
2103+ ; SDAG-NEXT: v_mov_b32_e32 v1, s9
2104+ ; SDAG-NEXT: v_mov_b32_e32 v2, s10
2105+ ; SDAG-NEXT: v_mov_b32_e32 v3, s11
2106+ ; SDAG-NEXT: v_mov_b32_e32 v4, s12
2107+ ; SDAG-NEXT: v_mov_b32_e32 v5, s13
2108+ ; SDAG-NEXT: v_mov_b32_e32 v6, s14
2109+ ; SDAG-NEXT: v_mov_b32_e32 v7, s15
2110+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
2111+ ; SDAG-NEXT: v_mov_b32_e32 v8, s16
2112+ ; SDAG-NEXT: v_mov_b32_e32 v9, s17
2113+ ; SDAG-NEXT: v_mov_b32_e32 v10, s18
2114+ ; SDAG-NEXT: v_mov_b32_e32 v11, s19
2115+ ; SDAG-NEXT: v_mov_b32_e32 v12, s20
2116+ ; SDAG-NEXT: v_mov_b32_e32 v13, s21
2117+ ; SDAG-NEXT: v_mov_b32_e32 v14, s22
2118+ ; SDAG-NEXT: v_mov_b32_e32 v15, s23
2119+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
2120+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
2121+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
2122+ ; SDAG-NEXT: s_nop 1
2123+ ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2124+ ; SDAG-NEXT: s_nop 7
2125+ ; SDAG-NEXT: s_nop 3
2126+ ; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
2127+ ; SDAG-NEXT: s_endpgm
2128+ ;
2129+ ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm:
2130+ ; GISEL: ; %bb.0:
2131+ ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2132+ ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2133+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2134+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
2135+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
2136+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
2137+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
2138+ ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
2139+ ; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
2140+ ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
2141+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
2142+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
2143+ ; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
2144+ ; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
2145+ ; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
2146+ ; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2147+ ; GISEL-NEXT: s_nop 0
2148+ ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2149+ ; GISEL-NEXT: v_mov_b32_e32 v0, 0
2150+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2151+ ; GISEL-NEXT: s_nop 7
2152+ ; GISEL-NEXT: s_nop 1
2153+ ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
2154+ ; GISEL-NEXT: s_endpgm
2155+ %result = call <4 x float > @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32 (<8 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 0 , i32 0 , i32 3 , i32 1065353216 , i32 1 , i32 -2 )
2156+ store <4 x float > %result , ptr addrspace (1 ) %ptr , align 16
2157+ ret void
2158+ }
2159+
; Scaled MFMA where both scale operands are integer constants that hold
; floating-point bit patterns: per the test name, scale A is 1065353216
; (0x3F800000, float 1.0) and scale B is 1042479491 (0x3E22F983).
; In the expected output below they are printed directly as the operands
; `1.0, 0.15915494` of the v_mfma_scale instruction; no separate register
; move is expected for either scale value.
2160+ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal (<8 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , ptr addrspace (1 ) %ptr ) #0 {
2161+ ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal:
2162+ ; SDAG: ; %bb.0:
2163+ ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2164+ ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2165+ ; SDAG-NEXT: v_mov_b32_e32 v16, 0
2166+ ; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2167+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2168+ ; SDAG-NEXT: v_mov_b32_e32 v0, s8
2169+ ; SDAG-NEXT: v_mov_b32_e32 v1, s9
2170+ ; SDAG-NEXT: v_mov_b32_e32 v2, s10
2171+ ; SDAG-NEXT: v_mov_b32_e32 v3, s11
2172+ ; SDAG-NEXT: v_mov_b32_e32 v4, s12
2173+ ; SDAG-NEXT: v_mov_b32_e32 v5, s13
2174+ ; SDAG-NEXT: v_mov_b32_e32 v6, s14
2175+ ; SDAG-NEXT: v_mov_b32_e32 v7, s15
2176+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
2177+ ; SDAG-NEXT: v_mov_b32_e32 v8, s16
2178+ ; SDAG-NEXT: v_mov_b32_e32 v9, s17
2179+ ; SDAG-NEXT: v_mov_b32_e32 v10, s18
2180+ ; SDAG-NEXT: v_mov_b32_e32 v11, s19
2181+ ; SDAG-NEXT: v_mov_b32_e32 v12, s20
2182+ ; SDAG-NEXT: v_mov_b32_e32 v13, s21
2183+ ; SDAG-NEXT: v_mov_b32_e32 v14, s22
2184+ ; SDAG-NEXT: v_mov_b32_e32 v15, s23
2185+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
2186+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
2187+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
2188+ ; SDAG-NEXT: s_nop 1
2189+ ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2190+ ; SDAG-NEXT: s_nop 7
2191+ ; SDAG-NEXT: s_nop 3
2192+ ; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
2193+ ; SDAG-NEXT: s_endpgm
2194+ ;
2195+ ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal:
2196+ ; GISEL: ; %bb.0:
2197+ ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2198+ ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2199+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2200+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
2201+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
2202+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
2203+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
2204+ ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
2205+ ; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
2206+ ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
2207+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
2208+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
2209+ ; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
2210+ ; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
2211+ ; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
2212+ ; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2213+ ; GISEL-NEXT: s_nop 0
2214+ ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2215+ ; GISEL-NEXT: v_mov_b32_e32 v0, 0
2216+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2217+ ; GISEL-NEXT: s_nop 7
2218+ ; GISEL-NEXT: s_nop 1
2219+ ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
2220+ ; GISEL-NEXT: s_endpgm
2221+ %result = call <4 x float > @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32 (<8 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 0 , i32 0 , i32 3 , i32 1065353216 , i32 1 , i32 1042479491 )
2222+ store <4 x float > %result , ptr addrspace (1 ) %ptr , align 16
2223+ ret void
2224+ }
2225+
20272226; This should be optimized to avoid the scale
20282227define <4 x float > @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a (<8 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %scale0 , i32 %scale1 ) {
20292228; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a:
0 commit comments