@@ -2030,6 +2030,205 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
20302030 ret void
20312031}
20322032
; Kernel test: calls llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4 on the three
; vector kernel arguments with all-constant trailing operands and stores the
; <4 x float> result to %ptr.
; The two constant scale values passed here are 65 and 1065353216
; (0x3F800000, the IEEE-754 single-precision bit pattern of 1.0).
; Expected codegen, per the checks below:
;   * 65 (0x41) is first materialized into a register (s6 via s_movk_i32 for
;     SDAG, v16 via v_mov_b32 for GISEL) and that register is used as the
;     first scale operand of the MFMA;
;   * the second scale appears directly in the instruction as `1.0`.
; NOTE(review): despite "FP_literal" in the name, 1.0 is printed directly as an
; operand; presumably it is encoded as an inline constant rather than a 32-bit
; literal -- confirm against the ISA documentation.
; The SDAG/GISEL check lines look autogenerated; regenerate them with the
; update script instead of editing by hand.
2033+ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal (<8 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , ptr addrspace (1 ) %ptr ) #0 {
2034+ ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal:
2035+ ; SDAG: ; %bb.0:
2036+ ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2037+ ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2038+ ; SDAG-NEXT: s_movk_i32 s6, 0x41
2039+ ; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2040+ ; SDAG-NEXT: v_mov_b32_e32 v16, 0
2041+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2042+ ; SDAG-NEXT: v_mov_b32_e32 v0, s8
2043+ ; SDAG-NEXT: v_mov_b32_e32 v1, s9
2044+ ; SDAG-NEXT: v_mov_b32_e32 v2, s10
2045+ ; SDAG-NEXT: v_mov_b32_e32 v3, s11
2046+ ; SDAG-NEXT: v_mov_b32_e32 v4, s12
2047+ ; SDAG-NEXT: v_mov_b32_e32 v5, s13
2048+ ; SDAG-NEXT: v_mov_b32_e32 v6, s14
2049+ ; SDAG-NEXT: v_mov_b32_e32 v7, s15
2050+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
2051+ ; SDAG-NEXT: v_mov_b32_e32 v8, s16
2052+ ; SDAG-NEXT: v_mov_b32_e32 v9, s17
2053+ ; SDAG-NEXT: v_mov_b32_e32 v10, s18
2054+ ; SDAG-NEXT: v_mov_b32_e32 v11, s19
2055+ ; SDAG-NEXT: v_mov_b32_e32 v12, s20
2056+ ; SDAG-NEXT: v_mov_b32_e32 v13, s21
2057+ ; SDAG-NEXT: v_mov_b32_e32 v14, s22
2058+ ; SDAG-NEXT: v_mov_b32_e32 v15, s23
2059+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
2060+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
2061+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
2062+ ; SDAG-NEXT: s_nop 1
2063+ ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2064+ ; SDAG-NEXT: s_nop 7
2065+ ; SDAG-NEXT: s_nop 3
2066+ ; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
2067+ ; SDAG-NEXT: s_endpgm
2068+ ;
2069+ ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal:
2070+ ; GISEL: ; %bb.0:
2071+ ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2072+ ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2073+ ; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
2074+ ; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2075+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2076+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
2077+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
2078+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
2079+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
2080+ ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
2081+ ; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
2082+ ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
2083+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
2084+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
2085+ ; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
2086+ ; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
2087+ ; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
2088+ ; GISEL-NEXT: s_nop 1
2089+ ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2090+ ; GISEL-NEXT: v_mov_b32_e32 v0, 0
2091+ ; GISEL-NEXT: s_nop 7
2092+ ; GISEL-NEXT: s_nop 2
2093+ ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
2094+ ; GISEL-NEXT: s_endpgm
2095+ %result = call <4 x float > @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32 (<8 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 0 , i32 0 , i32 3 , i32 65 , i32 1 , i32 1065353216 )
2096+ store <4 x float > %result , ptr addrspace (1 ) %ptr , align 16
2097+ ret void
2098+ }
2099+
; Kernel test: calls llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4 on the three
; vector kernel arguments with all-constant trailing operands and stores the
; <4 x float> result to %ptr.
; The two constant scale values passed here are 1065353216 (0x3F800000, the
; IEEE-754 single-precision bit pattern of 1.0) and -2.
; Expected codegen, per the checks below: both scales appear directly in the
; MFMA instruction as `1.0, -2`; no extra register is set up to hold a scale
; value in either the SDAG or the GISEL output.
; NOTE(review): despite "FP_literal" in the name, 1.0 is printed directly as an
; operand; presumably it is encoded as an inline constant rather than a 32-bit
; literal -- confirm against the ISA documentation.
; The SDAG/GISEL check lines look autogenerated; regenerate them with the
; update script instead of editing by hand.
2100+ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm (<8 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , ptr addrspace (1 ) %ptr ) #0 {
2101+ ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm:
2102+ ; SDAG: ; %bb.0:
2103+ ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2104+ ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2105+ ; SDAG-NEXT: v_mov_b32_e32 v16, 0
2106+ ; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2107+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2108+ ; SDAG-NEXT: v_mov_b32_e32 v0, s8
2109+ ; SDAG-NEXT: v_mov_b32_e32 v1, s9
2110+ ; SDAG-NEXT: v_mov_b32_e32 v2, s10
2111+ ; SDAG-NEXT: v_mov_b32_e32 v3, s11
2112+ ; SDAG-NEXT: v_mov_b32_e32 v4, s12
2113+ ; SDAG-NEXT: v_mov_b32_e32 v5, s13
2114+ ; SDAG-NEXT: v_mov_b32_e32 v6, s14
2115+ ; SDAG-NEXT: v_mov_b32_e32 v7, s15
2116+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
2117+ ; SDAG-NEXT: v_mov_b32_e32 v8, s16
2118+ ; SDAG-NEXT: v_mov_b32_e32 v9, s17
2119+ ; SDAG-NEXT: v_mov_b32_e32 v10, s18
2120+ ; SDAG-NEXT: v_mov_b32_e32 v11, s19
2121+ ; SDAG-NEXT: v_mov_b32_e32 v12, s20
2122+ ; SDAG-NEXT: v_mov_b32_e32 v13, s21
2123+ ; SDAG-NEXT: v_mov_b32_e32 v14, s22
2124+ ; SDAG-NEXT: v_mov_b32_e32 v15, s23
2125+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
2126+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
2127+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
2128+ ; SDAG-NEXT: s_nop 1
2129+ ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2130+ ; SDAG-NEXT: s_nop 7
2131+ ; SDAG-NEXT: s_nop 3
2132+ ; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
2133+ ; SDAG-NEXT: s_endpgm
2134+ ;
2135+ ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm:
2136+ ; GISEL: ; %bb.0:
2137+ ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2138+ ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2139+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2140+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
2141+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
2142+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
2143+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
2144+ ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
2145+ ; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
2146+ ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
2147+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
2148+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
2149+ ; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
2150+ ; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
2151+ ; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
2152+ ; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2153+ ; GISEL-NEXT: s_nop 0
2154+ ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2155+ ; GISEL-NEXT: v_mov_b32_e32 v0, 0
2156+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2157+ ; GISEL-NEXT: s_nop 7
2158+ ; GISEL-NEXT: s_nop 1
2159+ ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
2160+ ; GISEL-NEXT: s_endpgm
2161+ %result = call <4 x float > @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32 (<8 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 0 , i32 0 , i32 3 , i32 1065353216 , i32 1 , i32 -2 )
2162+ store <4 x float > %result , ptr addrspace (1 ) %ptr , align 16
2163+ ret void
2164+ }
2165+
; Kernel test: calls llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4 on the three
; vector kernel arguments with all-constant trailing operands and stores the
; <4 x float> result to %ptr.
; The two constant scale values passed here are 1065353216 (0x3F800000, the
; IEEE-754 single-precision bit pattern of 1.0) and 1042479491 (0x3E22F983,
; which the expected assembly prints as 0.15915494).
; Expected codegen, per the checks below: both scales appear directly in the
; MFMA instruction as `1.0, 0.15915494`; no extra register is set up to hold a
; scale value in either the SDAG or the GISEL output.
; NOTE(review): 0.15915494 is approximately 1/(2*pi); despite "FP_literal" in
; the name, both values are printed directly as operands, so presumably they
; are encoded as inline constants rather than 32-bit literals -- confirm
; against the ISA documentation.
; The SDAG/GISEL check lines look autogenerated; regenerate them with the
; update script instead of editing by hand.
2166+ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal (<8 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , ptr addrspace (1 ) %ptr ) #0 {
2167+ ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal:
2168+ ; SDAG: ; %bb.0:
2169+ ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2170+ ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2171+ ; SDAG-NEXT: v_mov_b32_e32 v16, 0
2172+ ; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2173+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2174+ ; SDAG-NEXT: v_mov_b32_e32 v0, s8
2175+ ; SDAG-NEXT: v_mov_b32_e32 v1, s9
2176+ ; SDAG-NEXT: v_mov_b32_e32 v2, s10
2177+ ; SDAG-NEXT: v_mov_b32_e32 v3, s11
2178+ ; SDAG-NEXT: v_mov_b32_e32 v4, s12
2179+ ; SDAG-NEXT: v_mov_b32_e32 v5, s13
2180+ ; SDAG-NEXT: v_mov_b32_e32 v6, s14
2181+ ; SDAG-NEXT: v_mov_b32_e32 v7, s15
2182+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
2183+ ; SDAG-NEXT: v_mov_b32_e32 v8, s16
2184+ ; SDAG-NEXT: v_mov_b32_e32 v9, s17
2185+ ; SDAG-NEXT: v_mov_b32_e32 v10, s18
2186+ ; SDAG-NEXT: v_mov_b32_e32 v11, s19
2187+ ; SDAG-NEXT: v_mov_b32_e32 v12, s20
2188+ ; SDAG-NEXT: v_mov_b32_e32 v13, s21
2189+ ; SDAG-NEXT: v_mov_b32_e32 v14, s22
2190+ ; SDAG-NEXT: v_mov_b32_e32 v15, s23
2191+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
2192+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
2193+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
2194+ ; SDAG-NEXT: s_nop 1
2195+ ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2196+ ; SDAG-NEXT: s_nop 7
2197+ ; SDAG-NEXT: s_nop 3
2198+ ; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
2199+ ; SDAG-NEXT: s_endpgm
2200+ ;
2201+ ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal:
2202+ ; GISEL: ; %bb.0:
2203+ ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2204+ ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2205+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2206+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
2207+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
2208+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
2209+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
2210+ ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
2211+ ; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
2212+ ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
2213+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
2214+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
2215+ ; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
2216+ ; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
2217+ ; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
2218+ ; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2219+ ; GISEL-NEXT: s_nop 0
2220+ ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2221+ ; GISEL-NEXT: v_mov_b32_e32 v0, 0
2222+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2223+ ; GISEL-NEXT: s_nop 7
2224+ ; GISEL-NEXT: s_nop 1
2225+ ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
2226+ ; GISEL-NEXT: s_endpgm
2227+ %result = call <4 x float > @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32 (<8 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 0 , i32 0 , i32 3 , i32 1065353216 , i32 1 , i32 1042479491 )
2228+ store <4 x float > %result , ptr addrspace (1 ) %ptr , align 16
2229+ ret void
2230+ }
2231+
20332232; This should be optimized to avoid the scale
20342233define <4 x float > @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a (<8 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %scale0 , i32 %scale1 ) {
20352234; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a:
0 commit comments