@@ -3121,6 +3121,159 @@ bb:
31213121 ret void
31223122}
31233123
; MFMA i32 16x16x4i8 whose accumulator input (src2) is the constant vector
; splat (i32 64).
;
; What the assertions below pin down:
;  * The LIT-SRCC, GFX90A and GFX942 runs pass 64 directly as the src2 operand
;    of the MFMA instruction.
;  * The NOLIT-SRCC run first writes 64 into a0-a15 and passes a[0:15] as src2.
;  * The GFX942-VGPR run first fills v0-v15 with 64 and passes v[0:15] as src2.
;
; NOTE(review): 64 is presumably chosen as the upper end of the integer
; inline-constant range; confirm against the AMDGPU ISA documentation.
;
; Nothing is read from %arg: src2 is a compile-time constant, so the kernel
; only stores the 16 x i32 result to %arg.
define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
; NOLIT-SRCC: ; %bb.0: ; %bb
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 64
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 64
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 64
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 64
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, 64
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, 64
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, 64
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, 64
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, 64
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, 64
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, 64
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, 64
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, 64
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, 64
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, 64
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, 64
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v16, 0
; NOLIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a12
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a7
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a6
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a5
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a4
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a11
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a10
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a9
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a8
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; NOLIT-SRCC-NEXT: s_endpgm
;
; LIT-SRCC-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
; LIT-SRCC: ; %bb.0: ; %bb
; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0
; LIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a12
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a7
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a6
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a5
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a4
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a11
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a10
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a9
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a8
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; LIT-SRCC-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: v_mov_b32_e32 v0, 1
; GFX90A-NEXT: v_mov_b32_e32 v1, 2
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: v_mov_b32_e32 v0, 1
; GFX942-NEXT: v_mov_b32_e32 v1, 2
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_mfma_i32_16x16x4_4b_i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
; GFX942-VGPR: ; %bb.0: ; %bb
; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 1
; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 64
; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, 2
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v17, v18, v[0:15] cbsz:1 abid:2 blgp:3
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; GFX942-VGPR-NEXT: s_endpgm
bb:
  ; Operands: src0 = 1, src1 = 2, src2 = splat 64; the trailing 1, 2, 3 show up
  ; as cbsz:1 abid:2 blgp:3 in the expected assembly.
  %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 1, i32 2, <16 x i32> splat (i32 64), i32 1, i32 2, i32 3)
  store <16 x i32> %mai.1, ptr addrspace(1) %arg
  ret void
}
3276+
31243277define amdgpu_kernel void @test_mfma_i32_4x4x4i8 (ptr addrspace (1 ) %arg ) #0 {
31253278; NOLIT-SRCC-LABEL: test_mfma_i32_4x4x4i8:
31263279; NOLIT-SRCC: ; %bb.0: ; %bb
@@ -3239,6 +3392,200 @@ bb:
32393392 ret void
32403393}
32413394
; MFMA i32 4x4x4i8 whose accumulator input (src2) is the constant vector
; splat (i32 1).
;
; What the assertions below pin down:
;  * The LIT-SRCC, GFX90A and GFX942 runs pass 1 directly as the src2 operand
;    of the MFMA instruction.
;  * The NOLIT-SRCC run first writes 1 into a0-a3 and passes a[0:3] as src2.
;  * The GFX942-VGPR run copies v0 (which holds 1) into v1-v3 and passes
;    v[0:3] as src2; v0 is also used as src0, since both values are 1.
;
; Nothing is read from %arg: src2 is a compile-time constant, so the kernel
; only stores the 4 x i32 result to %arg.
define amdgpu_kernel void @test_mfma_i32_4x4x4i8_splat_imm_src2_1(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
; NOLIT-SRCC: ; %bb.0: ; %bb
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 1
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 1
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 1
; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
; NOLIT-SRCC-NEXT: s_nop 0
; NOLIT-SRCC-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 3
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; NOLIT-SRCC-NEXT: s_nop 0
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; NOLIT-SRCC-NEXT: s_endpgm
;
; LIT-SRCC-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
; LIT-SRCC: ; %bb.0: ; %bb
; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1
; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
; LIT-SRCC-NEXT: s_nop 0
; LIT-SRCC-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, 1 cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 3
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; LIT-SRCC-NEXT: s_nop 0
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; LIT-SRCC-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: v_mov_b32_e32 v0, 1
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v2, 2
; GFX90A-NEXT: v_mov_b32_e32 v1, 0
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v2, 1 cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 3
; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: v_mov_b32_e32 v0, 1
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-NEXT: v_mov_b32_e32 v2, 2
; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_mfma_i32_4x4x4_16b_i8 a[0:3], v0, v2, 1 cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 3
; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
; GFX942-VGPR: ; %bb.0: ; %bb
; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2
; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
; GFX942-VGPR-NEXT: s_nop 0
; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v0, v5, v[0:3] cbsz:1 abid:2 blgp:3
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_nop 3
; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-VGPR-NEXT: s_endpgm
bb:
  ; Operands: src0 = 1, src1 = 2, src2 = splat 1; the trailing 1, 2, 3 show up
  ; as cbsz:1 abid:2 blgp:3 in the expected assembly.
  %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> splat (i32 1), i32 1, i32 2, i32 3)
  store <4 x i32> %mai.1, ptr addrspace(1) %arg
  ret void
}
3483+
; MFMA i32 4x4x4i8 whose accumulator input (src2) is the constant vector
; splat (i32 65).
;
; What the assertions below pin down: in every run configuration the value
; 0x41 (65) is first moved into a VGPR and then spread over the four src2
; registers (a0-a3, or v0-v3 for GFX942-VGPR). The MFMA instruction always
; takes a register tuple as src2 and never the constant itself.
;
; NOTE(review): 65 is presumably chosen as a value just outside the integer
; inline-constant range; confirm against the AMDGPU ISA documentation.
; NOTE(review): the `_1` suffix in the function name does not match the splat
; value 65. The name is left unchanged because the LABEL assertions depend on
; it.
;
; This function has no explicit entry-block label, which is why the expected
; assembly shows a bare `%bb.0` block comment.
;
; Nothing is read from %arg: src2 is a compile-time constant, so the kernel
; only stores the 4 x i32 result to %arg.
define amdgpu_kernel void @test_mfma_i32_4x4x4i8_splat_k_src2_1(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
; NOLIT-SRCC: ; %bb.0:
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 0x41
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 1
; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v0
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 2
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
; NOLIT-SRCC-NEXT: s_nop 0
; NOLIT-SRCC-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v1, v0, a[0:3] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 3
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; NOLIT-SRCC-NEXT: s_nop 0
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; NOLIT-SRCC-NEXT: s_endpgm
;
; LIT-SRCC-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
; LIT-SRCC: ; %bb.0:
; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 0x41
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 1
; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v0
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v0
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v0
; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 2
; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
; LIT-SRCC-NEXT: s_nop 0
; LIT-SRCC-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v1, v0, a[0:3] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 3
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; LIT-SRCC-NEXT: s_nop 0
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; LIT-SRCC-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: v_mov_b32_e32 v1, 0x41
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
; GFX90A-NEXT: v_mov_b32_e32 v1, 1
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0
; GFX90A-NEXT: v_mov_b32_e32 v2, 2
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v1, v2, a[0:3] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 3
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
; GFX942: ; %bb.0:
; GFX942-NEXT: v_mov_b32_e32 v1, 0x41
; GFX942-NEXT: v_accvgpr_write_b32 a0, v1
; GFX942-NEXT: v_mov_b32_e32 v1, 1
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0
; GFX942-NEXT: v_mov_b32_e32 v2, 2
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_mfma_i32_4x4x4_16b_i8 a[0:3], v1, v2, a[0:3] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 3
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
; GFX942-VGPR: ; %bb.0:
; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 1
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x41
; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2
; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
; GFX942-VGPR-NEXT: s_nop 0
; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v5, v6, v[0:3] cbsz:1 abid:2 blgp:3
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_nop 3
; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-VGPR-NEXT: s_endpgm
  ; Operands: src0 = 1, src1 = 2, src2 = splat 65; the trailing 1, 2, 3 show up
  ; as cbsz:1 abid:2 blgp:3 in the expected assembly.
  %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> splat (i32 65), i32 1, i32 2, i32 3)
  store <4 x i32> %mai.1, ptr addrspace(1) %arg
  ret void
}
3588+
32423589define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc (ptr addrspace (1 ) %arg ) #0 {
32433590; NOLIT-SRCC-LABEL: test_mfma_f32_32x32x1f32_forward_acc:
32443591; NOLIT-SRCC: ; %bb.0: ; %bb
0 commit comments