@@ -20,7 +20,7 @@ func.func @matmul_32_32_32(%arg0: !TA, %arg1: !TB, %arg2: !TC, %arg3: !DTC) {
   // GENERALIZED: linalg.generic
   // SPECIALIZED: linalg.matmul
   // CHECK: {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>,
-  // CHECK-SAME: promote_operands = [0, 1], promotion_types = [#iree_gpu.use_global_load_dma, #iree_gpu.use_global_load_dma], reduction = [0, 0, 8], subgroup = [1, 1, 0],
+  // CHECK-SAME: promote_operands = [0, 1], reduction = [0, 0, 8], subgroup = [1, 1, 0],
   // CHECK-SAME: workgroup = [32, 32, 0]}>}
   %0 = linalg.matmul ins(%arg0, %arg1 : !TA, !TB) outs(%arg2 : !TC) -> !TC
   iree_tensor_ext.dispatch.tensor.store %0, %arg3, offsets = [0, 0], sizes = [32, 32], strides = [1, 1] : !TC -> !DTC
@@ -37,7 +37,7 @@ func.func @matmul_32_32_32(%arg0: !TA, %arg1: !TB, %arg2: !TC, %arg3: !DTC) {
 // CHECK-SAME: workgroup_size = [256, 1, 1] subgroup_size = 64, {
 func.func @matmul_4096_4096_4096(%arg0: !TA, %arg1: !TB, %arg2: !TC, %arg3: !DTC) {
   // CHECK: {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>,
-  // CHECK-SAME: promote_operands = [0, 1], promotion_types = [#iree_gpu.use_global_load_dma, #iree_gpu.use_global_load_dma], reduction = [0, 0, 4], subgroup = [4, 4, 0], workgroup = [128, 128, 0]}>
+  // CHECK-SAME: promote_operands = [0, 1], reduction = [0, 0, 4], subgroup = [4, 4, 0], workgroup = [128, 128, 0]}>
   %0 = linalg.matmul ins(%arg0, %arg1 : !TA, !TB) outs(%arg2 : !TC) -> !TC
   iree_tensor_ext.dispatch.tensor.store %0, %arg3, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !TC -> !DTC
   return
@@ -53,7 +53,7 @@ func.func @matmul_4096_4096_4096(%arg0: !TA, %arg1: !TB, %arg2: !TC, %arg3: !DTC
 // CHECK-SAME: workgroup_size = [256, 1, 1] subgroup_size = 64, {
 func.func @matmul_4096_32_4096(%arg0: !TA, %arg1: !TB, %arg2: !TC, %arg3: !DTC) {
   // CHECK: #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>,
-  // CHECK-SAME: promote_operands = [0, 1], promotion_types = [#iree_gpu.use_global_load_dma, #iree_gpu.use_global_load_dma], reduction = [0, 0, 8], subgroup = [2, 4, 0],
+  // CHECK-SAME: promote_operands = [0, 1], reduction = [0, 0, 8], subgroup = [2, 4, 0],
   // CHECK-SAME: workgroup = [64, 128, 0]}>}
   %0 = linalg.matmul ins(%arg0, %arg1 : !TA, !TB) outs(%arg2 : !TC) -> !TC
   iree_tensor_ext.dispatch.tensor.store %0, %arg3, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !TC -> !DTC
@@ -71,7 +71,7 @@ func.func @matmul_4096_32_4096(%arg0: !TA, %arg1: !TB, %arg2: !TC, %arg3: !DTC)
 // CHECK-SAME: {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_num_stages = 2, no_reduce_shared_memory_bank_conflicts = true, use_igemm_convolution = false>}>
 func.func @matmul_4096_1_4096(%arg0: !TA, %arg1: !TB, %arg2: !TC, %arg3: !DTC) {
   // CHECK: #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>,
-  // CHECK-SAME: padding = [32, 32, 4], promote_operands = [0, 1], promotion_types = [#iree_gpu.use_global_load_dma, #iree_gpu.use_global_load_dma], reduction = [0, 0, 1], subgroup = [1, 2, 0], workgroup = [32, 32, 0]}
+  // CHECK-SAME: padding = [32, 32, 4], promote_operands = [0, 1], reduction = [0, 0, 1], subgroup = [1, 2, 0], workgroup = [32, 32, 0]}
   %0 = linalg.matmul ins(%arg0, %arg1 : !TA, !TB) outs(%arg2 : !TC) -> !TC
   iree_tensor_ext.dispatch.tensor.store %0, %arg3, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !TC -> !DTC
   return
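
For reference, stitching the CHECK and CHECK-SAME pieces of this last hunk back together gives roughly the attribute the test now expects on the matmul. This is a readability sketch only; FileCheck matches the printed form piecewise, and the pass emits it on one line:

  // Expected lowering_config on @matmul_4096_1_4096 after this change
  // (promotion_types entries no longer present).
  %0 = linalg.matmul {
         lowering_config = #iree_gpu.lowering_config<{
           mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>,
           padding = [32, 32, 4], promote_operands = [0, 1],
           reduction = [0, 0, 1], subgroup = [1, 2, 0], workgroup = [32, 32, 0]}>}
       ins(%arg0, %arg1 : !TA, !TB) outs(%arg2 : !TC) -> !TC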