@@ -330,6 +330,57 @@ func.func @unaligned_matmul_with_two_reduce_dim(%arg0: tensor<196x9x4xf32>, %arg
330330
331331// -----
332332
// Matmul-like generic with two reduction dims (d1, d3) and two parallel dims
// (d0, d2), where the outer reduction dim (d1) is dynamic. The static dims
// (192, 16) are aligned to the MFMA_F32_16x16x4_F32 intrinsic, so the
// TileAndFuse pipeline is expected to select an MMA-based lowering config
// (see the mma_kind / subgroup CHECK lines below).
module {
  func.func @aligned_dynamic_matmul_with_two_reduce_dim(%arg0: tensor<192x?x16xf32>, %arg1: tensor<?x16x16xf32>) -> tensor<192x16xf32> {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = tensor.empty() : tensor<192x16xf32>
    %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<192x16xf32>) -> tensor<192x16xf32>
    %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<192x?x16xf32>, tensor<?x16x16xf32>) outs(%1 : tensor<192x16xf32>) {
    ^bb0(%in: f32, %in_0: f32, %out: f32):
      %3 = arith.mulf %in, %in_0 : f32
      %4 = arith.addf %out, %3 : f32
      linalg.yield %4 : f32
    } -> tensor<192x16xf32>
    return %2 : tensor<192x16xf32>
  }
}

// CHECK-LABEL: func.func @aligned_dynamic_matmul_with_two_reduce_dim
// CHECK-SAME:    {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [128, 1, 1] subgroup_size = 64
//      CHECK:  linalg.generic
// CHECK-SAME:    {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>
// CHECK-SAME:     promote_operands = [0, 1]
// CHECK-SAME:     reduction = [0, 1, 0, 4],
// CHECK-SAME:     subgroup = [2, 0, 1, 0],
// CHECK-SAME:     workgroup = [64, 0, 16, 0]}
356+
357+ // -----
358+
// Same two-reduction-dim structure as the aligned case above, but with a
// static M of 196 and inner K of 4, which do not align to an MMA intrinsic
// shape. The expected config therefore has no mma_kind and falls back to
// plain thread-level tiling (thread = [1, 0, 1, 0]) with a smaller
// workgroup_size of [64, 1, 1].
module {
  func.func @unaligned_dynamic_matmul_with_two_reduce_dim(%arg0: tensor<196x?x4xf32>, %arg1: tensor<?x16x4xf32>) -> tensor<196x16xf32> {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = tensor.empty() : tensor<196x16xf32>
    %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<196x16xf32>) -> tensor<196x16xf32>
    %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<196x?x4xf32>, tensor<?x16x4xf32>) outs(%1 : tensor<196x16xf32>) {
    ^bb0(%in: f32, %in_0: f32, %out: f32):
      %3 = arith.mulf %in, %in_0 : f32
      %4 = arith.addf %out, %3 : f32
      linalg.yield %4 : f32
    } -> tensor<196x16xf32>
    return %2 : tensor<196x16xf32>
  }
}

// CHECK-LABEL: func.func @unaligned_dynamic_matmul_with_two_reduce_dim
// CHECK-SAME:    {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64
//      CHECK:  linalg.generic
// CHECK-SAME:    promote_operands = [0, 1]
// CHECK-SAME:    reduction = [0, 4, 0, 4],
// CHECK-SAME:    thread = [1, 0, 1, 0],
// CHECK-SAME:    workgroup = [4, 0, 16, 0]}
381+
382+ // -----
383+
333384module {
334385func.func @unaligned_to_intrinsic_batched_matmul_tiling_check (%lhs : tensor <12 x577 x577 xf32 >, %rhs : tensor <12 x577 x1024 xf32 >) -> tensor <12 x577 x1024 xf32 > {
335386 %c0 = arith.constant 0.0 : f32
0 commit comments