@@ -2117,3 +2117,33 @@ func.func @decode_reduction_f32(%arg0: tensor<32x262144xf16>, %arg1: tensor<32xf
 // CHECK-SAME: lowering_config = #[[CONFIG0]]
 // CHECK: linalg.generic
 // CHECK-SAME: lowering_config = #[[CONFIG1]]
+
+// -----
+
+#executable_target_embedded_elf_x86_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
+func.func @attention_reshape_pack(%arg0: index, %arg1: tensor<4x2x?x32xf16>, %arg2: tensor<?x4x32xf16>, %arg3: tensor<?x4x32xf16>, %arg4: tensor<4x2x?x?xf16>) -> tensor<?x256x1x1xf16> attributes {hal.executable.target = #executable_target_embedded_elf_x86_64} {
+  %cst = arith.constant 0.000000e+00 : f16
+  %cst_0 = arith.constant 1.767580e-01 : f16
+  %0 = tensor.empty(%arg0) : tensor<?x4x2x32xf16>
+  %1 = tensor.empty(%arg0) : tensor<4x2x?x32xf16>
+  %2 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d5, d0, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d5, d0, d3)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>]} ins(%arg1, %arg2, %arg3, %cst_0, %arg4 : tensor<4x2x?x32xf16>, tensor<?x4x32xf16>, tensor<?x4x32xf16>, f16, tensor<4x2x?x?xf16>) outs(%1 : tensor<4x2x?x32xf16>) {
+  ^bb0(%arg5: f32):
+    iree_linalg_ext.yield %arg5 : f32
+  } -> tensor<4x2x?x32xf16>
+  %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d0, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2 : tensor<4x2x?x32xf16>) outs(%0 : tensor<?x4x2x32xf16>) {
+  ^bb0(%in: f16, %out: f16):
+    linalg.yield %in : f16
+  } -> tensor<?x4x2x32xf16>
+  %collapsed = tensor.collapse_shape %3 [[0], [1, 2, 3]] : tensor<?x4x2x32xf16> into tensor<?x256xf16>
+  %4 = tensor.empty(%arg0) : tensor<?x256x1x1xf16>
+  %pack = linalg.pack %collapsed padding_value(%cst : f16) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [1, 1] into %4 : tensor<?x256xf16> -> tensor<?x256x1x1xf16>
+  return %pack : tensor<?x256x1x1xf16>
+}
+// CHECK-DAG: #[[CONFIG0:.+]] = #iree_cpu.lowering_config<distribution = [1, 1, 64, 16, 0, 0], vector_common_parallel = [1, 1, 4, 16, 0, 0], vector_reduction = [0, 0, 0, 0, 0, 32]>
+// CHECK-DAG: #[[CONFIG1:.+]] = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 4, 16]>
+// CHECK-NOT: #iree_cpu.lowering_config
+// CHECK: func.func @attention_reshape_pack
+// CHECK: iree_linalg_ext.attention
+// CHECK-SAME: lowering_config = #[[CONFIG0]]
+// CHECK: linalg.generic
+// CHECK-SAME: lowering_config = #[[CONFIG1]]