@@ -703,16 +703,17 @@ module attributes {transform.with_named_sequence} {
703703// CHECK-DAG: #[[$MAPJ:.*]] = affine_map<()[s0, s1] -> ((((s0 + s1 * 73) mod 32) floordiv 2) * 32)>
704704
705705// CHECK-LABEL: func.func @simple_fill(
706- func.func @simple_fill (%arg0: memref <128 x f32 >) -> memref <128 x f32 > {
706+ func.func @simple_fill (%arg0: memref <128 x 256 x f32 >) -> memref <128 x 256 x f32 > {
707707 %c0 = arith.constant 0 : index
708- %cst = arith.constant dense <0.000000e+00 > : vector <32 x f32 >
708+ %cst = arith.constant dense <0.000000e+00 > : vector <16 x 32 x f32 >
709709 // CHECK: %[[C6:.*]] = arith.constant 6 : index
710710 // CHECK: gpu.launch
711711 scf.forall (%arg1 ) in (1 ) {
712712 // CHECK: %[[BIDX:.*]] = gpu.block_id x
713713 // CHECK: %[[BLX:.*]] = affine.apply #[[$MAPB]]()[%[[BIDX]]]
714714 %0 = affine.apply #map (%arg1 )
715- %subview = memref.subview %arg0 [%0 ] [128 ] [1 ] : memref <128 xf32 > to memref <128 xf32 , strided <[1 ], offset : ?>>
715+ %subview = memref.subview %arg0 [%0 , 0 ] [128 , 256 ] [1 , 1 ]
716+ : memref <128 x256 xf32 > to memref <128 x256 xf32 , strided <[256 , 1 ], offset : ?>>
716717
717718 // %arg2 and %arg3 map to lanes [0, 6) and are turned into expressions
718719 // involving threadIdx.x/y by the map_nested_forall_to_threads
@@ -725,19 +726,19 @@ func.func @simple_fill(%arg0: memref<128xf32>) -> memref<128xf32> {
725726 // CHECK: scf.if %[[COND]]
726727 // CHECK: %[[I:.*]] = affine.apply #[[$MAPI]]()[%[[TIDX]], %[[TIDY]]]
727728 // CHECK: %[[J:.*]] = affine.apply #[[$MAPJ]]()[%[[TIDX]], %[[TIDY]]]
728- // CHECK: memref.subview %{{.*}}[%[[I]]] [ %[[J]]]
729+ // CHECK: memref.subview %{{.*}}[%[[I]], %[[J]]]
729730 %1 = affine.apply #map1 (%arg2 )
730731 %2 = affine.apply #map1 (%arg3 )
731- %subview_0 = memref.subview %subview [%1 ] [%2 ] [1 ] : memref <128 xf32 , strided <[1 ], offset : ?>> to memref <?xf32 , strided <[1 ], offset : ?>>
732- vector.transfer_write %cst , %subview_0 [%c0 ] {in_bounds = [true ]} : vector <32 xf32 >, memref <?xf32 , strided <[1 ], offset : ?>>
732+ %subview_0 = memref.subview %subview [%1 , %2 ] [16 , 32 ] [1 , 1 ]
733+ : memref <128 x256 xf32 , strided <[256 , 1 ], offset : ?>> to memref <16 x32 xf32 , strided <[256 , 1 ], offset : ?>>
734+ vector.transfer_write %cst , %subview_0 [%c0 , %c0 ] {in_bounds = [true , true ]}
735+ : vector <16 x32 xf32 >, memref <16 x32 xf32 , strided <[256 , 1 ], offset : ?>>
733736
734737 // This could be obtained e.g. if a previous transformation mapped this loop
735738 // to lanes. This can also be written by hand as valid IR.
736739 } {mapping = [#gpu.lane <linear_dim_0 >, #gpu.lane <linear_dim_1 >]}
737-
738- memref.copy %subview , %subview : memref <128 xf32 , strided <[1 ], offset : ?>> to memref <128 xf32 , strided <[1 ], offset : ?>>
739740 } {mapping = [#gpu.block <x >]}
740- return %arg0 : memref <128 x f32 >
741+ return %arg0 : memref <128 x 256 x f32 >
741742}
742743
743744module attributes {transform.with_named_sequence } {
0 commit comments