@@ -691,3 +691,78 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
+
+// -----
+
+#map = affine_map<(d0) -> (d0 * 128)>
+#map1 = affine_map<(d0) -> (d0 * 32)>
+
+// CHECK-DAG: #[[$MAPB:.*]] = affine_map<()[s0] -> (s0 * 128)>
+// CHECK-DAG: #[[$MAPLANE:.*]] = affine_map<()[s0, s1] -> ((s0 + s1 * 73) mod 32)>
+// CHECK-DAG: #[[$MAPI:.*]] = affine_map<()[s0, s1] -> (s0 * 32 + s1 * 2336 - ((s0 + s1 * 73) floordiv 2) * 64)>
+// CHECK-DAG: #[[$MAPJ:.*]] = affine_map<()[s0, s1] -> ((((s0 + s1 * 73) mod 32) floordiv 2) * 32)>
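+//
+// With block_dims = [73, 5, 1] (set by the transform below), the linear thread
+// id is tidx + tidy * 73 and the lane id is that value mod 32 ($MAPLANE).
+// $MAPI reduces to 32 * (lane_id mod 2) and $MAPJ to 32 * (lane_id floordiv 2),
+// i.e. the (2, 3) lane grid is delinearized with linear_dim_0 fastest-varying.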
+
+// CHECK-LABEL: func.func @simple_fill(
+func.func @simple_fill(%arg0: memref<128x256xf32>) -> memref<128x256xf32> {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant dense<0.000000e+00> : vector<16x32xf32>
+  // CHECK: %[[C6:.*]] = arith.constant 6 : index
+  // CHECK: gpu.launch
+  scf.forall (%arg1) in (1) {
+    // CHECK: %[[BIDX:.*]] = gpu.block_id x
+    // CHECK: %[[BLX:.*]] = affine.apply #[[$MAPB]]()[%[[BIDX]]]
+    %0 = affine.apply #map(%arg1)
+    %subview = memref.subview %arg0[%0, 0] [128, 256] [1, 1]
+      : memref<128x256xf32> to memref<128x256xf32, strided<[256, 1], offset: ?>>
+
+    // %arg2 and %arg3 map to lanes [0, 6) and are turned into expressions
+    // involving threadIdx.x/y by the map_nested_forall_to_threads
+    // transformation. This results in an if (lane_id < 6) conditional.
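+    // Only 2 * 3 = 6 of the 32 lanes in each warp get an iteration; the
+    // generated scf.if masks off the remaining lanes.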
+    scf.forall (%arg2, %arg3) in (2, 3) {
+      // CHECK: %[[TIDX:.*]] = gpu.thread_id x
+      // CHECK: %[[TIDY:.*]] = gpu.thread_id y
+      // CHECK: %[[LID:.*]] = affine.apply #[[$MAPLANE]]()[%[[TIDX]], %[[TIDY]]]
+      // CHECK: %[[COND:.*]] = arith.cmpi ult, %[[LID]], %[[C6]]
+      // CHECK: scf.if %[[COND]]
+      // CHECK: %[[I:.*]] = affine.apply #[[$MAPI]]()[%[[TIDX]], %[[TIDY]]]
+      // CHECK: %[[J:.*]] = affine.apply #[[$MAPJ]]()[%[[TIDX]], %[[TIDY]]]
+      // CHECK: memref.subview %{{.*}}[%[[I]], %[[J]]]
+      %1 = affine.apply #map1(%arg2)
+      %2 = affine.apply #map1(%arg3)
+      %subview_0 = memref.subview %subview[%1, %2] [16, 32] [1, 1]
+        : memref<128x256xf32, strided<[256, 1], offset: ?>> to memref<16x32xf32, strided<[256, 1], offset: ?>>
+      vector.transfer_write %cst, %subview_0[%c0, %c0] {in_bounds = [true, true]}
+        : vector<16x32xf32>, memref<16x32xf32, strided<[256, 1], offset: ?>>
+
+      // This mapping could be obtained e.g. if a previous transformation
+      // mapped this loop to lanes. It can also be written by hand as valid IR.
+    } {mapping = [#gpu.lane<linear_dim_0>, #gpu.lane<linear_dim_1>]}
+  } {mapping = [#gpu.block<x>]}
+  return %arg0 : memref<128x256xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) {
+    %func = transform.structured.match ops{["func.func"]} in %module_op
+      : (!transform.any_op) -> !transform.any_op
+    %gpu_launch = transform.gpu.map_forall_to_blocks %func generate_gpu_launch
+      : (!transform.any_op) -> !transform.any_op
+
+    // This transformation maps scf.forall induction variables to the kind of
+    // GPU ids requested by each loop's mapping attribute (laneid, threadid,
+    // warpid or warpgroupid).
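+    // With block_dims = [73, 5, 1] there are 365 threads per block; 73 is not
+    // a multiple of the warp size (32), so lane ids wrap across threadIdx.y
+    // rows, as $MAPLANE's (tidx + tidy * 73) mod 32 expresses.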
+    transform.gpu.map_nested_forall_to_threads %gpu_launch block_dims = [73, 5, 1]
+      : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}