@@ -120,6 +120,36 @@ def GpuToGpuOcl : Pass<"gpu-to-gpuocl", "ModuleOp"> {
120120 "Call finish() after each kernel launch.">
121121 ];
122122}
123+
// Declarative definition of the GpuTilingAndFusion pass. It is registered
// under the command-line argument "gpu-tiling" and is anchored on
// func::FuncOp. The definition is followed by `#endif // GC_USE_IMEX`, so it
// appears to be compiled only when GC_USE_IMEX is enabled (the opening guard
// is not visible here — TODO confirm).
// NOTE(review): the pass argument ("gpu-tiling") is shorter than the def name
// (GpuTilingAndFusion), and the summary says "path" where "pass" may have been
// intended — confirm with the author before changing either, since both are
// user-visible strings.
124+ def GpuTilingAndFusion : Pass<"gpu-tiling", "func::FuncOp"> {
125+ let summary = "GPU tiling and fusion path.";
126+ let description = [{
127+ This pass tiles linalg operations and creates two nested scf.forall loops. When converting to gpu.launch,
128+ the inner loop is mapped to the block sizes and the outer - to grid sizes. The tiles calculation is based
129+ on the GPU device properties, retrieved from the DLTI attributes. If the DLTI attributes are not specified,
130+ defaults to the pass options.
131+ }];
// Device-property options. Per the description above, these act as fallbacks
// when the corresponding DLTI attributes are not specified. All of them are
// declared as size_t with the default given as a string literal.
// NOTE(review): units are not stated in the option help strings;
// local-mem-size is presumably in bytes (131072 = 128 * 1024) and
// vector-width presumably in bits — confirm against the pass implementation.
132+ let options = [
133+ Option<"numEus", "num-eus", "size_t",
134+ /*default=*/"448",
135+ "Number of Execution Units.">,
136+ Option<"numEusPerSlice", "num-eus-per-slice", "size_t",
137+ /*default=*/"8",
138+ "Number of Execution Units per slice.">,
139+ Option<"numThreadsPerEu", "num-threads-per-eu", "size_t",
140+ /*default=*/"8",
141+ "Number of threads per Execution Unit.">,
142+ Option<"localMemSize", "local-mem-size", "size_t",
143+ /*default=*/"131072",
144+ "The size of the local memory, shared across a work-group.">,
145+ Option<"vectorWidth", "vector-width", "size_t",
146+ /*default=*/"512",
147+ "The maximum width of EU's vector registers.">,
148+ Option<"workGroupSize", "work-group-size", "size_t",
149+ /*default=*/"64",
150+ "The maximum workgroup size.">
151+ ];
152+ }
123153#endif // GC_USE_IMEX
124154
125155def IterativeTilingAndFusion : Pass<"iterative-tiling-and-fusion",
0 commit comments