//===- GPUTransformOps.td - GPU transform ops --------------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef GPU_TRANSFORM_OPS
#define GPU_TRANSFORM_OPS

include "mlir/Dialect/Transform/IR/TransformDialect.td"
include "mlir/Dialect/Transform/IR/TransformEffects.td"
include "mlir/Dialect/Transform/IR/TransformInterfaces.td"
include "mlir/Dialect/PDL/IR/PDLTypes.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/IR/OpBase.td"

def MapNestedForeachToThreads :
  Op<Transform_Dialect, "gpu.map_nested_foreach_to_threads",
    [FunctionalStyleTransformOpTrait,
     MemoryEffectsOpInterface,
     TransformEachOpTrait,
     TransformOpInterface]> {
  let description = [{
    Target the `gpu.launch` op and rewrite all `scf.foreach_thread` ops
    nested in it to distributed `gpu.thread_id` attributes.

    The operation searches for `scf.foreach_thread` ops nested under `target`
    and maps each such op to GPU threads. Mapping is one-to-one and the
    induction variables of `scf.foreach_thread` are rewritten to
    `gpu.thread_id` according to the `thread_dim_mapping` attribute.

    Sibling `scf.foreach_thread` ops are supported, in which case the union
    of the number of threads is computed and may result in predication.

    Multiple `scf.foreach_thread` ops are supported per `gpu.launch`, in
    which case the max of all the threads is computed and taken as the
    global `gpu.thread_id` range. If necessary, `scf.foreach_thread` ops
    that do not use the whole thread range result in predicated
    computations.

    Dynamic `scf.foreach_thread` trip counts are currently not supported.
    Dynamic block dim sizes are currently not supported.

    Only **bufferized** `scf.foreach_thread` ops are currently supported.
    Only `scf.foreach_thread` ops distributed to **at most 3 dimensions**
    are currently supported.

    Barriers are inserted after each `scf.foreach_thread` op for now.

    The operation alters the block size of the given `gpu.launch` using the
    `blockDim` argument.

    #### Return modes:

    This operation ignores non-`gpu.launch` ops and drops them in the
    return.

    If any `scf.foreach_thread` with tensors is found, the transform
    definitely fails.

    If all the `scf.foreach_thread` operations contained within the
    `LaunchOp` referred to by the `target` `PDLOperation` lower to GPU
    properly, the transform succeeds. Otherwise the transform definitely
    fails.

    The returned handle points to the same `LaunchOp` operand, consuming it
    and producing a new SSA value to satisfy chaining and linearity of the
    IR properties.

    #### Example:

    ```
    gpu.launch blocks(%bx, %by, %bz) in (%x = %0, %y = %1, %z = %2)
               threads(%tx, %ty, %tz) in (%tx = %3, %ty = %4, %tz = %5) {
      scf.foreach_thread (%i, %j) in (7, 9) {
        ... // body 1
      } {thread_dim_mapping = [1, 0, 2]}
      scf.foreach_thread (%i) in (12) {
        ... // body 2
      }
      gpu.terminator
    }
    ```

    is translated to:

    ```
    %bdimX = arith.constant 12 : index
    %bdimY = arith.constant 9 : index
    gpu.launch blocks(%bx, %by, %bz) in (%x = %0, %y = %1, %z = %2)
               threads(%tx, %ty, %tz) in (%tx = %bdimX, %ty = %bdimY, %tz = %5) {
      if (threadIdx.x < 9 && threadIdx.y < 7) {
        ... // body 1
      }
      gpu.barrier
      if (threadIdx.y < 1) {
        ... // body 2
      }
      gpu.barrier
      gpu.terminator
    }
    ```
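
    For illustration, this op can be driven from a transform script roughly
    as follows. This is a minimal sketch: the enclosing sequence, the
    `transform.structured.match` step, the handle names and the `blockDim`
    values are assumptions made for the example, not part of this op's
    definition.

    ```
    transform.sequence failures(propagate) {
    ^bb1(%arg1: !pdl.operation):
      // Match the gpu.launch op to distribute (assumed to exist in the
      // payload IR).
      %launch = transform.structured.match ops{["gpu.launch"]} in %arg1
      // Map nested scf.foreach_thread ops to threads, fixing the block size.
      %new_launch = transform.gpu.map_nested_foreach_to_threads %launch
          {blockDim = [12, 9, 1]}
    }
    ```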
  }];

  let arguments = (ins PDL_Operation:$target,
                   DefaultValuedAttr<I64ArrayAttr, "{}">:$blockDim,
                   DefaultValuedAttr<BoolAttr, "true">:$syncAfterDistribute);
  let results = (outs PDL_Operation:$result);

  let assemblyFormat = "$target attr-dict";
  let extraClassDeclaration = [{
    ::mlir::DiagnosedSilenceableFailure applyToOne(
        ::mlir::Operation *target,
        ::llvm::SmallVectorImpl<::mlir::Operation *> &results,
        ::mlir::transform::TransformState &state);
  }];
}

def MapForeachToBlocks :
  Op<Transform_Dialect, "gpu.map_foreach_to_blocks",
    [FunctionalStyleTransformOpTrait,
     MemoryEffectsOpInterface,
     TransformOpInterface,
     TransformEachOpTrait]> {
  let description = [{
    Target the `gpu.launch` op and rewrite the top-level `scf.foreach_thread`
    to a distributed `gpu.block_id` attribute. If the `generate_gpu_launch`
    attribute is set, the transform first generates a `gpu.launch` and moves
    the top-level `scf.foreach_thread` inside it.

    The operation searches for top-level `scf.foreach_thread` ops under the
    `gpu.launch` and maps each such op to GPU blocks. Mapping is
    one-to-one and the induction variables of `scf.foreach_thread` are
    rewritten to `gpu.block_id` according to the `thread_dim_mapping`
    attribute.

    Dynamic `scf.foreach_thread` trip counts are currently not supported.
    Dynamic block dim sizes are currently not supported.

    Only **bufferized** `scf.foreach_thread` ops are currently supported.
    Only `scf.foreach_thread` ops distributed to **at most 3 dimensions**
    are currently supported.

    The operation alters the grid size of the given `gpu.launch` using the
    `gridDim` argument.

    #### Return modes:

    This operation ignores non-`gpu.launch` ops and drops them in the
    return.

    If any `scf.foreach_thread` with tensors is found, the transform
    definitely fails.

    If all the `scf.foreach_thread` operations contained within the
    `LaunchOp` referred to by the `target` `PDLOperation` lower to GPU
    properly, the transform succeeds. Otherwise the transform definitely
    fails.

    The returned handle points to the same `LaunchOp` operand, consuming it
    and producing a new SSA value to satisfy chaining and linearity of the
    IR properties.
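
    #### Example:

    For illustration, a hypothetical invocation from a transform script.
    This is a minimal sketch: matching a `func.func` payload op, the handle
    names and the `gridDim` values are assumptions made for the example,
    not mandated by this op.

    ```
    // Match a function whose body contains a top-level scf.foreach_thread
    // (assumed), then generate a gpu.launch around it and map it to blocks.
    %func = transform.structured.match ops{["func.func"]} in %arg1
    %gpu_launch = transform.gpu.map_foreach_to_blocks %func
        {generate_gpu_launch, gridDim = [128, 1, 1]}
    ```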
  }];

  let arguments = (ins PDL_Operation:$target,
                   DefaultValuedAttr<I64ArrayAttr, "{}">:$gridDim,
                   UnitAttr:$generate_gpu_launch);
  let results = (outs PDL_Operation:$result);

  let assemblyFormat = "$target attr-dict";
  let extraClassDeclaration = [{
    ::mlir::DiagnosedSilenceableFailure applyToOne(
        ::mlir::Operation *target,
        ::llvm::SmallVectorImpl<::mlir::Operation *> &results,
        ::mlir::transform::TransformState &state);
  }];
}

#endif // GPU_TRANSFORM_OPS