Skip to content

Commit e5d8b2b

Browse files
authored
Enable compilation to CUDA (#273)
* Remove device functions from host module * Integrate gpu to nvvm lowering in PolygeistToLLVM * Convert shared mem stack alloca to global * Add missing files * Update readme * Add optimizations to device code * Fix shared mem memref alloca to global * Fix convertpolygeisttollvm gpu module pass api * Correct struct pointer offset calc * Custom GPUFuncOp lowering * Cuda global symbol handling * Ignore cudaFuncSetCacheConfig * Pass DL to kernel outlining pass * get gpu module triple and dl * Do not remove device only declarations * Custom lowering of gpu launch op * gpu launch func lowering * Pass gpu arch to gpu module serialize pass * Clean up gpu launch op lowering * Unload gpu module * Add a custom serialize to cubin pass in polygeist * clang format * Add missing serialize pass file * cmake fix * serialize to cubin pass fix * Update readme * Get libdevice and ptxas paths from clang * WIP Parallel to GPU launch * Move cudart lower * Keep cuda block/grid parallel ops separate * Do not lower cuda rt when we will be emitting cuda (TEMP) * Support blocks with no terminators in canonicalize for * Add terminator for parallel wrapper op * Fix globalctor globaldtor bug * Handle cases where single iteration parallels get optimized away * Get cuda paths from clang toolchain cuda detector * Fix handling of block/grid dim ops when raising parallel to GPU * Convert llvm intrinsic func to cuda libdevice * Better parallel->gpu * Add case for completely optimized away parallel ops * Handle ops other than a parallel in parallel wrapper op * Fix HandleWrapperRootOps pattern * Rename parallel wrapper to gpu wrapper * Support for write ops in grid parallel * remove module dumps in driver.cc * Sink constants in the gpu launch op before outlining it * Output gpu compilation intermediates * Do not unroll loops by default * dump module macro tweak * Add operands for block size to gpu wrapper * Take original block size into account when converting parallel to gpu * Add gpu temp memory 
cache for splitting wrappers and interchanging if/wrapper * Add C style lowering for gpu.alloc * add todo test * Add option to configure whether to use original gpu block sizes * Add kernels with different block sizes for out of resources failover * alternative strategy for failing execution with out of resources * Remove clang-tidy file * Add polygeist specific cuda runtime wrappers * Fix wrong logic in parallel to gpu when original thread num was bigger than default * Use llvm versions of *powerof2 functions
1 parent f7b222b commit e5d8b2b

25 files changed

+3474
-67
lines changed

.clang-tidy

Lines changed: 0 additions & 19 deletions
This file was deleted.

CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ cmake_minimum_required(VERSION 3.10)
22

33
include(CheckCXXSourceCompiles)
44

5+
set(POLYGEIST_ENABLE_CUDA 0 CACHE BOOL "Enable CUDA compilation support")
6+
57
if(POLICY CMP0068)
68
cmake_policy(SET CMP0068 NEW)
79
set(CMAKE_BUILD_WITH_INSTALL_NAME_DIR ON)

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ ninja
3030
ninja check-mlir
3131
```
3232

33+
To enable compilation to CUDA, add `-DMLIR_ENABLE_CUDA_RUNNER=1` and remove `-DLLVM_TARGETS_TO_BUILD="host"` from the CMake arguments. (You may need to specify `CUDACXX`, `CUDA_PATH`, or `CMAKE_CUDA_COMPILER`)
34+
3335
2. Build Polygeist:
3436
```sh
3537
mkdir build
@@ -44,6 +46,8 @@ ninja
4446
ninja check-polygeist-opt && ninja check-cgeist
4547
```
4648

49+
To enable compilation to CUDA, add `-DPOLYGEIST_ENABLE_CUDA=1`
50+
4751
#### Option 2: Using unified LLVM, MLIR, Clang, and Polygeist build
4852

4953
Polygeist can also be built as an external LLVM project using [LLVM_EXTERNAL_PROJECTS](https://llvm.org/docs/CMake.html#llvm-related-variables).

include/polygeist/Ops.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#define POLYGEISTOPS_H
1111

1212
#include "mlir/Dialect/Affine/IR/AffineOps.h"
13+
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
1314
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
1415
#include "mlir/Dialect/SCF/IR/SCF.h"
1516
#include "mlir/IR/BuiltinTypes.h"

include/polygeist/Passes/Passes.h

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,21 @@ std::unique_ptr<Pass> createCPUifyPass(StringRef method = "");
2222
std::unique_ptr<Pass> createBarrierRemovalContinuation();
2323
std::unique_ptr<Pass> detectReductionPass();
2424
std::unique_ptr<Pass> createRemoveTrivialUsePass();
25-
std::unique_ptr<Pass> createParallelLowerPass();
25+
std::unique_ptr<Pass> createParallelLowerPass(bool wrapParallelOps = false);
26+
std::unique_ptr<Pass> createCudaRTLowerPass();
2627
std::unique_ptr<Pass>
2728
createConvertPolygeistToLLVMPass(const LowerToLLVMOptions &options,
28-
bool useCStyleMemRef);
29+
bool useCStyleMemRef, bool onlyGpuModules);
2930
std::unique_ptr<Pass> createConvertPolygeistToLLVMPass();
3031
std::unique_ptr<Pass> createForBreakToWhilePass();
32+
std::unique_ptr<Pass>
33+
createConvertParallelToGPUPass1(bool useOriginalThreadNums = false);
34+
std::unique_ptr<Pass> createConvertParallelToGPUPass2();
35+
std::unique_ptr<Pass> createGpuSerializeToCubinPass(
36+
StringRef triple, StringRef arch, StringRef features, int llvmOptLevel,
37+
int ptxasOptLevel, std::string ptxasPath, std::string libDevicePath,
38+
bool outputIntermediate);
39+
void registerGpuSerializeToCubinPass();
3140

3241
void populateForBreakToWhilePatterns(RewritePatternSet &patterns);
3342
} // namespace polygeist

include/polygeist/Passes/Passes.td

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,15 @@ def Mem2Reg : Pass<"mem2reg"> {
1313
let constructor = "mlir::polygeist::createMem2RegPass()";
1414
}
1515

16+
def CudaRTLower : Pass<"cudart-lower", "mlir::ModuleOp"> {
17+
let summary = "Lower cudart functions to cpu versions";
18+
let dependentDialects =
19+
["memref::MemRefDialect", "func::FuncDialect", "LLVM::LLVMDialect"];
20+
let constructor = "mlir::polygeist::createCudaRTLowerPass()";
21+
}
22+
1623
def ParallelLower : Pass<"parallel-lower", "mlir::ModuleOp"> {
17-
let summary = "Replace scf.if and similar with affine.if";
24+
let summary = "Lower gpu launch op to parallel ops";
1825
let dependentDialects =
1926
["memref::MemRefDialect", "func::FuncDialect", "LLVM::LLVMDialect"];
2027
let constructor = "mlir::polygeist::createParallelLowerPass()";
@@ -35,6 +42,18 @@ def SCFCPUify : Pass<"cpuify"> {
3542
];
3643
}
3744

45+
def ConvertParallelToGPU1 : Pass<"convert-parallel-to-gpu1"> {
46+
let summary = "Convert parallel loops to gpu";
47+
let constructor = "mlir::polygeist::createConvertParallelToGPUPass1()";
48+
let dependentDialects = ["func::FuncDialect", "LLVM::LLVMDialect", "memref::MemRefDialect"];
49+
}
50+
51+
def ConvertParallelToGPU2 : Pass<"convert-parallel-to-gpu2"> {
52+
let summary = "Convert parallel loops to gpu";
53+
let constructor = "mlir::polygeist::createConvertParallelToGPUPass2()";
54+
let dependentDialects = ["func::FuncDialect", "LLVM::LLVMDialect", "memref::MemRefDialect"];
55+
}
56+
3857
def InnerSerialization : Pass<"inner-serialize"> {
3958
let summary = "remove scf.barrier";
4059
let constructor = "mlir::polygeist::createInnerSerializationPass()";

include/polygeist/PolygeistOps.td

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
include "Dialect.td"
1313
include "mlir/Interfaces/SideEffectInterfaces.td"
1414
include "mlir/Interfaces/ViewLikeInterface.td"
15+
include "mlir/Interfaces/ControlFlowInterfaces.td"
1516
include "mlir/IR/SymbolInterfaces.td"
1617

1718
include "mlir/Dialect/LLVMIR/LLVMOpBase.td"
@@ -66,6 +67,36 @@ def SubIndexOp : Polygeist_Op<"subindex", [
6667
}];
6768
}
6869

70+
def GPUWrapperOp : Polygeist_Op<"gpu_wrapper", [
71+
RecursiveMemoryEffects,
72+
SingleBlockImplicitTerminator<"polygeist::PolygeistYieldOp">]>,
73+
Arguments<(ins Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ)> {
74+
let summary = "Wraps a parallel op to prevent code motion and transformations";
75+
let results = (outs Index : $result);
76+
let regions = (region SizedRegion<1>:$region);
77+
let skipDefaultBuilders = 1;
78+
let builders = [OpBuilder<(ins
79+
"Value":$blockSizeX, "Value":$blockSizeY, "Value":$blockSizeZ)>];
80+
81+
}
82+
83+
def GPUErrorOp : Polygeist_Op<"gpu_error", [
84+
RecursiveMemoryEffects,
85+
SingleBlockImplicitTerminator<"polygeist::PolygeistYieldOp">]>,
86+
Arguments<(ins)> {
87+
let summary = "Gets the error returned by the gpu operation inside";
88+
// TODO should be i32, not index
89+
let results = (outs Index : $result);
90+
let regions = (region SizedRegion<1>:$region);
91+
let skipDefaultBuilders = 1;
92+
let builders = [OpBuilder<(ins)>];
93+
94+
}
95+
96+
def PolygeistYieldOp : Polygeist_Op<"polygeist_yield", [Pure, ReturnLike, Terminator,
97+
ParentOneOf<["GPUWrapperOp", "GPUErrorOp"]>]> {
98+
let summary = "Polygeist ops terminator";
99+
}
69100

70101
def StreamToTokenOp : Polygeist_Op<"stream2token", [
71102
Pure

lib/polygeist/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
add_subdirectory(ExecutionEngine)
2+
13
add_mlir_dialect_library(MLIRPolygeist
24
Dialect.cpp
35
Ops.cpp
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
2+
# Build the Polygeist CUDA runtime wrapper library, but only when the user
# opted in via POLYGEIST_ENABLE_CUDA (declared in the top-level CMakeLists).
if(POLYGEIST_ENABLE_CUDA)
  find_package(CUDA)
  enable_language(CUDA)

  # Built SHARED and kept out of libMLIR: this is a standalone runtime
  # library, not part of the compiler proper.
  add_mlir_library(polygeist_cuda_runtime
    SHARED
    CudaRuntimeWrappers.cpp

    EXCLUDE_FROM_LIBMLIR
  )

  # The wrappers call the CUDA *driver* API (cu* functions), so link
  # against libcuda rather than the cudart runtime library.
  find_library(CUDA_RUNTIME_LIBRARY cuda)

  set_property(TARGET polygeist_cuda_runtime PROPERTY CXX_STANDARD 14)
  target_include_directories(polygeist_cuda_runtime
    PRIVATE
    ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
  )
  target_link_libraries(polygeist_cuda_runtime
    PRIVATE
    ${CUDA_RUNTIME_LIBRARY}
  )

endif()
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
//===- PolygeistCudaRuntimeWrappers.cpp - MLIR CUDA API wrapper library ---===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// Implements C wrappers around the CUDA library for easy linking in ORC jit.
10+
// Also adds some debugging helpers that are helpful when writing MLIR code to
11+
// run on GPUs.
12+
//
13+
//===----------------------------------------------------------------------===//
14+
15+
#include "mlir/ExecutionEngine/CRunnerUtils.h"
16+
17+
#include <stdio.h>
18+
19+
#include "cuda.h"
20+
21+
#ifdef _WIN32
22+
#define MLIR_CUDA_WRAPPERS_EXPORT __declspec(dllexport)
23+
#else
24+
#define MLIR_CUDA_WRAPPERS_EXPORT
25+
#endif // _WIN32
26+
27+
// Evaluates `expr` (a CUDA driver API call returning CUresult), prints a
// diagnostic to stderr on failure, and yields the CUresult unchanged so
// callers may still inspect/propagate the error code. Implemented as an
// immediately-invoked lambda so the macro remains a single expression.
#define CUDA_REPORT_IF_ERROR(expr)                                             \
  [](CUresult result) {                                                        \
    if (!result)                                                               \
      return result;                                                           \
    const char *name = nullptr;                                                \
    cuGetErrorName(result, &name);                                             \
    if (!name)                                                                 \
      name = "<unknown>";                                                      \
    fprintf(stderr, "'%s' failed with '%s'\n", #expr, name);                   \
    return result;                                                             \
  }(expr)
38+
39+
// Device ordinal whose primary context ScopedContext makes current.
// Thread-local so, in principle, different host threads could target
// different devices; nothing in this file changes it from 0.
thread_local static int32_t defaultDevice = 0;

// Make the primary context of the current default device current for the
// duration of the instance and restore the previous context on destruction.
class ScopedContext {
public:
  ScopedContext() {
    // Static reference to CUDA primary context for device ordinal
    // defaultDevice. Initialized once (thread-safe function-local static):
    // the first ScopedContext constructed pays for cuInit + device lookup +
    // primary-context retain; later instances only push/pop.
    static CUcontext context = [] {
      CUDA_REPORT_IF_ERROR(cuInit(/*flags=*/0));
      CUdevice device;
      CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice));
      CUcontext ctx;
      // Note: this does not affect the current context.
      CUDA_REPORT_IF_ERROR(cuDevicePrimaryCtxRetain(&ctx, device));
      return ctx;
    }();

    CUDA_REPORT_IF_ERROR(cuCtxPushCurrent(context));
  }

  // Pop restores whatever context was current before construction.
  ~ScopedContext() { CUDA_REPORT_IF_ERROR(cuCtxPopCurrent(nullptr)); }
};
64+
65+
// C-callable kernel-launch wrapper: launches `function` on `stream` with the
// given grid/block dimensions and `smem` bytes of dynamic shared memory,
// reports any driver error to stderr, and returns the raw CUresult (as
// int32_t) to the caller instead of discarding it — presumably so generated
// code can react to launch failures such as out-of-resources (the commit
// notes mention a block-size failover strategy; confirm against the lowering
// that emits calls to this symbol).
extern "C" MLIR_CUDA_WRAPPERS_EXPORT int32_t mgpuLaunchKernelErr(
    CUfunction function, intptr_t gridX, intptr_t gridY, intptr_t gridZ,
    intptr_t blockX, intptr_t blockY, intptr_t blockZ, int32_t smem,
    CUstream stream, void **params, void **extra) {
  // Ensure the primary context is current for the duration of the launch.
  ScopedContext scopedContext;
  return CUDA_REPORT_IF_ERROR(cuLaunchKernel(function, gridX, gridY, gridZ,
                                             blockX, blockY, blockZ, smem,
                                             stream, params, extra));
}

0 commit comments

Comments
 (0)