
Commit b894587

Support for __constant__ and __device__ vars in CUDA (#349)
1 parent 3c48b7e commit b894587
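
What this enables, roughly: CUDA translation units that declare module-scope __constant__ or __device__ variables and reference them from both kernels and host code. A minimal sketch of such an input (variable and kernel names are illustrative, not taken from the commit):

#include <cuda_runtime.h>

// Illustrative only: a constant-memory table and a device-side global that the
// compiler must register with the runtime so host code can address them.
__constant__ float coeffs[4];
__device__ int counter;

__global__ void scale(float *out, const float *in, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    out[i] = in[i] * coeffs[i % 4];
}

int main() {
  float host_coeffs[4] = {1.f, 2.f, 3.f, 4.f};
  // Host code refers to the device symbol by name; this only works if the
  // symbol was registered (which is what __cudaRegisterVar-style calls do).
  cudaMemcpyToSymbol(coeffs, host_coeffs, sizeof(host_coeffs));
  // ... launch scale<<<grid, block>>>(...) as usual ...
  return 0;
}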

File tree

12 files changed: +370 −68 lines


include/polygeist/Passes/Passes.h

Lines changed: 1 addition & 0 deletions
@@ -47,6 +47,7 @@ std::unique_ptr<Pass>
 createConvertParallelToGPUPass1(bool useOriginalThreadNums = false);
 std::unique_ptr<Pass>
 createConvertParallelToGPUPass2(bool emitGPUKernelLaunchBounds = true);
+std::unique_ptr<Pass> createMergeGPUModulesPass();
 std::unique_ptr<Pass> createGpuSerializeToCubinPass(
     StringRef arch, StringRef features, int llvmOptLevel, int ptxasOptLevel,
     std::string ptxasPath, std::string libDevicePath, bool outputIntermediate);

include/polygeist/Passes/Passes.td

Lines changed: 6 additions & 0 deletions
@@ -68,6 +68,12 @@ def ConvertParallelToGPU2 : Pass<"convert-parallel-to-gpu2"> {
   let dependentDialects = ["func::FuncDialect", "LLVM::LLVMDialect", "memref::MemRefDialect"];
 }

+def MergeGPUModulesPass : Pass<"merge-gpu-modules", "mlir::ModuleOp"> {
+  let summary = "Merge all gpu modules into one";
+  let constructor = "mlir::polygeist::createMergeGPUModulesPass()";
+  let dependentDialects = ["func::FuncDialect", "LLVM::LLVMDialect", "gpu::GPUDialect"];
+}
+
 def InnerSerialization : Pass<"inner-serialize"> {
   let summary = "remove scf.barrier";
   let constructor = "mlir::polygeist::createInnerSerializationPass()";

lib/polygeist/ExecutionEngine/CudaRuntimeWrappers.cpp

Lines changed: 14 additions & 2 deletions
@@ -169,6 +169,10 @@ extern "C" void __cudaRegisterFunction(void **fatCubinHandle, void *hostFun,
                                        int32_t thread_limit, void *tid,
                                        void *bid, void *bDim, void *gDim,
                                        void *wSize);
+extern "C" void __cudaRegisterVar(void **fatCubinHandle, char *hostVar,
+                                  char *deviceAddress, const char *deviceName,
+                                  int ext, size_t size, int constant,
+                                  int global);
 extern "C" void **__cudaRegisterFatBinary(void *fatCubin);
 extern "C" void __cudaRegisterFatBinaryEnd(void **fatCubinHandle);
 extern "C" void __cudaUnregisterFatBinary(void **fatCubinHandle);
@@ -181,17 +185,25 @@ __mgpurtRegisterFunction(void **fatCubinHandle, void *hostFun, void *deviceFun,
                          thread_limit, tid, bid, bDim, gDim, wSize);
 }

+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+__mgpurtRegisterVar(void **fatCubinHandle, char *hostVar, char *deviceAddress,
+                    const char *deviceName, int ext, size_t size, int constant,
+                    int global) {
+  __cudaRegisterVar(fatCubinHandle, hostVar, deviceAddress, deviceName, ext,
+                    size, constant, global);
+}
+
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void **
 __mgpurtRegisterFatBinary(void *fatCubin) {
   return __cudaRegisterFatBinary(fatCubin);
 }

 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
 __mgpurtRegisterFatBinaryEnd(void **fatCubinHandle) {
-  return __cudaRegisterFatBinaryEnd(fatCubinHandle);
+  __cudaRegisterFatBinaryEnd(fatCubinHandle);
 }

 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
 __mgpurtUnregisterFatBinary(void **fatCubinHandle) {
-  return __cudaUnregisterFatBinary(fatCubinHandle);
+  __cudaUnregisterFatBinary(fatCubinHandle);
 }
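
The new __mgpurtRegisterVar entry point mirrors __mgpurtRegisterFunction: host-side registration code can call it once per __device__/__constant__ global so the CUDA runtime associates the host shadow symbol with the device symbol in the fatbinary. A hypothetical registration stub for a __constant__ float coeffs[4] follows; the names and argument values are illustrative only, not taken from the commit:

// Hypothetical host-side registration stub; nvcc-produced code emits an
// analogous __cudaRegisterVar call per variable. Names/values are illustrative.
extern "C" void __mgpurtRegisterVar(void **fatCubinHandle, char *hostVar,
                                    char *deviceAddress, const char *deviceName,
                                    int ext, size_t size, int constant,
                                    int global);

static float coeffs_shadow[4]; // host shadow of __constant__ float coeffs[4]

static void registerDeviceGlobals(void **fatCubinHandle) {
  __mgpurtRegisterVar(fatCubinHandle,
                      /*hostVar=*/reinterpret_cast<char *>(coeffs_shadow),
                      /*deviceAddress=*/reinterpret_cast<char *>(coeffs_shadow),
                      /*deviceName=*/"coeffs",
                      /*ext=*/0, /*size=*/sizeof(coeffs_shadow),
                      /*constant=*/1, /*global=*/0);
}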

lib/polygeist/ExecutionEngine/RocmRuntimeWrappers.cpp

Lines changed: 11 additions & 0 deletions
@@ -93,6 +93,10 @@ extern "C" void __hipRegisterFunction(void **fatCubinHandle, void *hostFun,
                                       int32_t thread_limit, void *tid,
                                       void *bid, void *bDim, void *gDim,
                                       void *wSize);
+extern "C" void __hipRegisterVar(void **fatCubinHandle, char *hostVar,
+                                 char *deviceAddress, const char *deviceName,
+                                 int ext, size_t size, int constant,
+                                 int global);
 extern "C" void **__hipRegisterFatBinary(void *fatCubin);
 extern "C" void __hipRegisterFatBinaryEnd(void **fatCubinHandle);
 extern "C" void __hipUnregisterFatBinary(void **fatCubinHandle);
@@ -104,6 +108,13 @@ __mgpurtRegisterFunction(void **fatCubinHandle, void *hostFun, void *deviceFun,
   __hipRegisterFunction(fatCubinHandle, hostFun, deviceFun, deviceName,
                         thread_limit, tid, bid, bDim, gDim, wSize);
 }
+extern "C" MLIR_HIP_WRAPPERS_EXPORT void
+__mgpurtRegisterVar(void **fatCubinHandle, char *hostVar, char *deviceAddress,
+                    const char *deviceName, int ext, size_t size, int constant,
+                    int global) {
+  __hipRegisterVar(fatCubinHandle, hostVar, deviceAddress, deviceName, ext,
+                   size, constant, global);
+}

 extern "C" MLIR_HIP_WRAPPERS_EXPORT void **
 __mgpurtRegisterFatBinary(void *fatCubin) {

lib/polygeist/Passes/ConvertParallelToGPU.cpp

Lines changed: 82 additions & 1 deletion
@@ -950,7 +950,15 @@ struct HandleWrapperRootOps : public OpRewritePattern<polygeist::GPUWrapperOp> {
     bool read = hasEffect<MemoryEffects::Read>(effects);
     bool write = hasEffect<MemoryEffects::Write>(effects);
     SmallVector<Value, 1> cloned;
-    if (effects.empty()) {
+    // Special case for get_global because what it actually refers to is the
+    // device-side global, so this must remain in the gpu wrapper
+    if (isa<memref::GetGlobalOp>(op)) {
+      // This is the same as the case for a parallelizable read op
+      rewriter.setInsertionPoint(newWrapper.getBody()->getTerminator());
+      rewriter.clone(*op, splitMapping);
+      rewriter.setInsertionPoint(firstGridOp);
+      cloned = rewriter.clone(*op, parallelizedMapping)->getResults();
+    } else if (effects.empty()) {
       rewriter.setInsertionPoint(firstGridOp);
       rewriter.clone(*op, parallelizedMapping);
       rewriter.setInsertionPoint(newWrapper.getBody()->getTerminator());
@@ -1560,8 +1568,81 @@ struct ConvertParallelToGPU2Pass
   }
 };

+struct MergeGPUModulesPass
+    : public MergeGPUModulesPassBase<MergeGPUModulesPass> {
+  void runOnOperation() override {
+    auto m = getOperation();
+    Region &moduleRegion = m->getRegion(0);
+    OpBuilder mBuilder(moduleRegion);
+    std::string newModuleName = "__polygeist_gpu_module";
+    auto newGpuModule =
+        mBuilder.create<gpu::GPUModuleOp>(m->getLoc(), newModuleName);
+    OpBuilder gpumBuilder(newGpuModule->getRegion(0));
+    std::vector<gpu::GPUModuleOp> toErase;
+    m->walk([&](gpu::GPUModuleOp gpum) {
+      if (gpum == newGpuModule)
+        return;
+      toErase.push_back(gpum);
+      for (auto &op : *gpum.getBody()) {
+        auto cloneIf = [&](auto op) {
+          if (op) {
+            if (!SymbolTable::lookupSymbolIn(newGpuModule, op.getName())) {
+              gpumBuilder.clone(*op.getOperation());
+            }
+            return true;
+          }
+          return false;
+        };
+
+        if (auto f = dyn_cast<gpu::GPUFuncOp>(&op)) {
+          auto newF = cast<gpu::GPUFuncOp>(gpumBuilder.clone(op));
+          if (SymbolTable::lookupSymbolIn(newGpuModule, f.getName())) {
+            auto newKernelName =
+                std::string(f.getName()) +
+                std::to_string(reinterpret_cast<intptr_t>(f.getOperation()));
+            newF.setName(newKernelName);
+          }
+          auto symbolUses = SymbolTable::getSymbolUses(f.getOperation(), m);
+          assert(symbolUses);
+          for (auto symbolUse : *symbolUses) {
+            if (auto launchOp =
+                    dyn_cast<gpu::LaunchFuncOp>(symbolUse.getUser())) {
+              auto kernelSymbol =
+                  SymbolRefAttr::get(newGpuModule.getNameAttr(),
+                                     {SymbolRefAttr::get(newF.getNameAttr())});
+              launchOp->setAttr(
+                  gpu::LaunchFuncOp::getKernelAttrName(launchOp->getName()),
+                  kernelSymbol);
+            } else {
+              f.emitError("Unexpected user of gpu func op");
+              assert(0);
+            }
+          }
+        } else if (!(cloneIf(dyn_cast<memref::GlobalOp>(&op)) ||
+                     cloneIf(dyn_cast<LLVM::GlobalOp>(&op)) ||
+                     cloneIf(dyn_cast<func::FuncOp>(&op)) ||
+                     cloneIf(dyn_cast<LLVM::LLVMFuncOp>(&op)) ||
+                     isa<gpu::ModuleEndOp>(&op))) {
+          op.emitError("Unexpected global type in gpu module");
+          op.dump();
+          assert(0);
+        }
+      }
+    });
+
+    if (toErase.size() == 0)
+      newGpuModule->erase();
+
+    for (auto gpum : toErase)
+      gpum->erase();
+  }
+};
+
 } // namespace

+std::unique_ptr<Pass> mlir::polygeist::createMergeGPUModulesPass() {
+  return std::make_unique<MergeGPUModulesPass>();
+}
 std::unique_ptr<Pass>
 mlir::polygeist::createConvertParallelToGPUPass1(bool useOriginalThreadNums) {
   return std::make_unique<ConvertParallelToGPU1Pass>(useOriginalThreadNums);
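
The merge pass is exposed through createMergeGPUModulesPass() declared in Passes.h above. A plausible place to run it, assumed here rather than taken from the Polygeist driver, is after kernels have been outlined into their own gpu.module ops and before GPU binary serialization, so all kernels and device globals land in the single __polygeist_gpu_module:

// Assumed pipeline placement; "pm" is an mlir::PassManager over the top-level
// module. Merging first means later serialization compiles only one module.
pm.addPass(mlir::polygeist::createMergeGPUModulesPass());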
