Add cudart-to-hiprt conversion (#2016)

xys-syx · web-flow · commit 2685ba6a52ee · 2026-02-05T13:58:09.000-06:00
* add cudart-to-hiprt conversion, map cudaFree to hipFree

* fix

* fmt

* fmt

* add pass in raise pipeline

* fix
diff --git a/src/enzyme_ad/jax/Passes/ParallelLower.cpp b/src/enzyme_ad/jax/Passes/ParallelLower.cpp
@@ -31,6 +31,8 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/DebugLog.h"
+#include <llvm/ADT/SmallVector.h>
+#include <mlir/Dialect/LLVMIR/LLVMDialect.h>
 
 #include "Enzyme/MLIR/Dialect/Ops.h"
 #include "Enzyme/MLIR/Passes/Passes.h"
@@ -45,6 +47,7 @@ namespace enzyme {
 #define GEN_PASS_DEF_PARALLELLOWER
 #define GEN_PASS_DEF_FIXGPUFUNC
 #define GEN_PASS_DEF_STRIPGPUINFO
+#define GEN_PASS_DEF_CONVERTCUDARTTOHIPRT
 #include "src/enzyme_ad/jax/Passes/Passes.h.inc"
 } // namespace enzyme
 } // namespace mlir
@@ -109,11 +112,13 @@ struct ConvertCudaRTtoCPU : public ConvertCudaRTtoCPUBase<ConvertCudaRTtoCPU> {
 struct ConvertCudaRTtoGPU : public ConvertCudaRTtoGPUBase<ConvertCudaRTtoGPU> {
   void runOnOperation() override;
 };
+*/
+
 struct ConvertCudaRTtoHipRT
-    : public ConvertCudaRTtoHipRTBase<ConvertCudaRTtoHipRT> {
+    : public enzyme::impl::ConvertCudaRTtoHipRTBase<ConvertCudaRTtoHipRT> {
   void runOnOperation() override;
 };
-*/
+
 struct FixGPUFunc : public enzyme::impl::FixGPUFuncBase<FixGPUFunc> {
   using FixGPUFuncBase::FixGPUFuncBase;
   void runOnOperation() override;
@@ -1330,18 +1335,19 @@ static void setCallee(LLVM::CallOp call, StringRef symName) {
   call.setCallee(symName);
 }
 template <typename CallOpTy, typename FuncOpTy>
-void replaceCallOp(ModuleOp m, CallOpTy call, llvm::StringRef callee) {
-  auto loc = call->getLoc();
-  OpBuilder moduleBuilder = OpBuilder::atBlockEnd(m.getBody());
+void replaceCallOp(ModuleOp m, CallOpTy call, llvm::StringRef callee,
+                   SmallPtrSetImpl<Operation *> &toErase) {
   OpBuilder callBuilder(call);
   auto funcOp = m.lookupSymbol<FuncOpTy>(callee);
   if (isHipCallEquivalent(callee)) {
     assert(funcOp);
     auto hipName = getHipName(callee);
     if (!m.lookupSymbol<FuncOpTy>(hipName)) {
+      OpBuilder moduleBuilder(funcOp.getOperation());
       auto hipFuncOp =
           cast<FuncOpTy>(moduleBuilder.clone(*funcOp.getOperation()));
       hipFuncOp.setSymName(hipName);
+      toErase.insert(funcOp.getOperation());
     }
     setCallee(call, hipName);
   } else {
@@ -1351,24 +1357,30 @@ void replaceCallOp(ModuleOp m, CallOpTy call, llvm::StringRef callee) {
   }
 }
 
-#if 0
 void ConvertCudaRTtoHipRT::runOnOperation() {
+  SmallPtrSet<Operation *, 8> toErase;
+
   getOperation().walk([&](LLVM::CallOp call) {
     if (!call.getCallee())
       return;
     auto name = *call.getCallee();
     if (!isCudartCall(name))
       return;
-    replaceCallOp<LLVM::CallOp, LLVM::LLVMFuncOp>(getOperation(), call, name);
+    replaceCallOp<LLVM::CallOp, LLVM::LLVMFuncOp>(getOperation(), call, name,
+                                                  toErase);
   });
 
   getOperation().walk([&](CallOp call) {
     auto name = call.getCallee();
     if (!isCudartCall(name))
       return;
-    replaceCallOp<CallOp, func::FuncOp>(getOperation(), call, name);
+    replaceCallOp<CallOp, func::FuncOp>(getOperation(), call, name, toErase);
   });
 
+  // Erase old CUDA function declarations after all calls are updated
+  for (Operation *op : toErase)
+    op->erase();
+
   OpBuilder builder(&getContext());
   getOperation().walk([&](mlir::NVVM::Barrier0Op op) {
     builder.setInsertionPoint(op);
@@ -1377,6 +1389,7 @@ void ConvertCudaRTtoHipRT::runOnOperation() {
   });
 }
 
+#if 0
 void ConvertCudaRTtoGPU::runOnOperation() {
   std::function<void(Operation * call, llvm::StringRef callee)> replaceWithOp =
       [&](Operation *call, llvm::StringRef callee) {
diff --git a/src/enzyme_ad/jax/Passes/Passes.td b/src/enzyme_ad/jax/Passes/Passes.td
@@ -1065,6 +1065,18 @@ def ParallelLower : Pass<"parallel-lower"> {
   ];
 }
 
+// Rewrites calls to CUDA runtime entry points so they target the HIP runtime.
+// Anchored on the module because the implementation looks up and clones
+// function symbols at module scope.
+def ConvertCudaRTtoHipRT : Pass<"convert-cudart-to-hiprt", "mlir::ModuleOp"> {
+  let summary = "Convert CUDA runtime calls to HIP runtime calls";
+  // NOTE(review): ROCDL is presumably listed because the pass replaces
+  // nvvm.barrier0 with rocdl.barrier (see the accompanying lit test).
+  let dependentDialects = [
+    "mlir::ROCDL::ROCDLDialect",
+  ];
+}
+
 def SCFParallelLoopUnroll : Pass<"scf-parallel-loop-unroll"> {
   let summary = "Unroll and interleave scf parallel loops";
   let dependentDialects = [
diff --git a/src/enzyme_ad/jax/raise.cpp b/src/enzyme_ad/jax/raise.cpp
@@ -131,6 +131,8 @@ extern "C" std::string runLLVMToMLIRRoundTrip(std::string input,
         pass_pipeline += "print{filename="+outfile+".mlir},";
       }
       pass_pipeline += "symbol-dce,enzyme,remove-unnecessary-enzyme-ops,lower-affine";
+      if (backend == "rocm")
+        pass_pipeline += ",convert-cudart-to-hiprt";
       if (backend != "cpu")
 	pass_pipeline += ",convert-parallel-to-gpu1,gpu-kernel-outlining,canonicalize,convert-parallel-to-gpu2{backend=";
       pass_pipeline += backend;
diff --git a/test/lit_tests/lowering/convert-cudart-to-hiprt.mlir b/test/lit_tests/lowering/convert-cudart-to-hiprt.mlir
@@ -0,0 +1,60 @@
+// RUN: enzymexlamlir-opt %s --pass-pipeline="builtin.module(convert-cudart-to-hiprt)" | FileCheck %s
+// RUN: enzymexlamlir-opt %s --pass-pipeline="builtin.module(convert-cudart-to-hiprt)" | FileCheck %s --check-prefix=STALE
+
+// Covers three things: CUDA runtime declarations are re-declared under their
+// HIP names, call sites are retargeted, and the NVVM barrier becomes rocdl.barrier.
+
+module {
+  llvm.func @cudaMalloc(!llvm.ptr, i64) -> i32
+  llvm.func @cudaFree(!llvm.ptr) -> i32
+  llvm.func @cudaMemcpy(!llvm.ptr, !llvm.ptr, i64, i32) -> i32
+  llvm.func @cudaDeviceSynchronize() -> i32
+  llvm.func @cudaMemset(!llvm.ptr, i32, i64) -> i32
+  llvm.func @cudaGetLastError() -> i32
+
+  llvm.func @test_llvm_cuda_calls(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64) -> i32 {
+    %c0 = llvm.mlir.constant(0 : i32) : i32
+    %c1 = llvm.mlir.constant(1 : i32) : i32
+
+    %0 = llvm.call @cudaMalloc(%arg0, %arg2) : (!llvm.ptr, i64) -> i32
+    %1 = llvm.call @cudaMemcpy(%arg0, %arg1, %arg2, %c1) : (!llvm.ptr, !llvm.ptr, i64, i32) -> i32
+    %2 = llvm.call @cudaMemset(%arg0, %c0, %arg2) : (!llvm.ptr, i32, i64) -> i32
+    %3 = llvm.call @cudaDeviceSynchronize() : () -> i32
+    %4 = llvm.call @cudaFree(%arg0) : (!llvm.ptr) -> i32
+    %5 = llvm.call @cudaGetLastError() : () -> i32
+
+    llvm.return %c0 : i32
+  }
+
+  llvm.func @test_nvvm_barrier_conversion(%arg0: !llvm.ptr) {
+    %0 = llvm.mlir.constant(42 : i32) : i32
+    llvm.store %0, %arg0 : i32, !llvm.ptr
+    nvvm.barrier0
+    %1 = llvm.load %arg0 : !llvm.ptr -> i32
+    llvm.return
+  }
+}
+
+// CHECK-DAG: llvm.func @hipMalloc(!llvm.ptr, i64) -> i32
+// CHECK-DAG: llvm.func @hipFree(!llvm.ptr) -> i32
+// CHECK-DAG: llvm.func @hipMemcpy(!llvm.ptr, !llvm.ptr, i64, i32) -> i32
+// CHECK-DAG: llvm.func @hipDeviceSynchronize() -> i32
+// CHECK-DAG: llvm.func @hipMemset(!llvm.ptr, i32, i64) -> i32
+// CHECK-DAG: llvm.func @hipGetLastError() -> i32
+
+// CHECK-LABEL: llvm.func @test_llvm_cuda_calls
+// CHECK: llvm.call @hipMalloc
+// CHECK: llvm.call @hipMemcpy
+// CHECK: llvm.call @hipMemset
+// CHECK: llvm.call @hipDeviceSynchronize
+// CHECK: llvm.call @hipFree
+// CHECK: llvm.call @hipGetLastError
+
+// CHECK-LABEL: llvm.func @test_nvvm_barrier_conversion
+// CHECK: llvm.store
+// CHECK: rocdl.barrier
+// CHECK: llvm.load
+
+// The pass erases the original CUDA declarations once every call is
+// retargeted, and the NVVM barrier must not survive the rewrite. A separate
+// prefix keeps these negative checks independent of the CHECK-DAG group above.
+// STALE-NOT: @cuda
+// STALE-NOT: nvvm.barrier0

Original file line number	Diff line number	Diff line change
`@@ -131,6 +131,8 @@ extern "C" std::string runLLVMToMLIRRoundTrip(std::string input,`
`131`	`131`	`pass_pipeline += "print{filename="+outfile+".mlir},";`
`132`	`132`	`}`
`133`	`133`	`pass_pipeline += "symbol-dce,enzyme,remove-unnecessary-enzyme-ops,lower-affine";`
	`134`	`+ if (backend == "rocm")`
	`135`	`+ pass_pipeline += ",convert-cudart-to-hiprt";`
`134`	`136`	`if (backend != "cpu")`
`135`	`137`	`pass_pipeline += ",convert-parallel-to-gpu1,gpu-kernel-outlining,canonicalize,convert-parallel-to-gpu2{backend=";`
`136`	`138`	`pass_pipeline += backend;`