EnzymeAD · wsmoses · Jan 28, 2026 · Jan 4, 2026 · Jan 9, 2026 · Jan 9, 2026
diff --git a/src/enzyme_ad/jax/Passes/GPULaunchRecognition.cpp b/src/enzyme_ad/jax/Passes/GPULaunchRecognition.cpp
@@ -6,6 +6,7 @@
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/LLVMIR/NVVMDialect.h"
+#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/IR/IRMapping.h"
 #include "src/enzyme_ad/jax/Dialect/Dialect.h"
@@ -39,7 +40,8 @@ struct GPULaunchRecognitionPass
     gpuModule = gpu::GPUModuleOp::create(
         moduleBuilder, getOperation()->getLoc(), gpuModuleName);
 
-    std::string sm;
+    std::string sm;  // NVIDIA Streaming Multiprocessor (sm_80)
+    std::string gfx; // AMD Graphics IP (gfx906)
     if (auto attr = dyn_cast_or_null<ArrayAttr>(func.getPassthroughAttr())) {
       for (auto a : attr) {
         if (auto ar = dyn_cast<ArrayAttr>(a)) {
@@ -49,8 +51,13 @@ struct GPULaunchRecognitionPass
           auto s1 = dyn_cast<StringAttr>(ar[1]);
           if (!s0 || !s1)
             continue;
-          if (s0.getValue() == "target-cpu")
-            sm = s1.getValue();
+          if (s0.getValue() == "target-cpu") {
+            // AMD chips are named gfx*; any other name is kept as the NVIDIA
+            // chip. rfind(.., 0) only tests the prefix, like find(..) == 0.
+            std::string cpuName = s1.getValue().str();
+            std::string &chipSlot = cpuName.rfind("gfx", 0) == 0 ? gfx : sm;
+            chipSlot = cpuName;
+          }
         }
       }
     }
@@ -60,18 +67,37 @@ struct GPULaunchRecognitionPass
       feat = attr.getFeaturesString();
     }
 
-    auto chip = sm;
-    if (chip.size() == 0)
-      chip = "sm_80";
-    auto features = feat;
-    if (features.size() == 0)
-      features = "+ptx73";
-
-    // TODO get these target attrs from somewhere
-    auto target = moduleBuilder.getAttr<NVVM::NVVMTargetAttr>(
-        /*optLevel=*/2, /*triple=*/"nvptx64-nvidia-cuda", chip, features,
-        /*flags=*/nullptr,
-        /*linkLibs=*/nullptr);
+    // Pick the GPU target attribute for the requested backend. Chip and
+    // features are taken from the kernel's attributes when present, with a
+    // per-backend fallback otherwise.
+    // TODO get these target attrs from somewhere
+    Attribute target;
+    if (backend == "rocm") {
+      auto chip = gfx;
+      if (chip.empty())
+        chip = "gfx900";
+      // Features come from target_features attribute (e.g., "+wavefrontsize64")
+      // Unlike NVVM there is no fallback: an empty string is passed through.
+      // NOTE(review): "500" is assumed to be an ABI version the ROCDL target
+      // accepts; confirm it matches the ROCm toolchain in use.
+      target = moduleBuilder.getAttr<ROCDL::ROCDLTargetAttr>(
+          /*optLevel=*/2, /*triple=*/"amdgcn-amd-amdhsa", chip, feat,
+          /*abiVersion=*/"500",
+          /*flags=*/nullptr,
+          /*linkLibs=*/nullptr);
+    } else {
+      // Default to CUDA/NVVM
+      auto chip = sm;
+      if (chip.empty())
+        chip = "sm_80";
+      auto features = feat;
+      if (features.empty())
+        features = "+ptx73";
+      target = moduleBuilder.getAttr<NVVM::NVVMTargetAttr>(
+          /*optLevel=*/2, /*triple=*/"nvptx64-nvidia-cuda", chip, features,
+          /*flags=*/nullptr,
+          /*linkLibs=*/nullptr);
+    }
     gpuModule.setTargetsAttr(moduleBuilder.getArrayAttr({target}));
 
     DataLayoutSpecInterface dataLayout = {};

diff --git a/src/enzyme_ad/jax/Passes/Passes.td b/src/enzyme_ad/jax/Passes/Passes.td
@@ -784,20 +784,29 @@ def CuDNNHLOOpt : Pass<"enzymexla-cudnn-hlo-opt"> {
 }
 
 def GPULaunchRecognition : Pass<"gpu-launch-recognition"> {
-  let summary = "Optimize stablehlo to emit cuDNN specific optimizations";
+  let summary = "Recognize and convert GPU kernel launches to GPU dialect operations";
   let dependentDialects = [
     "enzymexla::EnzymeXLADialect",
     "arith::ArithDialect", 
     "gpu::GPUDialect",
     "mlir::NVVM::NVVMDialect",
+    "mlir::ROCDL::ROCDLDialect",
     "mlir::DLTIDialect"
   ];
-  let options = [Option<
+  let options = [
+    Option<
        /*C++ variable name=*/"use_launch_func",
        /*CLI argument=*/"use_launch_func",
        /*type=*/"bool",
        /*default=*/"false",
-       /*description=*/"Convert Periodic Concat to Manual Computation with CollectivePermute">];
+       /*description=*/"Convert Periodic Concat to Manual Computation with CollectivePermute">,
+    Option<
+       /*C++ variable name=*/"backend",
+       /*CLI argument=*/"backend",
+       /*type=*/"std::string",
+       /*default=*/"\"cuda\"",
+       /*description=*/"Target GPU backend: rocm selects a ROCDL target; any other value selects NVVM">
+    ];
 }
 
 def MergeGPUModulesPass : Pass<"merge-gpu-modules", "mlir::ModuleOp"> {

diff --git a/src/enzyme_ad/jax/raise.cpp b/src/enzyme_ad/jax/raise.cpp
@@ -85,7 +85,11 @@ extern "C" std::string runLLVMToMLIRRoundTrip(std::string input,
   std::string pass_pipeline =
       "inline{default-pipeline=canonicalize "
       "max-iterations=4},sroa-wrappers{set_private=false attributor=false},gpu-launch-"
-      "recognition,canonicalize,libdevice-funcs-raise,canonicalize,symbol-dce,";
+      "recognition{backend=";
+  // Forward the backend so gpu-launch-recognition emits the matching target.
+  pass_pipeline += backend;
+  pass_pipeline +=
+      "},canonicalize,libdevice-funcs-raise,canonicalize,symbol-dce,";
 
   if (backend == "cpu")
     pass_pipeline += "parallel-lower{wrapParallelOps=false},";