@@ -12,135 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include <Python.h>

-#include <cstddef>
+#include <cstdint>
 #include <string>
-#include <string_view>
-#include <utility>

 #include "nanobind/nanobind.h"
 #include "absl/status/status.h"
+#include "absl/strings/str_cat.h"
 #include "third_party/gpus/cuda/include/cuda.h"
-#include "jaxlib/kernel_nanobind_helpers.h"
-#include "xla/ffi/api/c_api.h"
-#include "xla/pjrt/c/pjrt_c_api.h"
-#include "xla/pjrt/c/pjrt_c_api_gpu_extension.h"
-#include "xla/pjrt/c/pjrt_c_api_helpers.h"
+#include "jaxlib/gpu_plugin_extension.h"
 #include "xla/pjrt/status_casters.h"
-#include "xla/python/py_client_gpu.h"
-#include "xla/tsl/python/lib/core/numpy.h"
-#include "xla/util.h"

 namespace nb = nanobind;

 namespace xla {
 namespace {
-absl::Status RegisterCustomCallTarget(const PJRT_Api* c_api,
-                                      const char* fn_name_c_str,
-                                      size_t fn_name_size, nb::object fn,
-                                      int api_version,
-                                      XLA_FFI_Handler_Traits traits) {
-  if (c_api->extension_start == nullptr) {
-    return Unimplemented("The plugin does not have extension.");
-  }
-  const PJRT_Extension_Base* next =
-      reinterpret_cast<const PJRT_Extension_Base*>(c_api->extension_start);
-  while (next != nullptr &&
-         next->type !=
-             PJRT_Extension_Type::PJRT_Extension_Type_Gpu_Custom_Call) {
-    next = next->next;
-  }
-  if (next == nullptr) {
-    return Unimplemented("The plugin does not have a custom call extension.");
-  }
-  PJRT_Gpu_Register_Custom_Call* register_custom_call =
-      reinterpret_cast<const PJRT_Gpu_Custom_Call*>(next)->custom_call;
-
-  if (traits != 0) {
-    return Unimplemented("The plugin does not support custom call traits.");
-  }
-
-  PJRT_Gpu_Register_Custom_Call_Args args;
-  args.struct_size = PJRT_Gpu_Register_Custom_Call_Args_STRUCT_SIZE;
-  args.function_name = fn_name_c_str;
-  args.function_name_size = fn_name_size;
-
-#if PJRT_API_GPU_EXTENSION_VERSION >= 1
-  args.api_version = api_version;
-#endif
-
-  auto as_capsule = [](nb::object obj) -> absl::StatusOr<nb::capsule> {
-    nb::capsule capsule;
-    if (!nb::try_cast<nb::capsule>(obj, capsule)) {
-      return absl::InvalidArgumentError(
-          "Custom call target registration requires handlers as PyCapsules");
-    }
-    return capsule;
-  };
-
-#if PJRT_API_GPU_EXTENSION_VERSION <= 1
-  TF_ASSIGN_OR_RETURN(nb::capsule fn_execute, as_capsule(fn));
-  args.custom_call_function = fn_execute.data();
-  RETURN_STATUS_IF_PJRT_ERROR(register_custom_call(&args), c_api);
-  return absl::OkStatus();
-#else
-  args.handler_instantiate = nullptr;
-  args.handler_prepare = nullptr;
-  args.handler_initialize = nullptr;
-  args.handler_execute = nullptr;
-
-  // Register legacy custom call target (untyped void* API).
-  if (api_version == 0) {
-    TF_ASSIGN_OR_RETURN(nb::capsule capsule_execute, as_capsule(fn));
-    args.handler_execute = capsule_execute.data();
-    RETURN_STATUS_IF_PJRT_ERROR(register_custom_call(&args), c_api);
-    return absl::OkStatus();
-  }
-
-  // Register XLA FFI handler (typed API with explicit function signatures).
-  if (api_version == 1) {
-    auto capsule_execute = as_capsule(fn);
-    if (capsule_execute.ok()) {
-      args.handler_execute = capsule_execute->data();
-      RETURN_STATUS_IF_PJRT_ERROR(register_custom_call(&args), c_api);
-      return absl::OkStatus();
-    }
-
-    nb::dict bundle;
-    if (nb::try_cast<nb::dict>(fn, bundle)) {
-      auto handler = [&](const char* name) -> absl::StatusOr<void*> {
-        if (!bundle.contains(name)) return nullptr;
-        TF_ASSIGN_OR_RETURN(nb::capsule capsule, as_capsule(bundle[name]));
-        return capsule.data();
-      };
-
-      TF_ASSIGN_OR_RETURN(args.handler_instantiate, handler("instantiate"));
-      TF_ASSIGN_OR_RETURN(args.handler_prepare, handler("prepare"));
-      TF_ASSIGN_OR_RETURN(args.handler_initialize, handler("initialize"));
-      TF_ASSIGN_OR_RETURN(args.handler_execute, handler("execute"));
-      RETURN_STATUS_IF_PJRT_ERROR(register_custom_call(&args), c_api);
-      return absl::OkStatus();
-    }
-
-    return absl::InvalidArgumentError(
-        "Unsupported custom call target type for api_version=1");
-  }
-
-  return absl::UnimplementedError(absl::StrFormat(
-      "API version %d is not supported by RegisterCustomCallTarget. "
-      "Supported versions are 0 and 1.",
-      api_version));
-#endif
-}
-
-nb::dict Registrations() {
-  nb::dict dict;
-  dict["xla_python_gpu_callback"] =
-      jax::EncapsulateFunction(xla::XlaPythonGpuCallback);
-  return dict;
-}
-
 static std::string ToString(CUresult result) {
   const char* error_name;
   if (cuGetErrorName(result, &error_name)) {
@@ -155,31 +41,7 @@ static std::string ToString(CUresult result) {
 }  // namespace

 NB_MODULE(cuda_plugin_extension, m) {
-  tsl::ImportNumpy();
-  m.def(
-      "register_custom_call_target",
-      [](nb::capsule c_api, nb::object fn_name_py, nb::object fn,
-         nb::str xla_platform_name, int api_version,
-         XLA_FFI_Handler_Traits traits) {
-        const char* fn_name_c_str;
-        size_t fn_name_size;
-        nb::str fn_name_bn_str;
-        if (nb::try_cast<nb::str>(fn_name_py, fn_name_bn_str)) {
-          fn_name_c_str = fn_name_bn_str.c_str();
-          fn_name_size = nb::len(fn_name_bn_str);
-        } else {
-          nb::bytes bytes = nb::cast<nb::bytes>(fn_name_py);
-          fn_name_c_str = bytes.c_str();
-          fn_name_size = bytes.size();
-        }
-        xla::ThrowIfError(RegisterCustomCallTarget(
-            static_cast<const PJRT_Api*>(c_api.data()), fn_name_c_str,
-            fn_name_size, std::move(fn), api_version, traits));
-      },
-      nb::arg("c_api"), nb::arg("fn_name"), nb::arg("fn"),
-      nb::arg("xla_platform_name"), nb::arg("api_version") = 0,
-      nb::arg("traits") = 0);
-  m.def("registrations", &Registrations);
+  BuildGpuPluginExtension(m);
   m.def(
       "get_device_ordinal",
       [](std::intptr_t data_value) {