Skip to content

Commit 5be40a0

Browse files
committed
feat: add support for custom compile options in torch_xla.compile and PJRT backend
This change introduces the ability to pass custom compile options from Python down to the PJRT backend, allowing users to fine-tune XLA compilation behavior without modifying core code. Key changes: * Python API * Added custom_compile_options parameter to torch_xla.compile for passing compile-time options as a dict (supports bool, float, int, and str values). * Added torch_xla.set_custom_compile_options() utility for setting compile options globally. * Added internal binding _XLAC._set_custom_compile_options(). * C++ Runtime * Added SetCustomCompileOptions() virtual method to ComputationClient and implemented it in PjRtComputationClient. * PjRtComputationClient now stores custom_compile_options_ and injects them into xla::CompileOptions.env_option_overrides during compilation. * Options are stringified before being passed to XLA for compatibility. Motivation:
This enables advanced users to pass through backend-specific tuning flags (e.g., enabling experimental optimizations, toggling partitioning strategies) without hardcoding them, improving flexibility for research and debugging workflows.
1 parent 7d989d1 commit 5be40a0

File tree

6 files changed

+55
-1
lines changed

6 files changed

+55
-1
lines changed

torch_xla/csrc/init_python_bindings.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3187,6 +3187,16 @@ void InitXlaModuleBindings(py::module m) {
31873187
XLA_ERROR() << "Could not get the buffer pointer for XLATensor "
31883188
"without a data handle or an IR.";
31893189
})
3190+
.def("_set_custom_compile_options",
     [](const py::dict& compile_options) {
       // Convert the Python dict into a string->string map for the
       // runtime. Keys must be Python strings; values of any type are
       // stringified via their Python str() form.
       // NOTE(review): py::str renders Python bools as "True"/"False" —
       // assumes XLA's option parsing accepts that casing; confirm.
       std::unordered_map<std::string, std::string> stringified;
       for (const auto& kv : compile_options) {
         const std::string option_name = kv.first.cast<std::string>();
         stringified[option_name] = py::str(kv.second).cast<std::string>();
       }
       // Hand the options to the active computation client; they apply
       // to subsequent compilations.
       runtime::GetComputationClientOrDie()->SetCustomCompileOptions(
           stringified);
     })
31903200
.def(
31913201
// from an XLA tensor to a PyCapsule.
31923202
// When consuming the PyCapsule, we should synchronize

torch_xla/csrc/runtime/computation_client.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -444,6 +444,9 @@ class ComputationClient {
444444
// after the last ':' character of the device string.
445445
static int64_t GetDeviceOrdinal(const std::string& device);
446446

447+
virtual void SetCustomCompileOptions(
448+
const std::unordered_map<std::string, std::string>& options) = 0;
449+
447450
protected:
448451
static constexpr auto spmd_device_str = "SPMD:0";
449452

torch_xla/csrc/runtime/ifrt_computation_client.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,11 @@ class IfrtComputationClient : public ComputationClient {
174174
XLA_ERROR() << __FUNCTION__ << " not implemented";
175175
}
176176

177+
void SetCustomCompileOptions(
178+
const std::unordered_map<std::string, std::string>& options) override {
179+
XLA_ERROR() << __FUNCTION__ << " not implemented";
180+
}
181+
177182
// Creates a new instance of IfrtComputationClient and initializes it.
178183
static absl::StatusOr<absl_nonnull std::unique_ptr<IfrtComputationClient>>
179184
Create();

torch_xla/csrc/runtime/pjrt_computation_client.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -558,13 +558,18 @@ std::vector<ComputationClient::ComputationPtr> PjRtComputationClient::Compile(
558558

559559
for (auto& instance : instances) {
560560
xla::CompileOptions compile_options;
561+
for (auto& option : custom_compile_options_) {
562+
compile_options.env_option_overrides.push_back(
563+
{option.first, option.second});
564+
}
561565
if (enable_cm_in_mp) {
562566
compile_options.executable_build_options.set_use_spmd_partitioning(true);
563567
compile_options.env_option_overrides.push_back(
564568
{"xla_tpu_decompose_all_gather_einsum", true});
565569
compile_options.env_option_overrides.push_back(
566570
{"xla_tpu_decompose_einsum_reduce_scatter", true});
567571
}
572+
568573
if (instance.is_sharded) {
569574
// TODO(yeounoh) multi-host, multi-slice configurations
570575
compile_options.executable_build_options.set_use_spmd_partitioning(true);
@@ -1093,5 +1098,14 @@ void PjRtComputationClient::OnReadyCallback(
10931098
[callback](absl::Status unused) { callback(); });
10941099
}
10951100

1101+
void PjRtComputationClient::SetCustomCompileOptions(
1102+
const std::unordered_map<std::string, std::string>& options) {
1103+
// Stringfy values
1104+
custom_compile_options_.clear();
1105+
for (const auto& [key, value] : options) {
1106+
custom_compile_options_[key] = value;
1107+
}
1108+
}
1109+
10961110
} // namespace runtime
10971111
} // namespace torch_xla

torch_xla/csrc/runtime/pjrt_computation_client.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,9 @@ class PjRtComputationClient : public ComputationClient {
172172
void OnReadyCallback(DataPtr data,
173173
const std::function<void()>& callback) override;
174174

175+
void SetCustomCompileOptions(
176+
const std::unordered_map<std::string, std::string>& options) override;
177+
175178
// Creates a new instance of PjRtComputationClient and initializes it.
176179
static absl::StatusOr<absl_nonnull std::unique_ptr<PjRtComputationClient>>
177180
Create();
@@ -204,6 +207,7 @@ class PjRtComputationClient : public ComputationClient {
204207
// If not nullptr, invoke this instead of the actual XLA compilation. Used
205208
// only for testing.
206209
std::function<absl::Status()> fake_xla_compile_ = nullptr;
210+
std::unordered_map<std::string, std::string> custom_compile_options_;
207211

208212
xla::PjRtDevice* StringToPjRtDevice(const std::string& device);
209213

torch_xla/torch_xla.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,7 @@ def compile(
116116
full_graph: Optional[bool] = False,
117117
name: Optional[str] = None,
118118
max_different_graphs: Optional[int] = None,
119+
custom_compile_options: Optional[dict] = None,
119120
):
120121
"""
121122
Optimizes given model/function using torch_xla's LazyTensor tracing mode.
@@ -136,6 +137,8 @@ def compile(
136137
max_different_graphs (Optional[int]): number of different traced graphs of the given
137138
model/function that we are allowed to have. An error will be raised in case this limit
138139
is exceeded.
140+
custom_compile_options (Optional[dict]): A dictionary of custom compile options to be set.
141+
The keys are strings and the values can be of type bool, float, int, or str.
139142
140143
Example::
141144
@@ -214,7 +217,8 @@ def _compile():
214217
sync()
215218
torch_xla._XLAC._set_use_eager_mode(saved_eager_mode_status)
216219
torch_xla._XLAC._set_current_graph_name(saved_current_graph_name)
217-
220+
if custom_compile_options is not None and len(custom_compile_options) > 0:
221+
torch_xla._XLAC._set_custom_compile_options(custom_compile_options)
218222
return _compile() if f is None else _compile()(f)
219223

220224

@@ -264,3 +268,17 @@ def launch(
264268
fn(xu.getenv_as(xenv.LOCAL_RANK, int), *args)
265269
else:
266270
xmp.spawn(fn, args=args, nprocs=nprocs, start_method=start_method)
271+
272+
def set_custom_compile_options(
    options: Optional[dict] = None,
):
  """Sets custom compile options for the XLA compilation.

  Args:
    options: A dictionary of custom compile options to be set.
      The keys are strings and the values can be of type bool, float, int, or str.
  """
  # An absent mapping is treated the same as an empty one.
  torch_xla._XLAC._set_custom_compile_options({} if options is None else options)
284+

0 commit comments

Comments
 (0)