Skip to content

Commit 7a28b74

Browse files
junwhanahn authored and Google-ML-Automation committed
Use an unbounded thread pool for StreamExecutor PjRt clients
If `tsl::thread::ThreadPool` runs out of threads, it runs the callback inline on the thread that called `Schedule()`. This pattern is known to be prone to deadlocks (e.g., if `Schedule` is called while holding a mutex that the callable needs to acquire, it will deadlock) and may harm the asynchrony in the tail case, so switching to an unbounded thread pool based on `tsl::UnboundedWorkQueue`. Since `tsl::thread::ThreadPool` is not extensible, this requires replacing the executor type from `tsl::thread::ThreadPool` to `AsyncWorkRunner`. The existing thread pool is still kept because `ExecutableBuildOptions::set_compile_thread_pool()` requires `tsl::thread::ThreadPool`. Reverts 485e912 PiperOrigin-RevId: 853019406
1 parent c8276b6 commit 7a28b74

22 files changed

+271
-124
lines changed

xla/backends/cpu/nanort/ifrt_client.cc

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ limitations under the License.
2424
#include <functional>
2525
#include <iterator>
2626
#include <memory>
27+
#include <new>
2728
#include <optional>
2829
#include <string>
2930
#include <utility>
@@ -48,6 +49,7 @@ limitations under the License.
4849
#include "xla/backends/cpu/alignment.h"
4950
#include "xla/backends/cpu/nanort/nanort_executable.h"
5051
#include "xla/hlo/builder/xla_computation.h"
52+
#include "xla/hlo/ir/hlo_input_output_alias_config.h"
5153
#include "xla/hlo/ir/hlo_module.h"
5254
#include "xla/hlo/ir/hlo_sharding.h"
5355
#include "xla/layout.h"
@@ -828,6 +830,8 @@ class NanoExecutable final
828830
TF_ASSIGN_OR_RETURN(auto nano_executable,
829831
client->nano_client()->Compile(computation));
830832

833+
TF_ASSIGN_OR_RETURN(auto donatable_input_indices,
834+
GetDonatableInputIndices(computation));
831835
TF_ASSIGN_OR_RETURN(auto program_shape, computation.GetProgramShape());
832836
TF_ASSIGN_OR_RETURN(auto proto_input_shardings,
833837
GetInputShardings(program_shape, computation));
@@ -840,8 +844,8 @@ class NanoExecutable final
840844

841845
return absl::WrapUnique(new NanoExecutable(
842846
client, std::move(computation), std::move(program_shape),
843-
std::move(nano_executable), std::move(input_shardings),
844-
std::move(output_shardings)));
847+
std::move(nano_executable), std::move(donatable_input_indices),
848+
std::move(input_shardings), std::move(output_shardings)));
845849
}
846850

847851
ifrt::Client* client() const override { return client_; }
@@ -850,8 +854,7 @@ class NanoExecutable final
850854

851855
absl::StatusOr<absl::Span<const int>> GetDonatableInputIndices()
852856
const override {
853-
return absl::UnimplementedError(
854-
"NanoExecutable::GetDonatableInputIndices is not implemented.");
857+
return donatable_input_indices_;
855858
}
856859

857860
absl::StatusOr<ExecuteResult> Execute(
@@ -1026,13 +1029,15 @@ class NanoExecutable final
10261029
NanoExecutable(NanoIfrtClient* client, XlaComputation program,
10271030
ProgramShape program_shape,
10281031
std::unique_ptr<NanoRtExecutable> executable,
1032+
std::vector<int> donatable_input_indices,
10291033
std::vector<ifrt::ShardingRef> input_shardings,
10301034
std::vector<ifrt::ShardingRef> output_shardings)
10311035
: client_(client),
10321036
devices_(ifrt::BasicDeviceList::Create(client->devices())),
10331037
program_(std::move(program)),
10341038
program_shape_(std::move(program_shape)),
10351039
executable_(std::move(executable)),
1040+
donatable_input_indices_(std::move(donatable_input_indices)),
10361041
input_shardings_(std::move(input_shardings)),
10371042
output_shardings_(std::move(output_shardings)),
10381043
user_context_(xla::ifrt::UserContextScope::current()) {}
@@ -1068,6 +1073,29 @@ class NanoExecutable final
10681073
return result;
10691074
}
10701075

1076+
// Returns a list of donatable input indices from the given HLO modules.
1077+
static absl::StatusOr<std::vector<int>> GetDonatableInputIndices(
1078+
const XlaComputation& xla_computation) {
1079+
const HloModuleProto& hlo_module_proto = xla_computation.proto();
1080+
std::vector<int> donatable_input_indices;
1081+
for (const auto& alias : hlo_module_proto.input_output_alias().entries()) {
1082+
if (alias.parameter_shape_index().empty()) {
1083+
donatable_input_indices.push_back(alias.parameter_number());
1084+
} else {
1085+
donatable_input_indices.push_back(alias.parameter_shape_index(0));
1086+
}
1087+
}
1088+
for (const auto& buffer_donor : hlo_module_proto.buffer_donor().entries()) {
1089+
if (buffer_donor.parameter_shape_index().empty()) {
1090+
donatable_input_indices.push_back(buffer_donor.parameter_number());
1091+
} else {
1092+
donatable_input_indices.push_back(
1093+
buffer_donor.parameter_shape_index(0));
1094+
}
1095+
}
1096+
return donatable_input_indices;
1097+
}
1098+
10711099
static absl::StatusOr<std::vector<OpSharding>> GetInputShardings(
10721100
const ProgramShape& program_shape, const XlaComputation& computation) {
10731101
std::vector<OpSharding> shardings(program_shape.parameters().size());
@@ -1176,6 +1204,7 @@ class NanoExecutable final
11761204
XlaComputation program_;
11771205
ProgramShape program_shape_;
11781206
std::unique_ptr<NanoRtExecutable> executable_;
1207+
std::vector<int> donatable_input_indices_;
11791208
std::vector<ifrt::ShardingRef> input_shardings_;
11801209
std::vector<ifrt::ShardingRef> output_shardings_;
11811210
const xla::ifrt::UserContextRef user_context_;

xla/pjrt/BUILD

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,7 @@ cc_library(
219219
"local_device_state.h",
220220
],
221221
deps = [
222+
":async_work_runner",
222223
":event_pool",
223224
":pjrt_common",
224225
":semaphore",
@@ -676,6 +677,7 @@ cc_library(
676677
visibility = internal_visibility(["//xla:friends"]),
677678
deps = [
678679
":abstract_tracked_device_buffer",
680+
":async_work_runner",
679681
":common_pjrt_client",
680682
":device_event",
681683
":event_pool",
@@ -1263,10 +1265,12 @@ cc_library(
12631265

12641266
cc_library(
12651267
name = "async_work_runner",
1268+
srcs = ["async_work_runner.cc"],
12661269
hdrs = ["async_work_runner.h"],
12671270
visibility = internal_visibility([":friends"]),
12681271
deps = [
12691272
"//xla/tsl/concurrency:async_value",
1273+
"//xla/tsl/concurrency:executor",
12701274
"//xla/tsl/concurrency:ref_count",
12711275
"@com_google_absl//absl/functional:any_invocable",
12721276
"@com_google_absl//absl/types:span",
@@ -1284,6 +1288,7 @@ cc_library(
12841288
"//xla/tsl/platform:env",
12851289
"@com_google_absl//absl/functional:any_invocable",
12861290
"@com_google_absl//absl/types:span",
1291+
"@tsl//tsl/platform:unbounded_work_queue",
12871292
],
12881293
)
12891294

xla/pjrt/async_work_runner.cc

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
/* Copyright 2026 The OpenXLA Authors.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
http://www.apache.org/licenses/LICENSE-2.0
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License.
14+
==============================================================================*/
15+
16+
#include "xla/pjrt/async_work_runner.h"
17+
18+
#include <memory>
19+
#include <utility>
20+
21+
#include "xla/tsl/concurrency/executor.h"
22+
23+
namespace xla {
24+
25+
namespace {
26+
27+
class AsyncWorkRunnerExecutor : public tsl::Executor {
28+
public:
29+
explicit AsyncWorkRunnerExecutor(AsyncWorkRunner* runner) : runner_(runner) {}
30+
31+
void Execute(Task task) override { runner_->Schedule(std::move(task)); }
32+
33+
private:
34+
AsyncWorkRunner* const runner_;
35+
};
36+
37+
} // namespace
38+
39+
AsyncWorkRunner::AsyncWorkRunner()
40+
: executor_(std::make_unique<AsyncWorkRunnerExecutor>(this)) {}
41+
42+
} // namespace xla

xla/pjrt/async_work_runner.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,12 @@ limitations under the License.
1616
#ifndef XLA_PJRT_ASYNC_WORK_RUNNER_H_
1717
#define XLA_PJRT_ASYNC_WORK_RUNNER_H_
1818

19+
#include <memory>
20+
1921
#include "absl/functional/any_invocable.h"
2022
#include "absl/types/span.h"
2123
#include "xla/tsl/concurrency/async_value.h"
24+
#include "xla/tsl/concurrency/executor.h"
2225
#include "xla/tsl/concurrency/ref_count.h"
2326

2427
namespace xla {
@@ -27,13 +30,22 @@ namespace xla {
2730
// pool (or concurrent work queue).
2831
class AsyncWorkRunner {
2932
public:
33+
AsyncWorkRunner();
3034
virtual ~AsyncWorkRunner() = default;
3135

3236
// `work` enqueued by `Schedule` may run on the calling thread.
3337
virtual void Schedule(absl::AnyInvocable<void() &&> work) = 0;
3438
virtual void ScheduleWhenReady(
3539
absl::Span<const tsl::RCReference<tsl::AsyncValue>> values,
3640
absl::AnyInvocable<void() &&> work) = 0;
41+
42+
// Returns a tsl::Executor implementation that is backed by this async work
43+
// runner. The returned executor is owned by the async work runner and its
44+
// lifetime is bound to the lifetime of the async work runner itself.
45+
virtual tsl::Executor& AsExecutor() { return *executor_; }
46+
47+
private:
48+
std::unique_ptr<tsl::Executor> executor_;
3749
};
3850

3951
} // namespace xla

xla/pjrt/buffer_sequencing_event.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ void BufferSequencingEvent::ExecuteOrAddToFutureTasks(
133133
// Execute the `task` when definition event becomes available. If it's already
134134
// available, the task will be executed immediately.
135135
event_.AndThen([this, traced_task = std::move(traced_task)]() mutable {
136-
thread_pool_->Schedule(std::move(traced_task));
136+
async_work_runner_->Schedule(std::move(traced_task));
137137
});
138138
}
139139

xla/pjrt/buffer_sequencing_event.h

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,11 @@ limitations under the License.
2626
#include "absl/log/log.h"
2727
#include "absl/status/status.h"
2828
#include "absl/synchronization/mutex.h"
29+
#include "xla/pjrt/async_work_runner.h"
2930
#include "xla/pjrt/event_pool.h"
3031
#include "xla/stream_executor/stream.h"
3132
#include "xla/tsl/concurrency/async_value.h"
3233
#include "xla/tsl/concurrency/async_value_ref.h"
33-
#include "xla/tsl/platform/threadpool.h"
3434

3535
namespace xla {
3636

@@ -71,18 +71,18 @@ class BufferSequencingEvent : tsl::AsyncPayload::KeepOnError {
7171
se::Stream* definition_stream;
7272
};
7373

74-
explicit BufferSequencingEvent(tsl::thread::ThreadPool* thread_pool)
75-
: thread_pool_(thread_pool),
74+
explicit BufferSequencingEvent(AsyncWorkRunner* async_work_runner)
75+
: async_work_runner_(async_work_runner),
7676
event_(tsl::MakeUnconstructedAsyncValueRef<EventState>()) {}
7777

78-
explicit BufferSequencingEvent(tsl::thread::ThreadPool* thread_pool,
78+
explicit BufferSequencingEvent(AsyncWorkRunner* async_work_runner,
7979
tsl::AsyncValueRef<EventState> event)
80-
: thread_pool_(thread_pool), event_(event) {}
80+
: async_work_runner_(async_work_runner), event_(event) {}
8181

8282
static tsl::AsyncValueRef<BufferSequencingEvent> Create(
83-
tsl::thread::ThreadPool* thread_pool) {
83+
AsyncWorkRunner* async_work_runner) {
8484
return tsl::MakeConstructedAsyncValueRef<BufferSequencingEvent>(
85-
thread_pool);
85+
async_work_runner);
8686
}
8787

8888
// Sets the sequencing event to 'event', which is recorded on 'stream'. Must
@@ -164,7 +164,7 @@ class BufferSequencingEvent : tsl::AsyncPayload::KeepOnError {
164164
// at the tail of the queue, i.e., for any newly enqueued command.
165165
absl::InlinedVector<se::Stream*, 2> streams_defined_on_ ABSL_GUARDED_BY(mu_);
166166

167-
tsl::thread::ThreadPool* thread_pool_;
167+
AsyncWorkRunner* async_work_runner_;
168168

169169
// Indicates if the buffer is in an error status. And error status is used to
170170
// propagate the error to the buffer consumers.

xla/pjrt/gpu/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ cc_library(
7979
"//xla/core/collectives:rank_id",
8080
"//xla/hlo/builder:xla_computation",
8181
"//xla/pjrt:abstract_tracked_device_buffer",
82+
"//xla/pjrt:async_work_runner",
8283
"//xla/pjrt:common_pjrt_client",
8384
"//xla/pjrt:device_event",
8485
"//xla/pjrt:event_pool",

xla/pjrt/gpu/se_gpu_pjrt_client.cc

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ limitations under the License.
6464
#include "xla/layout.h"
6565
#include "xla/literal.h"
6666
#include "xla/pjrt/abstract_tracked_device_buffer.h"
67+
#include "xla/pjrt/async_work_runner.h"
6768
#include "xla/pjrt/buffer_sequencing_event.h"
6869
#include "xla/pjrt/common_pjrt_client.h"
6970
#include "xla/pjrt/device_event.h"
@@ -152,11 +153,11 @@ limitations under the License.
152153
namespace xla {
153154

154155
absl::Status RunCallbackOnStream(se::Stream* stream,
155-
tsl::thread::ThreadPool* thread_pool,
156+
AsyncWorkRunner* async_work_runner,
156157
absl::AnyInvocable<void() &&> callback) {
157158
return stream->DoHostCallbackWithStatus(
158-
[cb = std::move(callback), thread_pool]() mutable {
159-
thread_pool->Schedule(
159+
[cb = std::move(callback), async_work_runner]() mutable {
160+
async_work_runner->Schedule(
160161
[cb_ptr = new absl::AnyInvocable<void() &&>(std::move(cb))]() {
161162
std::move (*cb_ptr)();
162163
delete cb_ptr;
@@ -761,7 +762,7 @@ void StreamExecutorGpuClient::ScheduleSendsOnLocalDevice(
761762
gpu::GpuCollectives* gpu_collectives =
762763
gpu::GpuCollectives::Default(stream->parent()->GetPlatform()->Name());
763764
usage_event = tsl::MakeRef<PjRtStreamExecutorDeviceEvent>(
764-
BufferSequencingEvent::Create(this->thread_pool()));
765+
BufferSequencingEvent::Create(this->async_work_runner()));
765766

766767
gpu::AcquiredCliquesMap acquired_cliques_map;
767768
for (int i = 0; i < buffers.size(); ++i) {
@@ -853,7 +854,7 @@ void StreamExecutorGpuClient::ScheduleSendsOnLocalDevice(
853854
Future<> all_sends_future = JoinFutures(group_futures);
854855

855856
all_sends_future.OnReady(
856-
*this->thread_pool()->AsExecutor(),
857+
this->async_work_runner()->AsExecutor(),
857858
[this, local_device_state, stream, promises = std::move(promises),
858859
usage_event, grouped_sends = std::move(grouped_sends)](
859860
const absl::Status& status) mutable {
@@ -870,7 +871,7 @@ void StreamExecutorGpuClient::ScheduleSendsOnLocalDevice(
870871
// Asynchronously fulfill promises via a host callback, failing them
871872
// early if there is an issue registering the callback.
872873
absl::Status callback_status = RunCallbackOnStream(
873-
stream, this->thread_pool(), [promises]() mutable {
874+
stream, this->async_work_runner(), [promises]() mutable {
874875
FulfillPromises(promises, absl::OkStatus());
875876
});
876877

@@ -911,7 +912,7 @@ StreamExecutorGpuClient::PrepareReceiveBuffer(PjRtDevice* device, Shape shape) {
911912
se::Stream* stream = local_device->GetDeviceToDeviceStream();
912913

913914
BufferSequencingEventRef definition_event =
914-
BufferSequencingEvent::Create(this->thread_pool());
915+
BufferSequencingEvent::Create(this->async_work_runner());
915916
TF_ASSIGN_OR_RETURN(
916917
auto buffer,
917918
DefineBuffer(
@@ -981,7 +982,7 @@ StreamExecutorGpuClient::CrossHostReceiveBuffers(
981982
gpu::GpuCollectives* gpu_collectives =
982983
gpu::GpuCollectives::Default(stream->parent()->GetPlatform()->Name());
983984
definition_event = tsl::MakeRef<PjRtStreamExecutorDeviceEvent>(
984-
BufferSequencingEvent::Create(this->thread_pool()));
985+
BufferSequencingEvent::Create(this->async_work_runner()));
985986

986987
gpu::AcquiredCliquesMap acquired_cliques_map;
987988
for (int i = 0; i < shapes.size(); ++i) {
@@ -1064,7 +1065,7 @@ StreamExecutorGpuClient::CrossHostReceiveBuffers(
10641065
Future<> all_receives_future = JoinFutures(group_futures);
10651066

10661067
all_receives_future.OnReady(
1067-
*this->thread_pool()->AsExecutor(),
1068+
this->async_work_runner()->AsExecutor(),
10681069
[this, local_device_state, stream,
10691070
grouped_receives = std::move(grouped_receives),
10701071
definition_event = std::move(definition_event)](
@@ -1105,7 +1106,7 @@ void StreamExecutorGpuClient::ScheduleRemoteSend(
11051106
}
11061107

11071108
BufferSequencingEventRef usage_event =
1108-
BufferSequencingEvent::Create(this->thread_pool());
1109+
BufferSequencingEvent::Create(this->async_work_runner());
11091110

11101111
// Keep memory alive until the event is done.
11111112
usage_event.AndThen([raw_buffer]() {});
@@ -1259,7 +1260,7 @@ StreamExecutorGpuClient::MakeCrossHostReceiveBuffers(
12591260
SetEventAsError(definition_event, s);
12601261
}
12611262
};
1262-
thread_pool()->Schedule(recv);
1263+
async_work_runner()->Schedule(recv);
12631264

12641265
std::vector<std::unique_ptr<PjRtBuffer>> buffers;
12651266
buffers.push_back(std::move(receive_prep_result.buffer));

0 commit comments

Comments
 (0)