openxla
diff --git a/xla/pjrt/BUILD b/xla/pjrt/BUILD
Lines changed: 5 additions & 0 deletions
diff --git a/xla/pjrt/async_work_runner.cc b/xla/pjrt/async_work_runner.cc
Lines changed: 42 additions & 0 deletions
diff --git a/xla/pjrt/async_work_runner.h b/xla/pjrt/async_work_runner.h
Lines changed: 12 additions & 0 deletions
diff --git a/xla/pjrt/buffer_sequencing_event.cc b/xla/pjrt/buffer_sequencing_event.cc
Lines changed: 1 addition & 1 deletion
diff --git a/xla/pjrt/buffer_sequencing_event.h b/xla/pjrt/buffer_sequencing_event.h
Lines changed: 8 additions & 8 deletions
diff --git a/xla/pjrt/gpu/BUILD b/xla/pjrt/gpu/BUILD
Lines changed: 1 addition & 0 deletions
diff --git a/xla/pjrt/gpu/se_gpu_pjrt_client.cc b/xla/pjrt/gpu/se_gpu_pjrt_client.cc
Lines changed: 12 additions & 11 deletions
diff --git a/xla/pjrt/gpu/se_gpu_pjrt_client_test.cc b/xla/pjrt/gpu/se_gpu_pjrt_client_test.cc
Lines changed: 6 additions & 6 deletions
diff --git a/xla/pjrt/local_device_state.cc b/xla/pjrt/local_device_state.cc
Lines changed: 3 additions & 2 deletions
diff --git a/xla/pjrt/local_device_state.h b/xla/pjrt/local_device_state.h
Lines changed: 2 additions & 1 deletion
@@ -219,6 +219,7 @@ cc_library(
         "local_device_state.h",
     ],
     deps = [
+        ":async_work_runner",
         ":event_pool",
         ":pjrt_common",
         ":semaphore",
@@ -676,6 +677,7 @@ cc_library(
     visibility = internal_visibility(["//xla:friends"]),
     deps = [
         ":abstract_tracked_device_buffer",
+        ":async_work_runner",
         ":common_pjrt_client",
         ":device_event",
         ":event_pool",
@@ -1263,10 +1265,12 @@ cc_library(
 
 cc_library(
     name = "async_work_runner",
+    srcs = ["async_work_runner.cc"],
     hdrs = ["async_work_runner.h"],
     visibility = internal_visibility([":friends"]),
     deps = [
         "//xla/tsl/concurrency:async_value",
+        "//xla/tsl/concurrency:executor",
         "//xla/tsl/concurrency:ref_count",
         "@com_google_absl//absl/functional:any_invocable",
         "@com_google_absl//absl/types:span",
@@ -1284,6 +1288,7 @@ cc_library(
         "//xla/tsl/platform:env",
         "@com_google_absl//absl/functional:any_invocable",
         "@com_google_absl//absl/types:span",
+        "@tsl//tsl/platform:unbounded_work_queue",
     ],
 )
 
 
@@ -0,0 +1,42 @@
+/* Copyright 2026 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/pjrt/async_work_runner.h"
+
+#include <memory>
+#include <utility>
+
+#include "xla/tsl/concurrency/executor.h"
+
+namespace xla {
+
+namespace {
+
+class AsyncWorkRunnerExecutor : public tsl::Executor {  // tsl::Executor view of an AsyncWorkRunner.
+ public:
+  explicit AsyncWorkRunnerExecutor(AsyncWorkRunner* runner) : runner_(runner) {}
+
+  void Execute(Task task) override { runner_->Schedule(std::move(task)); }  // Forwards `task` to the runner.
+
+ private:
+  AsyncWorkRunner* const runner_;  // Back-pointer to the runner; never deleted by this class.
+};
+
+}  // namespace
+
+AsyncWorkRunner::AsyncWorkRunner()  // Creates the executor that forwards tasks to this->Schedule().
+    : executor_(std::make_unique<AsyncWorkRunnerExecutor>(this)) {}
+
+}  // namespace xla
@@ -16,9 +16,12 @@ limitations under the License.
 #ifndef XLA_PJRT_ASYNC_WORK_RUNNER_H_
 #define XLA_PJRT_ASYNC_WORK_RUNNER_H_
 
+#include <memory>
+
 #include "absl/functional/any_invocable.h"
 #include "absl/types/span.h"
 #include "xla/tsl/concurrency/async_value.h"
+#include "xla/tsl/concurrency/executor.h"
 #include "xla/tsl/concurrency/ref_count.h"
 
 namespace xla {
@@ -27,13 +30,22 @@ namespace xla {
 // pool (or concurrent work queue).
 class AsyncWorkRunner {
  public:
+  AsyncWorkRunner();
   virtual ~AsyncWorkRunner() = default;
 
   // `work` euqueued by `Schedule` may run on the calling thread.
   virtual void Schedule(absl::AnyInvocable<void() &&> work) = 0;
   virtual void ScheduleWhenReady(
       absl::Span<const tsl::RCReference<tsl::AsyncValue>> values,
       absl::AnyInvocable<void() &&> work) = 0;
+
+  // Returns a tsl::Executor implementation that is backed by this async work
+  // runner. The returned executor is owned by the async work runner and its
+  // lifetime is bound to the lifetime of the async work runner itself.
+  virtual tsl::Executor& AsExecutor() { return *executor_; }
+
+ private:
+  std::unique_ptr<tsl::Executor> executor_;  // Returned by the default AsExecutor().
 };
 
 }  // namespace xla
 
@@ -133,7 +133,7 @@ void BufferSequencingEvent::ExecuteOrAddToFutureTasks(
   // Execute the `task` when definition event becomes available. If it's already
   // available, the task will be executed immediately.
   event_.AndThen([this, traced_task = std::move(traced_task)]() mutable {
-    thread_pool_->Schedule(std::move(traced_task));
+    async_work_runner_->Schedule(std::move(traced_task));
   });
 }
 
 
@@ -26,11 +26,11 @@ limitations under the License.
 #include "absl/log/log.h"
 #include "absl/status/status.h"
 #include "absl/synchronization/mutex.h"
+#include "xla/pjrt/async_work_runner.h"
 #include "xla/pjrt/event_pool.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/tsl/concurrency/async_value.h"
 #include "xla/tsl/concurrency/async_value_ref.h"
-#include "xla/tsl/platform/threadpool.h"
 
 namespace xla {
 
@@ -71,18 +71,18 @@ class BufferSequencingEvent : tsl::AsyncPayload::KeepOnError {
     se::Stream* definition_stream;
   };
 
-  explicit BufferSequencingEvent(tsl::thread::ThreadPool* thread_pool)
-      : thread_pool_(thread_pool),
+  explicit BufferSequencingEvent(AsyncWorkRunner* async_work_runner)
+      : async_work_runner_(async_work_runner),
         event_(tsl::MakeUnconstructedAsyncValueRef<EventState>()) {}
 
-  explicit BufferSequencingEvent(tsl::thread::ThreadPool* thread_pool,
+  explicit BufferSequencingEvent(AsyncWorkRunner* async_work_runner,
                                  tsl::AsyncValueRef<EventState> event)
-      : thread_pool_(thread_pool), event_(event) {}
+      : async_work_runner_(async_work_runner), event_(event) {}
 
   static tsl::AsyncValueRef<BufferSequencingEvent> Create(
-      tsl::thread::ThreadPool* thread_pool) {
+      AsyncWorkRunner* async_work_runner) {
     return tsl::MakeConstructedAsyncValueRef<BufferSequencingEvent>(
-        thread_pool);
+        async_work_runner);
   }
 
   // Sets the sequencing event to 'event', which is recorded on 'stream'. Must
@@ -164,7 +164,7 @@ class BufferSequencingEvent : tsl::AsyncPayload::KeepOnError {
   // at the tail of the queue, i.e., for any newly enqueued command.
   absl::InlinedVector<se::Stream*, 2> streams_defined_on_ ABSL_GUARDED_BY(mu_);
 
-  tsl::thread::ThreadPool* thread_pool_;
+  AsyncWorkRunner* async_work_runner_;
 
   // Indicates if the buffer is in an error status. And error status is used to
   // propagate the error to the buffer consumers.
 
@@ -79,6 +79,7 @@ cc_library(
         "//xla/core/collectives:rank_id",
         "//xla/hlo/builder:xla_computation",
         "//xla/pjrt:abstract_tracked_device_buffer",
+        "//xla/pjrt:async_work_runner",
         "//xla/pjrt:common_pjrt_client",
         "//xla/pjrt:device_event",
         "//xla/pjrt:event_pool",
 
@@ -64,6 +64,7 @@ limitations under the License.
 #include "xla/layout.h"
 #include "xla/literal.h"
 #include "xla/pjrt/abstract_tracked_device_buffer.h"
+#include "xla/pjrt/async_work_runner.h"
 #include "xla/pjrt/buffer_sequencing_event.h"
 #include "xla/pjrt/common_pjrt_client.h"
 #include "xla/pjrt/device_event.h"
@@ -152,11 +153,11 @@ limitations under the License.
 namespace xla {
 
 absl::Status RunCallbackOnStream(se::Stream* stream,
-                                 tsl::thread::ThreadPool* thread_pool,
+                                 AsyncWorkRunner* async_work_runner,
                                  absl::AnyInvocable<void() &&> callback) {
   return stream->DoHostCallbackWithStatus(
-      [cb = std::move(callback), thread_pool]() mutable {
-        thread_pool->Schedule(
+      [cb = std::move(callback), async_work_runner]() mutable {
+        async_work_runner->Schedule(
             [cb_ptr = new absl::AnyInvocable<void() &&>(std::move(cb))]() {
               std::move (*cb_ptr)();
               delete cb_ptr;
@@ -761,7 +762,7 @@ void StreamExecutorGpuClient::ScheduleSendsOnLocalDevice(
     gpu::GpuCollectives* gpu_collectives =
         gpu::GpuCollectives::Default(stream->parent()->GetPlatform()->Name());
     usage_event = tsl::MakeRef<PjRtStreamExecutorDeviceEvent>(
-        BufferSequencingEvent::Create(this->thread_pool()));
+        BufferSequencingEvent::Create(this->async_work_runner()));
 
     gpu::AcquiredCliquesMap acquired_cliques_map;
     for (int i = 0; i < buffers.size(); ++i) {
@@ -853,7 +854,7 @@ void StreamExecutorGpuClient::ScheduleSendsOnLocalDevice(
     Future<> all_sends_future = JoinFutures(group_futures);
 
     all_sends_future.OnReady(
-        *this->thread_pool()->AsExecutor(),
+        this->async_work_runner()->AsExecutor(),
         [this, local_device_state, stream, promises = std::move(promises),
          usage_event, grouped_sends = std::move(grouped_sends)](
             const absl::Status& status) mutable {
@@ -870,7 +871,7 @@ void StreamExecutorGpuClient::ScheduleSendsOnLocalDevice(
           // Asynchronously fulfill promises via a host callback, failing them
           // early if there is an issue registering the callback.
           absl::Status callback_status = RunCallbackOnStream(
-              stream, this->thread_pool(), [promises]() mutable {
+              stream, this->async_work_runner(), [promises]() mutable {
                 FulfillPromises(promises, absl::OkStatus());
               });
 
@@ -911,7 +912,7 @@ StreamExecutorGpuClient::PrepareReceiveBuffer(PjRtDevice* device, Shape shape) {
   se::Stream* stream = local_device->GetDeviceToDeviceStream();
 
   BufferSequencingEventRef definition_event =
-      BufferSequencingEvent::Create(this->thread_pool());
+      BufferSequencingEvent::Create(this->async_work_runner());
   TF_ASSIGN_OR_RETURN(
       auto buffer,
       DefineBuffer(
@@ -981,7 +982,7 @@ StreamExecutorGpuClient::CrossHostReceiveBuffers(
     gpu::GpuCollectives* gpu_collectives =
         gpu::GpuCollectives::Default(stream->parent()->GetPlatform()->Name());
     definition_event = tsl::MakeRef<PjRtStreamExecutorDeviceEvent>(
-        BufferSequencingEvent::Create(this->thread_pool()));
+        BufferSequencingEvent::Create(this->async_work_runner()));
 
     gpu::AcquiredCliquesMap acquired_cliques_map;
     for (int i = 0; i < shapes.size(); ++i) {
@@ -1064,7 +1065,7 @@ StreamExecutorGpuClient::CrossHostReceiveBuffers(
         Future<> all_receives_future = JoinFutures(group_futures);
 
         all_receives_future.OnReady(
-            *this->thread_pool()->AsExecutor(),
+            this->async_work_runner()->AsExecutor(),
             [this, local_device_state, stream,
              grouped_receives = std::move(grouped_receives),
              definition_event = std::move(definition_event)](
@@ -1105,7 +1106,7 @@ void StreamExecutorGpuClient::ScheduleRemoteSend(
   }
 
   BufferSequencingEventRef usage_event =
-      BufferSequencingEvent::Create(this->thread_pool());
+      BufferSequencingEvent::Create(this->async_work_runner());
 
   // Keep memory alive until the event is done.
   usage_event.AndThen([raw_buffer]() {});
@@ -1259,7 +1260,7 @@ StreamExecutorGpuClient::MakeCrossHostReceiveBuffers(
       SetEventAsError(definition_event, s);
     }
   };
-  thread_pool()->Schedule(recv);
+  async_work_runner()->Schedule(recv);
 
   std::vector<std::unique_ptr<PjRtBuffer>> buffers;
   buffers.push_back(std::move(receive_prep_result.buffer));
 
@@ -2837,9 +2837,9 @@ ENTRY main.5 {
 TEST(StreamExecutorGpuClientTest, EventCaching) {
   TF_ASSERT_OK_AND_ASSIGN(auto client,
                           GetStreamExecutorGpuClient(DefaultOptions()));
-  auto* thread_pool =
+  auto* async_work_runner =
       tensorflow::down_cast<PjRtStreamExecutorClient*>(client.get())
-          ->thread_pool();
+          ->async_work_runner();
   const auto& device = client->addressable_devices()[0];
   LocalDeviceState* local_device_state =
       tensorflow::down_cast<const PjRtStreamExecutorDevice*>(device)
@@ -2848,14 +2848,14 @@ TEST(StreamExecutorGpuClientTest, EventCaching) {
   size_t sync_point0 = local_device_state->GetNextComputeStreamSyncPoint();
   TF_ASSERT_OK_AND_ASSIGN(auto event0,
                           local_device_state->GetEventForComputeStreamSyncPoint(
-                              sync_point0, thread_pool));
+                              sync_point0, async_work_runner));
   TF_ASSERT_OK_AND_ASSIGN(auto event1,
                           local_device_state->GetEventForComputeStreamSyncPoint(
-                              sync_point0, thread_pool));
+                              sync_point0, async_work_runner));
   size_t sync_point1 = local_device_state->GetNextComputeStreamSyncPoint();
   TF_ASSERT_OK_AND_ASSIGN(auto event2,
                           local_device_state->GetEventForComputeStreamSyncPoint(
-                              sync_point1, thread_pool));
+                              sync_point1, async_work_runner));
   // Events are getting cached.
   EXPECT_EQ(&*event0, &*event1);
   // New events are getting assigned.
@@ -2864,7 +2864,7 @@ TEST(StreamExecutorGpuClientTest, EventCaching) {
   // sync_point1 is ready, so it is the most recent event.
   TF_ASSERT_OK_AND_ASSIGN(auto event3,
                           local_device_state->GetEventForComputeStreamSyncPoint(
-                              sync_point0, thread_pool));
+                              sync_point0, async_work_runner));
   EXPECT_EQ(&*event3, &*event2);
 }
 
 
@@ -32,6 +32,7 @@ limitations under the License.
 #include "absl/strings/str_format.h"
 #include "absl/synchronization/mutex.h"
 #include "xla/client/local_client.h"
+#include "xla/pjrt/async_work_runner.h"
 #include "xla/pjrt/buffer_sequencing_event.h"
 #include "xla/pjrt/worker_thread.h"
 #include "xla/stream_executor/device_address.h"
@@ -325,7 +326,7 @@ absl::Status LocalDeviceState::AllocateAndRecordEvent(
 
 absl::StatusOr<BufferSequencingEventRef>
 LocalDeviceState::GetEventForComputeStreamSyncPoint(
-    size_t sync_point, tsl::thread::ThreadPool* thread_pool,
+    size_t sync_point, AsyncWorkRunner* async_work_runner,
     bool nullptr_if_past) {
   mu_.lock();
   size_t cur_sync_point = next_compute_stream_sync_point_.load();
@@ -343,7 +344,7 @@ LocalDeviceState::GetEventForComputeStreamSyncPoint(
     return event;
   }
   next_compute_stream_sync_point_.store(cur_sync_point + 1);
-  auto event = BufferSequencingEvent::Create(thread_pool);
+  auto event = BufferSequencingEvent::Create(async_work_runner);
   auto status = AllocateAndRecordEvent(event, compute_stream());
   if (!status.ok()) {
     mu_.unlock();
 
@@ -31,6 +31,7 @@ limitations under the License.
 #include "absl/status/statusor.h"
 #include "absl/synchronization/mutex.h"
 #include "xla/client/local_client.h"
+#include "xla/pjrt/async_work_runner.h"
 #include "xla/pjrt/buffer_sequencing_event.h"
 #include "xla/pjrt/event_pool.h"
 #include "xla/pjrt/pjrt_common.h"
@@ -224,7 +225,7 @@ class LocalDeviceState {
   // which only incur the expense of constructing a cuda event if they're really
   // needed. This allows constructing a definition event per buffer.
   absl::StatusOr<BufferSequencingEventRef> GetEventForComputeStreamSyncPoint(
-      size_t sync_point, tsl::thread::ThreadPool* thread_pool,
+      size_t sync_point, AsyncWorkRunner* async_work_runner,
       bool nullptr_if_past = false);
 
  private:
Original file line number	Diff line number	Diff line change
`@@ -133,7 +133,7 @@ void BufferSequencingEvent::ExecuteOrAddToFutureTasks(`
`133`	`133`	// Execute the `task` when definition event becomes available. If it's already
`134`	`134`	`// available, the task will be executed immediately.`
`135`	`135`	`event_.AndThen([this, traced_task = std::move(traced_task)]() mutable {`
`136`		`- thread_pool_->Schedule(std::move(traced_task));`
	`136`	`+ async_work_runner_->Schedule(std::move(traced_task));`
`137`	`137`	`});`
`138`	`138`	`}`
`139`	`139`