This repository was archived by the owner on Jul 1, 2023. It is now read-only.

Commit 2042ed3

Add an xrt device and move the ComputationClient api there. (#1106)
1 parent: e69350f

10 files changed: +731, -235 lines


Sources/CX10/xla_tensor_wrapper.cc

Lines changed: 3 additions & 3 deletions
@@ -138,13 +138,13 @@ OpaqueXLATensor* copyTensorAndMakeResident(enum XLATensorScalarType type,
                                            const size_t* shape, size_t rank,
                                            const struct CDevice cdevice,
                                            bool to_reduced_precision) {
+  auto device = ConvertDevice(cdevice);
   if (to_reduced_precision && XLATensorScalarType_Float == type) {
     const float* float_buffer = reinterpret_cast<const float*>(value);
     auto non_owned_buffer =
         std::make_unique<at::NonOwnedAnyScalarBuffer<float>>(
             float_buffer, num_entries * sizeof(float));
     std::vector<int64_t> dims(shape, shape + rank);
-    auto device = ConvertDevice(cdevice);
     auto dest_shape = swift_xla::MakeArrayShapeFromDimensions(
         XlaHelpers::I64List(dims), /*dynamic_dimensions=*/{},
         xla::PrimitiveType::BF16, device.hw_type);
@@ -153,8 +153,8 @@ OpaqueXLATensor* copyTensorAndMakeResident(enum XLATensorScalarType type,
     return new swift_xla::XLATensor(
         swift_xla::XLATensor::Create(xla_data, at::ScalarType::Float));
   }
-  if (XLATensorScalarType_Float == type && xla::ComputationClient::IsLocal()) {
-    auto device = ConvertDevice(cdevice);
+  auto* device_ptr = xla::GetX10Device(device);
+  if (XLATensorScalarType_Float == type && device_ptr->IsLocal()) {
     std::vector<xla::int64> dims(shape, shape + rank);
     auto dest_shape = swift_xla::MakeArrayShapeFromDimensions(
         dims, /*dynamic_dimensions=*/{}, xla::PrimitiveType::F32,
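The hunk above resolves the device once up front and moves the "is this a local device?" check from the process-wide xla::ComputationClient::IsLocal() onto the resolved device object. A minimal sketch of the resulting call pattern, using only names visible in the diff (ConvertDevice, xla::GetX10Device, IsLocal); the wrapper function ShouldUseLocalFloatPath is hypothetical:

// Hypothetical wrapper, for illustration only: resolve the device once and
// ask it, rather than the global client, whether the local fast path applies.
static bool ShouldUseLocalFloatPath(enum XLATensorScalarType type,
                                    const struct CDevice cdevice) {
  auto device = ConvertDevice(cdevice);          // CDevice -> swift_xla::Device
  auto* device_ptr = xla::GetX10Device(device);  // -> ComputationClient::Device*
  return XLATensorScalarType_Float == type && device_ptr->IsLocal();
}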

Sources/x10/xla_client/BUILD

Lines changed: 7 additions & 0 deletions
@@ -24,6 +24,7 @@ cc_library(
         "computation_client.cc",
         "device.cc",
         "env_vars.cc",
+        "local_device.cc",
         "mesh_service.cc",
         "metrics.cc",
         "metrics_reader.cc",
@@ -47,6 +48,7 @@ cc_library(
         "debug_macros.h",
         "device.h",
         "env_vars.h",
+        "local_device.h",
         "mesh_service.h",
         "metrics.h",
         "metrics_reader.h",
@@ -83,8 +85,10 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto_cc",
         "//tensorflow/compiler/xla/client:xla_computation",
+        "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_proto_cc",
+        "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xrt:xrt_proto_cc",
         "//tensorflow/compiler/xrt:xrt_server",
         "//tensorflow/compiler/xrt:xrt_utils",
@@ -98,10 +102,13 @@ cc_library(
         "//tensorflow/core/kernels:conv_ops",
         "//tensorflow/core/kernels:data_flow",
         "//tensorflow/core/protobuf/tpu:topology_proto_cc",
+        "//tensorflow/core/profiler/lib:traceme",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/numeric:int128",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/container:node_hash_map",
+        "@com_google_absl//absl/container:node_hash_set",
         "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
     ],
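The build now compiles a local_device translation unit next to computation_client. Its contents are not part of this page, but given the Device interface introduced in computation_client.h below, a local backend would plausibly be a Device subclass along these lines; this is a hypothetical skeleton, not the actual local_device.h:

// Hypothetical skeleton, assuming only the Device interface shown in this
// commit; every name except ComputationClient::Device is invented.
#include "computation_client.h"  // as laid out in Sources/x10/xla_client/

namespace xla {

class ExampleLocalDevice : public ComputationClient::Device {
 public:
  explicit ExampleLocalDevice(std::string name)
      : ComputationClient::Device(std::move(name)) {}

  // An in-process device advertises itself as local, enabling fast paths
  // such as the one added in xla_tensor_wrapper.cc above.
  bool IsLocal() override { return true; }

  // The pure-virtual members (Compile, TransferToServer, ExecuteChained,
  // ResourceDomain, CreateDataPlaceholder, ExecuteComputation,
  // GetTransferManager) would be implemented against a local client; they
  // are elided here, so this skeleton remains abstract.
};

}  // namespace xla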

Sources/x10/xla_client/computation_client.cc

Lines changed: 7 additions & 47 deletions
@@ -38,7 +38,7 @@ std::shared_ptr<ComputationClient::Computation> ComputationClient::Compile(
   std::vector<CompileInstance> instances;
   instances.emplace_back(std::move(computation), output_shape);
   std::vector<std::shared_ptr<Computation>> results =
-      Compile(compilation_device, devices, std::move(instances));
+      GetX10Device(compilation_device)->Compile(devices, std::move(instances));
   return std::move(results[0]);
 }

@@ -183,10 +183,8 @@ metrics::Metric* ComputationClient::OutboundDataMetric() {
   return metric;
 }

-ComputationClient::DataPtr ComputationClient::TransferToServer(
-    xla::BorrowingLiteral literal, const xla::Shape& dest_shape,
-    const std::string& device) {
-  TF_LOG(FATAL) << "Only supported for LocalClient";
+int32_t ComputationClient::Device::mesh_id() const {
+  TF_LOG(FATAL) << "Unsupported";
 }

 std::vector<std::string> ComputationClient::GetAllDevices() const {
@@ -225,50 +223,16 @@ ComputationClient::Device* GetX10Device(swift_xla::Device device_id) {
 std::vector<Literal> ComputationClient::TransferFromServer(
     absl::Span<const DataPtr> handles) {
   if (handles.empty()) return {};
-  ComputationClient* client = handles[0]->device()->computation_client();
+  TransferManager* transfer = handles[0]->device()->GetTransferManager();
   for (auto& handle : handles) {
-    XLA_CHECK_EQ(client, handle->device()->computation_client());
+    XLA_CHECK_EQ(transfer, handle->device()->GetTransferManager());
   }
-  return client->TransferFromServerImpl(handles);
-}
-
-std::vector<ComputationClient::ComputationPtr>
-ComputationClient::Device::Compile(const std::vector<std::string>& devices,
-                                   std::vector<CompileInstance> instances) {
-  return client_->Compile(name_, devices, std::move(instances));
+  return transfer->TransferFromServerImpl(handles);
 }

 ComputationClient::DataPtr ComputationClient::Device::TransferToServer(
     xla::BorrowingLiteral literal, const xla::Shape& dest_shape) {
-  return client_->TransferToServer(std::move(literal), dest_shape, name_);
-}
-
-std::vector<ComputationClient::DataPtr>
-ComputationClient::Device::TransferToServer(
-    absl::Span<const TensorSource> tensors) {
-  return client_->TransferToServer(tensors);
-}
-
-std::vector<ComputationClient::DataPtr>
-ComputationClient::Device::ExecuteChained(
-    absl::Span<const ComputationClient::ExecuteChainedOp> ops) {
-  return client_->ExecuteChained(ops, name_);
-}
-
-std::string ComputationClient::Device::ResourceDomain() const {
-  return client_->GetResourceDomain(name_);
-}
-
-ComputationClient::DataPtr ComputationClient::Device::CreateDataPlaceholder(
-    Shape shape) const {
-  return client_->CreateDataPlaceholder(name_, std::move(shape));
-}
-
-std::vector<ComputationClient::DataPtr>
-ComputationClient::Device::ExecuteComputation(
-    const Computation& computation, absl::Span<const DataPtr> arguments,
-    const ExecuteComputationOptions& options) {
-  return client_->ExecuteComputation(computation, arguments, name_, options);
+  TF_LOG(FATAL) << "Only supported for LocalClient";
 }

 std::map<std::string, Metric> ComputationClient::ReadMetrics() {
@@ -299,8 +263,4 @@ std::vector<std::string> ComputationClient::AllDevices() {
   return Get()->GetAllDevices();
 }

-std::vector<std::string> ComputationClient::LocalDevices() {
-  return Get()->GetLocalDevices();
-}
-
 }  // namespace xla
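After this change, uploads and downloads are addressed to a Device object rather than to a device name string, and the static TransferFromServer only checks that all handles share one TransferManager before delegating to it. A hedged usage sketch of the round trip; RoundTrip is an invented name and the literal and shape are assumed to be built elsewhere:

// Illustrative only: upload a literal through a Device and read it back
// through the static helper shown above.
std::vector<xla::Literal> RoundTrip(xla::ComputationClient::Device* device,
                                    xla::BorrowingLiteral literal,
                                    const xla::Shape& dest_shape) {
  // Device-scoped upload; this replaces the removed
  // ComputationClient::TransferToServer(literal, shape, device_name) overload.
  xla::ComputationClient::DataPtr handle =
      device->TransferToServer(std::move(literal), dest_shape);

  // The static download path groups handles by their device's TransferManager
  // and forwards to TransferFromServerImpl.
  return xla::ComputationClient::TransferFromServer({handle});
}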

Sources/x10/xla_client/computation_client.h

Lines changed: 30 additions & 100 deletions
@@ -47,42 +47,58 @@ class ComputationClient
   using DataPtr = std::shared_ptr<Data>;
   using ComputationPtr = std::shared_ptr<Computation>;

+  class TransferManager {
+   public:
+    virtual ~TransferManager() {}
+
+    virtual std::vector<Literal> TransferFromServerImpl(
+        absl::Span<const DataPtr> handles) = 0;
+  };
+
   class Device {
    public:
     virtual ~Device() {}

     const std::string& name() const { return name_; }
-    ComputationClient* computation_client() const { return client_; }
+    virtual int32_t mesh_id() const;
     const swift_xla::Device& device_id() const { return device_id_; }
-    explicit Device(std::string name, ComputationClient* client)
-        : name_(name), client_(client) {
+
+    virtual TransferManager* GetTransferManager() const = 0;
+    explicit Device(std::string name) : name_(name) {
       device_id_ = swift_xla::Device(name_);
     }

     virtual std::vector<ComputationPtr> Compile(
         const std::vector<std::string>& devices,
-        std::vector<CompileInstance> instances);
+        std::vector<CompileInstance> instances) = 0;

+    // Transfers local tensor values to the TPU servers and fetches the handles.
     virtual std::vector<DataPtr> TransferToServer(
-        absl::Span<const TensorSource> tensors);
+        absl::Span<const TensorSource> tensors) = 0;

+    // Copies a single tensor in the form of a xla::BorrowingLiteral async to
+    // the TPU. The literal is copied to a temporary buffer and then copied
+    // async as per the semantics of TransferLiteralToDeviceAsync. The next
+    // computation that is scheduled will wait for this transfer to complete
+    // before running.
     virtual DataPtr TransferToServer(xla::BorrowingLiteral literal,
                                      const xla::Shape& dest_shape);

     virtual std::vector<DataPtr> ExecuteChained(
-        absl::Span<const ExecuteChainedOp> ops);
+        absl::Span<const ExecuteChainedOp> ops) = 0;

-    virtual std::string ResourceDomain() const;
+    virtual std::string ResourceDomain() const = 0;

-    virtual DataPtr CreateDataPlaceholder(Shape shape) const;
+    virtual DataPtr CreateDataPlaceholder(Shape shape) = 0;

     virtual std::vector<DataPtr> ExecuteComputation(
         const Computation& computation, absl::Span<const DataPtr> arguments,
-        const ExecuteComputationOptions& options);
+        const ExecuteComputationOptions& options) = 0;
+
+    virtual bool IsLocal() { return false; }

    private:
     std::string name_;
-    ComputationClient* client_;
     swift_xla::Device device_id_;
   };
   class Data {
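The new nested TransferManager interface is the hook through which the static TransferFromServer reaches a backend, and each Device must hand one back from GetTransferManager(). A hypothetical outline of an implementation, with the backend-specific fetch elided:

// Hypothetical outline; ExampleTransferManager is an invented name and the
// per-handle fetch is a placeholder.
class ExampleTransferManager : public xla::ComputationClient::TransferManager {
 public:
  std::vector<xla::Literal> TransferFromServerImpl(
      absl::Span<const xla::ComputationClient::DataPtr> handles) override {
    std::vector<xla::Literal> results;
    results.reserve(handles.size());
    for (const auto& handle : handles) {
      (void)handle;  // A real backend would read the literal behind `handle`.
      results.push_back(xla::Literal());
    }
    return results;
  }
};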
@@ -152,13 +168,10 @@
     using PopulateFn = std::function<void(const TensorSource&, void*, size_t)>;

     TensorSource() = default;
-    TensorSource(Shape shape, std::string device, PopulateFn populate_fn)
-        : shape(std::move(shape)),
-          device(std::move(device)),
-          populate_fn(std::move(populate_fn)) {}
+    TensorSource(Shape shape, PopulateFn populate_fn)
+        : shape(std::move(shape)), populate_fn(std::move(populate_fn)) {}

     Shape shape;
-    std::string device;
     PopulateFn populate_fn;
   };
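TensorSource no longer carries a device field, so the destination is determined solely by which Device receives the sources. A sketch of the new construction and upload path; UploadOnes is an invented name and the usual <cstring>/<algorithm> includes are assumed:

// Sketch only: build a TensorSource with the new two-argument constructor and
// hand it to a specific Device instead of naming a device string.
xla::ComputationClient::DataPtr UploadOnes(
    xla::ComputationClient::Device* device) {
  std::vector<float> host_data(16, 1.0f);
  xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, {16});

  xla::ComputationClient::TensorSource source(
      shape,
      [&host_data](const xla::ComputationClient::TensorSource& /*src*/,
                   void* dest, size_t size) {
        // Fill the staging buffer the backend hands us.
        std::memcpy(dest, host_data.data(),
                    std::min(size, host_data.size() * sizeof(float)));
      });

  auto handles = device->TransferToServer(absl::MakeConstSpan(&source, 1));
  return handles.front();
}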

@@ -210,85 +223,13 @@

   static std::unique_ptr<ComputationClient> Create();

-  static bool IsLocal();
-
   virtual ~ComputationClient() {}

-  // Creates a Data object with no actual device handle in it. The device handle
-  // will be populated in an asynchrounous fashion.
-  virtual DataPtr CreateDataPlaceholder(std::string device, Shape shape) = 0;
-
-  // Transfers local tensor values to the TPU servers and fetches the handles.
-  virtual DataPtr TransferToServer(xla::BorrowingLiteral literal,
-                                   const xla::Shape& dest_shape,
-                                   const std::string& device);
-
-  // Transfers local tensor values to the TPU servers and fetches the handles.
-  virtual std::vector<DataPtr> TransferToServer(
-      absl::Span<const TensorSource> tensors) = 0;
-
   // Reads the tensor literal values stored at TPU server sites, behind the
   // supplied handles.
   static std::vector<Literal> TransferFromServer(
       absl::Span<const DataPtr> handles);

-  std::vector<ComputationPtr> Compile(std::vector<CompileInstance> instances);
-
-  // Executes computation with arguments and returns the result.
-  // The passed device must match the common device of the arguments Data.
-  // If options.explode_tuple is true, the output tuple will be decomposed into
-  // its single elements.
-  virtual std::vector<DataPtr> ExecuteComputation(
-      const Computation& computation, absl::Span<const DataPtr> arguments,
-      const std::string& device, const ExecuteComputationOptions& options) = 0;
-
-  // Executes the computation in replicated mode.
-  // The size of the arguments vector is the number of replicas to execute,
-  // and it must match the size of the computation.devices() as well as the
-  // devices passed as argument. The destination devices for each replicated
-  // computation come from the devices the Data objects are stored into, which
-  // must match the devices argument. Within arguments[i], every Data
-  // object must be coming from the same device. Returns a vector (of the same
-  // size of the arguments vector) with the results of the parallel execution.
-  // The result[i], a vector itself, will be the result of the computation fed
-  // with arguments[i]. If options.explode_tuple is true, the output tuples will
-  // be decomposed into their single elements.
-  virtual std::vector<std::vector<DataPtr>> ExecuteReplicated(
-      const Computation& computation,
-      const std::vector<std::vector<DataPtr>>& arguments,
-      absl::Span<const std::string> devices,
-      const ExecuteReplicatedOptions& options) = 0;
-
-  // Executes the computations in parallel. Each computation must target a
-  // different device, and the the common device of arguments[i] must match
-  // devices[i]. The computations[i] computation is fed with arguments[i]
-  // arguments.
-  // Returns a vector of vectors of device side Data object, with result[i]
-  // being the return value of computations[i]. If options.explode_tuple is
-  // true, the output tuples will be decomposed into their single elements.
-  virtual std::vector<std::vector<DataPtr>> ExecuteParallel(
-      absl::Span<const Computation* const> computations,
-      const std::vector<std::vector<DataPtr>>& arguments,
-      absl::Span<const std::string> devices,
-      const ExecuteParallelOptions& options) = 0;
-
-  // Executes a serie of operations, whose results are input of other
-  // operations. The ops is a valid post-order for the execution, which means
-  // that the inputs of op at index I, will have to be coming from ops at index
-  // lower than I. It returns a vector of device data shared pointers, one for
-  // every ExecuteChainedOp marked with is_result=true, in the order they appear
-  // within the ops post-order.
-  virtual std::vector<DataPtr> ExecuteChained(
-      absl::Span<const ExecuteChainedOp> ops, const std::string& device) = 0;
-
-  virtual std::vector<std::vector<DataPtr>> DeconstructTuple(
-      absl::Span<const DataPtr> tuples) = 0;
-
-  // Returns a unique string which identifies the resource domain of a given
-  // device. Within a resource domain, handles to device memory or compiled
-  // computations can be used for all devices part of such domain.
-  virtual std::string GetResourceDomain(const std::string& device) const = 0;
-
   virtual std::string GetDefaultDevice() const = 0;
   static Device* DefaultDevice();

@@ -297,11 +238,6 @@
   virtual swift_xla::Device GetDefaultDeviceStruct() const = 0;
   static swift_xla::Device DefaultDeviceStruct();

-  virtual size_t GetNumDevices() const = 0;
-
-  virtual std::vector<std::string> GetLocalDevices() const = 0;
-  static std::vector<std::string> LocalDevices();
-
   std::vector<std::string> GetAllDevices() const;
   static std::vector<std::string> AllDevices();

@@ -336,7 +272,6 @@
   // after the last ':' character of the device string.
   static int64 GetDeviceOrdinal(const std::string& device);

- protected:
   // Metrics common to all client intrfaces.
   static metrics::Metric* TransferToServerMetric();
   static metrics::Metric* TransferToServerTransformMetric();
@@ -357,19 +292,14 @@
   static metrics::Metric* ReleaseCompileHandlesTimeMetric();
   static metrics::Metric* InboundDataMetric();
   static metrics::Metric* OutboundDataMetric();
+
+ protected:
   void AddDevice(std::unique_ptr<Device> device);

   // Returns the ComputationClient singleton.
   static ComputationClient* Get();

  private:
-  virtual std::vector<Literal> TransferFromServerImpl(
-      absl::Span<const DataPtr> handles) = 0;
-  // Compiles a set of computations.
-  virtual std::vector<ComputationPtr> Compile(
-      const std::string& device, const std::vector<std::string>& devices,
-      std::vector<CompileInstance> instances) = 0;
-
   std::vector<Device*> devices_;
   std::vector<std::unique_ptr<Device>> devices_owned_;
   absl::node_hash_map<std::string, Device*> devices_by_name_;
