Commit 938250c

Author: kvshbg-aws (committed)
feat: abstraction of xla::OpSharding proto using wrapper class
Parent: 93a5e58

20 files changed: +588 / -182 lines
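
The wrapper type itself never appears in this diff, so the sketch below reconstructs, purely from the call sites visible in this commit, what torch_xla/csrc/torch_xla_op_sharding.h plausibly declares: a two-argument constructor taking the xla::OpSharding proto plus an optional denormalized tile assignment, a GetXlaOpSharding() accessor, and a GetDenormalizedTileAssignment() accessor. Everything beyond those call sites, including the int64_t element type, the member names, and the omitted pybind-facing constructor, is an assumption and not the actual header.

// Hedged sketch only: reconstructed from call sites in this commit, not the
// real torch_xla/csrc/torch_xla_op_sharding.h.
#include <cstdint>
#include <optional>
#include <utility>
#include <vector>

#include "xla/xla_data.pb.h"  // defines the xla::OpSharding proto

namespace torch_xla {

class OpSharding {
 public:
  // Matches call sites such as:
  //   torch_xla::OpSharding sharding(xla_sharding, std::nullopt);
  OpSharding(xla::OpSharding xla_op_sharding,
             std::optional<std::vector<int64_t>> denormalized_tile_assignment)
      : xla_op_sharding_(std::move(xla_op_sharding)),
        denormalized_tile_assignment_(
            std::move(denormalized_tile_assignment)) {}

  // Unwraps the raw proto for XLA APIs, e.g.
  //   xla::HloSharding::FromProto(sharding.GetXlaOpSharding());
  const xla::OpSharding& GetXlaOpSharding() const { return xla_op_sharding_; }

  // Used by _spmd_full_to_shard_shape below to carry the tile assignment
  // forward into a new sharding.
  const std::optional<std::vector<int64_t>>& GetDenormalizedTileAssignment()
      const {
    return denormalized_tile_assignment_;
  }

 private:
  xla::OpSharding xla_op_sharding_;
  std::optional<std::vector<int64_t>> denormalized_tile_assignment_;
};

}  // namespace torch_xla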

test/cpp/test_xla_sharding.cpp

Lines changed: 65 additions & 31 deletions
@@ -50,15 +50,18 @@ TEST_F(XLAShardingTest, GetShardShape) {
       {0, 1},
       {2, 3},
   });
-  auto sharding = xla::HloSharding::Tile(mesh).ToProto();
+  auto xla_sharding = xla::HloSharding::Tile(mesh).ToProto();
+  torch_xla::OpSharding sharding(xla_sharding, std::nullopt);
   auto sharding_spec =
       std::make_shared<XLATensor::ShardingSpec>(sharding, tensor_shape);
 
   auto shard_shape = ShardingUtil::GetShardShape(sharding_spec);
   // For tiled sharding, each dimension should be halved
   EXPECT_EQ(shard_shape, std::vector<int64_t>({4, 4}));
 
-  sharding_spec->sharding = xla::HloSharding::Replicate().ToProto();
+  xla_sharding = xla::HloSharding::Replicate().ToProto();
+  sharding = torch_xla::OpSharding(xla_sharding, std::nullopt);
+  sharding_spec->sharding = sharding;
   shard_shape = ShardingUtil::GetShardShape(sharding_spec);
   // For replicated sharding, each dimension should be preserved
   EXPECT_EQ(shard_shape, std::vector<int64_t>({8, 7}));
@@ -74,7 +77,8 @@ TEST_F(XLAShardingTest, GetShardIndicesForDevices) {
       {0, 1},
       {2, 3},
   });
-  auto sharding = xla::HloSharding::Tile(mesh).ToProto();
+  auto xla_sharding = xla::HloSharding::Tile(mesh).ToProto();
+  torch_xla::OpSharding sharding(xla_sharding, std::nullopt);
   auto sharding_spec =
       std::make_shared<XLATensor::ShardingSpec>(sharding, tensor_shape);
   auto shard_shape = ShardingUtil::GetShardShape(sharding_spec);
@@ -103,7 +107,8 @@ TEST_F(XLAShardingTest, GetShardIndicesForDevices) {
       EXPECT_EQ(slice.step(), 1);
     }
   }
-  sharding = xla::HloSharding::Replicate().ToProto();
+  xla_sharding = xla::HloSharding::Replicate().ToProto();
+  sharding = torch_xla::OpSharding(xla_sharding, std::nullopt);
   sharding_spec->sharding = sharding;
   shard_shape = ShardingUtil::GetShardShape(sharding_spec);
   replica_and_indices = ShardingUtil::GetShardReplicaAndIndicesForDevices(
@@ -126,11 +131,12 @@ TEST_F(XLAShardingTest, ShardTensor) {
   at::Tensor tensor = at::ones({8}, at::TensorOptions(at::kFloat));
   xla::Shape tensor_shape =
       CreateComputationShapeFromTensor(tensor, bridge::GetDefaultDevice());
-  xla::OpSharding sharding =
+  xla::OpSharding xla_sharding =
       xla::HloSharding::Tile1D(
           CreateComputationShapeFromTensor(tensor, bridge::GetDefaultDevice()),
           devices.size())
          .ToProto();
+  torch_xla::OpSharding sharding(xla_sharding, std::nullopt);
   auto sharding_spec =
       std::make_shared<XLATensor::ShardingSpec>(sharding, tensor_shape);
   auto shards = ShardingUtil::ShardTensor(tensor, sharding_spec, devices,
@@ -148,7 +154,8 @@ TEST_F(XLAShardingTest, ShardTensor) {
       {0, 1, 2, 3},
       {4, 5, 6, 7},
   });
-  sharding = xla::HloSharding::Tile(mesh).ToProto();
+  xla_sharding = xla::HloSharding::Tile(mesh).ToProto();
+  sharding = torch_xla::OpSharding(xla_sharding, std::nullopt);
   sharding_spec =
       std::make_shared<XLATensor::ShardingSpec>(sharding, tensor_shape);
   shards = ShardingUtil::ShardTensor(tensor, sharding_spec, devices,
@@ -160,15 +167,19 @@ TEST_F(XLAShardingTest, ShardTensor) {
   // 3D tiled, the first dim is replicated and the last halved. The last shard
   // size should be smaller in dim=1 because it's not evenly divisible.
   xla::Array3D<int64_t> cube({{{0, 1}, {2, 3}, {4, 5}, {6, 7}}});
-  sharding_spec->sharding = xla::HloSharding::Tile(cube).ToProto();
+  xla_sharding = xla::HloSharding::Tile(cube).ToProto();
+  sharding = torch_xla::OpSharding(xla_sharding, std::nullopt);
+  sharding_spec->sharding = sharding;
   shards = ShardingUtil::ShardTensor(tensor, sharding_spec, devices,
                                      /*padded=*/false);
   EXPECT_EQ(shards.size(), 8);
   EXPECT_EQ(shards[0].sizes(), c10::ArrayRef<long>({8, 2, 2}));
   EXPECT_EQ(shards[7].sizes(), c10::ArrayRef<long>({8, 1, 2}));
 
   // Replicated, all shards should be identical.
-  sharding_spec->sharding = xla::HloSharding::Replicate().ToProto();
+  xla_sharding = xla::HloSharding::Replicate().ToProto();
+  sharding = torch_xla::OpSharding(xla_sharding, std::nullopt);
+  sharding_spec->sharding = sharding;
   shards = ShardingUtil::ShardTensor(tensor, sharding_spec, devices,
                                      /*padded=*/false);
   EXPECT_EQ(shards.size(), 8);
@@ -182,7 +193,8 @@ TEST_F(XLAShardingTest, ShardTensor) {
   tensor_shape =
       CreateComputationShapeFromTensor(tensor, bridge::GetDefaultDevice());
   xla::Array4D<int64_t> tesseract({{{{0, 1}, {2, 3}, {4, 5}, {6, 7}}}});
-  sharding = xla::HloSharding::Tile(tesseract).ToProto();
+  xla_sharding = xla::HloSharding::Tile(tesseract).ToProto();
+  sharding = torch_xla::OpSharding(xla_sharding, std::nullopt);
   sharding_spec =
       std::make_shared<XLATensor::ShardingSpec>(sharding, tensor_shape);
   shards = ShardingUtil::ShardTensor(tensor, sharding_spec, devices,
@@ -206,7 +218,8 @@ TEST_F(XLAShardingTest, ShardTensor) {
       CreateComputationShapeFromTensor(tensor, bridge::GetDefaultDevice());
   xla::Array<int64_t> hypercube(std::vector<int64_t>{1, 1, 2, 2, 2});
   hypercube.FillIota(0);
-  sharding = xla::HloSharding::Tile(hypercube).ToProto();
+  xla_sharding = xla::HloSharding::Tile(hypercube).ToProto();
+  sharding = torch_xla::OpSharding(xla_sharding, std::nullopt);
   sharding_spec =
       std::make_shared<XLATensor::ShardingSpec>(sharding, tensor_shape);
   shards = ShardingUtil::ShardTensor(tensor, sharding_spec, devices,
@@ -234,7 +247,8 @@ TEST_F(XLAShardingTest, ShardTensorMultiHost) {
       {4, 5, 0, 1},
       {6, 7, 2, 3},
   });
-  auto sharding = xla::HloSharding::Tile(mesh).ToProto();
+  auto xla_sharding = xla::HloSharding::Tile(mesh).ToProto();
+  torch_xla::OpSharding sharding(xla_sharding, std::nullopt);
   auto sharding_spec =
       std::make_shared<XLATensor::ShardingSpec>(sharding, tensor_shape);
   // For devices at the start of the mesh, all shards should have the same
@@ -251,7 +265,9 @@ TEST_F(XLAShardingTest, ShardTensorMultiHost) {
       {0, 1, 4, 5},
       {2, 3, 6, 7},
   });
-  sharding_spec->sharding = xla::HloSharding::Tile(mesh).ToProto();
+  xla_sharding = xla::HloSharding::Tile(mesh).ToProto();
+  sharding = torch_xla::OpSharding(xla_sharding, std::nullopt);
+  sharding_spec->sharding = sharding;
   shards = ShardingUtil::ShardTensor(tensor, sharding_spec, devices,
                                      /*padded=*/false);
   EXPECT_EQ(shards.size(), 4);
@@ -278,7 +294,8 @@ TEST_F(XLAShardingTest, ShardTensorMiniBatch) {
       {{7}},
   });
 
-  auto sharding = xla::HloSharding::Tile(mesh).ToProto();
+  auto xla_sharding = xla::HloSharding::Tile(mesh).ToProto();
+  torch_xla::OpSharding sharding(xla_sharding, std::nullopt);
   auto sharding_spec = std::make_shared<XLATensor::ShardingSpec>(
       sharding, global_shape, /*minibatch=*/true);
   auto shards = ShardingUtil::ShardTensor(minibatch_tensor, sharding_spec,
@@ -292,17 +309,20 @@ TEST_F(XLAShardingTest, EqualShardingSpecs) {
   auto tensor = at::ones({8, 7}, at::TensorOptions(at::kFloat));
   xla::Shape tensor_shape =
       CreateComputationShapeFromTensor(tensor, bridge::GetDefaultDevice());
-  XLATensor::ShardingSpec tiled_2d(xla::HloSharding::Tile({
-                                       {0, 1, 2, 3},
-                                       {4, 5, 6, 7},
-                                   })
-                                       .ToProto(),
-                                   tensor_shape);
-  XLATensor::ShardingSpec tiled_3d(
-      xla::HloSharding::Tile({{{0, 1}, {2, 3}, {4, 5}, {6, 7}}}).ToProto(),
-      tensor_shape);
-  XLATensor::ShardingSpec replicated(xla::HloSharding::Replicate().ToProto(),
-                                     tensor_shape);
+  auto xla_sharding = xla::HloSharding::Tile({
+                                                 {0, 1, 2, 3},
+                                                 {4, 5, 6, 7},
+                                             })
+                          .ToProto();
+  torch_xla::OpSharding sharding(xla_sharding, std::nullopt);
+  XLATensor::ShardingSpec tiled_2d(sharding, tensor_shape);
+  xla_sharding =
+      xla::HloSharding::Tile({{{0, 1}, {2, 3}, {4, 5}, {6, 7}}}).ToProto();
+  sharding = torch_xla::OpSharding(xla_sharding, std::nullopt);
+  XLATensor::ShardingSpec tiled_3d(sharding, tensor_shape);
+  xla_sharding = xla::HloSharding::Replicate().ToProto();
+  sharding = torch_xla::OpSharding(xla_sharding, std::nullopt);
+  XLATensor::ShardingSpec replicated(sharding, tensor_shape);
   EXPECT_TRUE(ShardingUtil::EqualShardingSpecs(tiled_2d, tiled_2d));
   EXPECT_FALSE(ShardingUtil::EqualShardingSpecs(tiled_2d, tiled_3d));
   EXPECT_TRUE(ShardingUtil::EqualShardingSpecs(replicated, replicated));
@@ -323,12 +343,17 @@ TEST_F(XLAShardingTest, CreateTensorsData) {
   std::vector<std::string> devices(3);
   std::fill_n(devices.begin(), devices.size(),
               bridge::GetDefaultDevice()->toString());
+  auto replicate_xla_sharding = xla::HloSharding::Replicate().ToProto();
+  auto unknown_xla_sharding = xla::HloSharding::Unknown().ToProto();
+  torch_xla::OpSharding replicate_sharding(replicate_xla_sharding,
+                                           std::nullopt);
+  torch_xla::OpSharding unknown_sharding(unknown_xla_sharding, std::nullopt);
   std::vector<XLATensor::ShardingSpecPtr> shardings = {
       nullptr,
-      std::make_shared<XLATensor::ShardingSpec>(
-          xla::HloSharding::Replicate().ToProto(), tensor_shape),
-      std::make_shared<XLATensor::ShardingSpec>(
-          xla::HloSharding::Unknown().ToProto(), tensor_shape)};
+      std::make_shared<XLATensor::ShardingSpec>(replicate_sharding,
+                                                tensor_shape),
+      std::make_shared<XLATensor::ShardingSpec>(unknown_sharding,
+                                                tensor_shape)};
   std::vector<torch::lazy::BackendDataPtr> tensors_data =
       CreateTensorsData(tensors, shardings, devices);
 
@@ -387,13 +412,21 @@ TEST_F(XLAShardingTest, PrepareOutputShardingPropagation) {
   auto y = xla::Add(x, xla::ConstantR0<float>(&b, 3));
   xla::XlaComputation xla_computation =
       GetValueOrThrow(b.Build(/*remove_dynamic_dimensions=*/false));
+
+  std::vector<torch::lazy::BackendDataPtr> parameters_data;
+  parameters_data.push_back(
+      torch_xla::runtime::GetComputationClientOrDie()->CreateDataPlaceholder(
+          bridge::GetDefaultDevice()->toString(), std::move(shape)));
+
   std::vector<torch_xla::runtime::ComputationClient::CompileInstance> instances;
   instances.push_back({std::move(xla_computation),
                        bridge::GetDefaultDevice()->toString(),
                        {bridge::GetDefaultDevice()->toString()},
                        &shape,
                        /*should_wrap_parameter=*/false,
-                       /*is_sharded=*/true});
+                       /*is_sharded=*/true,
+                       /*allow_spmd_sharding_propagation_to_output=*/true,
+                       /*parameters_data=*/parameters_data});
 
   std::vector<
       std::shared_ptr<torch_xla::runtime::ComputationClient::Computation>>
@@ -417,11 +450,12 @@ TEST_F(XLAShardingTest, PrepareOutputShardingPropagation) {
   if (n_devices > 1) {
     // Tiled sharding requires multiple devices.
     EXPECT_TRUE(xla::protobuf_util::HaveSameSerialization(
-        tiled, sharding_specs[0]->sharding));
+        tiled, sharding_specs[0]->sharding.GetXlaOpSharding()));
   } else {
     // Single device execution defaults to replication sharding.
     EXPECT_TRUE(xla::protobuf_util::HaveSameSerialization(
-        xla::HloSharding::Replicate().ToProto(), sharding_specs[0]->sharding));
+        xla::HloSharding::Replicate().ToProto(),
+        sharding_specs[0]->sharding.GetXlaOpSharding()));
   }
 
   // Check if the placeholder is on a SPMD device (sharded) with no real values.
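
Every test change above applies the same mechanical migration. A minimal sketch of the pattern, assuming a sharding_spec already in scope; the std::nullopt argument is the wrapper's optional denormalized tile assignment, which these tests leave empty:

// Old pattern (before this commit): store the raw proto in the spec.
//   sharding_spec->sharding = xla::HloSharding::Replicate().ToProto();

// New pattern: wrap the proto in torch_xla::OpSharding first.
xla::OpSharding xla_sharding = xla::HloSharding::Replicate().ToProto();
torch_xla::OpSharding sharding(xla_sharding, std::nullopt);
sharding_spec->sharding = sharding;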

torch_xla/csrc/BUILD

Lines changed: 12 additions & 0 deletions
@@ -126,6 +126,7 @@ ptxla_cc_library(
         ":shape_builder",
         ":shape_helper",
         ":status",
+        ":torch_xla_op_sharding",
         ":version",
         "//torch_xla/csrc:hash_util",
         "//torch_xla/csrc:thread_pool",
@@ -313,6 +314,7 @@ ptxla_cc_library(
         ":shape_helper",
         ":status",
         ":unwrap_data",
+        ":torch_xla_op_sharding",
         "//torch_xla/csrc/runtime:cache",
         "//torch_xla/csrc/runtime:computation_client",
         "@com_google_absl//absl/log:absl_check",
@@ -382,3 +384,13 @@
         "@com_google_absl//absl/status:statusor",
     ],
 )
+
+cc_library(
+    name = "torch_xla_op_sharding",
+    srcs = ["torch_xla_op_sharding.cpp"],
+    hdrs = ["torch_xla_op_sharding.h"],
+    deps = [
+        "//torch_xla/csrc/runtime:debug_macros",
+        "@xla//xla/hlo/builder:xla_builder",
+    ],
+)

torch_xla/csrc/debug_util.cpp

Lines changed: 3 additions & 1 deletion
@@ -21,6 +21,7 @@
 #include "torch_xla/csrc/runtime/sys_util.h"
 #include "torch_xla/csrc/runtime/xla_util.h"
 #include "torch_xla/csrc/status.h"
+#include "torch_xla/csrc/torch_xla_op_sharding.h"
 #include "torch_xla/csrc/xla_graph_executor.h"
 
 namespace torch_xla {
@@ -218,7 +219,8 @@ void DebugUtil::SaveOutputShardingInfo(std::vector<XLATensorPtr>* tensors,
     auto xtensor = (*tensors)[indices[i]];
     ss << xtensor->shape().get().ToString() << " ";
     if (xtensor->sharding_spec()) {
-      ss << xla::HloSharding::FromProto(xtensor->sharding_spec()->sharding)
+      ss << xla::HloSharding::FromProto(
+                xtensor->sharding_spec()->sharding.GetXlaOpSharding())
                 ->ToString();
     } else {
       ss << xla::HloSharding::FromProto(xla::HloSharding::Unknown().ToProto())
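
The debug_util.cpp change shows the read side of the same migration: code that hands the spec to XLA helpers must now unwrap the proto first. A fragment, assuming a non-null sharding_spec in scope:

// Before: the spec held the raw proto.
//   auto hlo = xla::HloSharding::FromProto(sharding_spec->sharding);

// After: unwrap via GetXlaOpSharding() before calling XLA APIs.
auto hlo_sharding = xla::HloSharding::FromProto(
    sharding_spec->sharding.GetXlaOpSharding());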

torch_xla/csrc/init_python_bindings.cpp

Lines changed: 14 additions & 10 deletions
@@ -69,6 +69,7 @@
 #include "torch_xla/csrc/tensor_methods.h"
 #include "torch_xla/csrc/tensor_util.h"
 #include "torch_xla/csrc/torch_util.h"
+#include "torch_xla/csrc/torch_xla_op_sharding.h"
 #include "torch_xla/csrc/version.h"
 #include "torch_xla/csrc/xla_backend_impl.h"
 #include "torch_xla/csrc/xla_graph_executor.h"
@@ -706,7 +707,8 @@ std::string GetTensorsHloGraph(const std::vector<at::Tensor>& tensors,
 std::string GetXLAShardingSpec(const XLATensorPtr xtensor) {
   auto sharding_spec = xtensor->sharding_spec();
   if (sharding_spec != nullptr) {
-    auto hlo_sharding = xla::HloSharding::FromProto(sharding_spec->sharding);
+    auto hlo_sharding =
+        xla::HloSharding::FromProto(sharding_spec->sharding.GetXlaOpSharding());
     return hlo_sharding->ToString();
   }
   return std::string();
@@ -1503,7 +1505,7 @@ void InitXlaModuleBindings(py::module m) {
       runtime::ComputationClient::ComputationPtr>(m, "XlaComputation");
 
   // Define the _XLAC.OpSharding class.
-  PythonScope<py::class_<xla::OpSharding>>(m, "OpSharding")
+  PythonScope<py::class_<torch_xla::OpSharding>>(m, "OpSharding")
       .def_init([](const py::list& tile_assignment,
                    const py::list& group_assignment,
                    const py::list& replication_groups, int sharding_type) {
@@ -2268,6 +2270,7 @@ void InitXlaModuleBindings(py::module m) {
        [](const std::vector<at::Tensor>& tensors, const std::string& device,
           const std::vector<std::string>& devices,
           bool emit_bytecode) -> py::bytes {
+         NoGilSection nogil;
          EmitMode mode = emit_bytecode ? EmitMode::kStableHloBytecode
                                        : EmitMode::kStableHloReadable;
          std::vector<XLATensorPtr> xtensors;
@@ -2504,16 +2507,16 @@ void InitXlaModuleBindings(py::module m) {
            }
          })
     .def("_xla_mark_sharding",
-         [](const at::Tensor& input, xla::OpSharding sharding) {
+         [](const at::Tensor& input, torch_xla::OpSharding sharding) {
            ShardingUtil::XlaMarkSharding(input, sharding);
          })
     .def("_xla_annotate_custom_sharding",
-         [](const at::Tensor& input, xla::OpSharding sharding) {
+         [](const at::Tensor& input, torch_xla::OpSharding sharding) {
            XLATensorPtr xtensor = bridge::GetXlaTensor(input);
            ShardingUtil::XlaAnnotateCustomSharding(xtensor, sharding);
          })
     .def("_mark_manual_sharding",
-         [](const at::Tensor& input, xla::OpSharding sharding) {
+         [](const at::Tensor& input, torch_xla::OpSharding sharding) {
            XLA_CHECK(IsNonDeviceDataIR(input))
                << "Marking any data tensors as manual is not supported";
            ShardingUtil::XlaMarkSharding(input, sharding);
@@ -2533,13 +2536,14 @@ void InitXlaModuleBindings(py::module m) {
               xtensor->CreateFrom(torch_xla::MakeNode<CustomSharding>(
                   xtensor->GetIrValue(), shard_shape,
                   CustomSharding::Type::kSPMDFullToShardShape));
-          output->SetShardingSpec(XLATensor::ShardingSpec(
-              xla::HloSharding::Manual().ToProto(), shard_shape));
+          torch_xla::OpSharding sharding(xla::HloSharding::Manual().ToProto(),
+              sharding_spec->sharding.GetDenormalizedTileAssignment());
+          output->SetShardingSpec(XLATensor::ShardingSpec(sharding, shard_shape));
           return bridge::AtenFromXlaTensor(output);
         })
     .def(
         "_spmd_shard_to_full_shape",
-        [](const at::Tensor& input, const xla::OpSharding& sharding,
+        [](const at::Tensor& input, const torch_xla::OpSharding& sharding,
            const std::vector<int64_t>& output_shape,
            const py::object& output_dtype) -> at::Tensor {
           XLATensorPtr xtensor = bridge::GetXlaTensor(input);
@@ -2578,7 +2582,7 @@ void InitXlaModuleBindings(py::module m) {
           XLATensor::ShardingSpecPtr sharding_spec =
               xtensor ? xtensor->sharding_spec() : nullptr;
           if (sharding_spec != nullptr) {
-            return sharding_spec->sharding;
+            return sharding_spec->sharding.GetXlaOpSharding();
           }
           return std::nullopt;
         })
@@ -2613,7 +2617,7 @@ void InitXlaModuleBindings(py::module m) {
         // `torch_xla.runtime.local_runtime_devices()`.
         "_global_tensor_from_cpu_shards",
         [](const std::vector<at::Tensor>& shards,
-           const xla::OpSharding& sharding,
+           const torch_xla::OpSharding& sharding,
            std::optional<std::vector<int64_t>>& global_shape) -> at::Tensor {
           XLA_CHECK(UseVirtualDevice())
               << "Please enable SPMD via `torch_xla.runtime.use_spmd()`";
