
Commit e0cedaf

[Experimental] Add initial implementation of GSPMD->Shardy pass within PyTorch/XLA (#1)
Adds an environment variable CONVERT_SHLO_TO_SHARDY that does two things:

- Uses V2 sharding annotations when generating the GSPMD StableHLO (SHLO) module (i.e., a V1 mesh annotation string like devices=[2,1,4]0,1,2,3,4,5,6,7 becomes devices=[2,1,4]<=[8] in V2).
- Converts the new GSPMD module with the V2 annotations into a Shardy module.
1 parent 763e5b7 commit e0cedaf
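
As an end-to-end usage sketch (the mesh shape, tensor, and partition spec below are illustrative; the commit-specific piece is the CONVERT_SHLO_TO_SHARDY variable, which the Python side reads via os.environ and the compile path reads via GetEnvBool, so it must be set before any sharding annotation or compilation):

```python
import os
# Use "1" so both the Python-side truthiness check and the C++ GetEnvBool agree.
os.environ["CONVERT_SHLO_TO_SHARDY"] = "1"

import torch
import torch_xla.core.xla_model as xm
import torch_xla.runtime as xr
import torch_xla.distributed.spmd as xs

xr.use_spmd()
n = xr.global_runtime_device_count()
mesh = xs.Mesh(list(range(n)), (n, 1), ('data', 'model'))

t = torch.randn(16, 128, device=xm.xla_device())
# With the flag set, mark_sharding routes through Mesh.get_op_sharding_v2 below.
xs.mark_sharding(t, mesh, ('data', 'model'))
```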

8 files changed: +112 -1 lines changed

torch_xla/csrc/init_python_bindings.cpp

Lines changed: 7 additions & 0 deletions
@@ -1561,12 +1561,19 @@ void InitXlaModuleBindings(py::module m) {
 
   // Define the _XLAC.OpSharding class.
   PythonScope<py::class_<xla::OpSharding>>(m, "OpSharding")
+      // Constructor for V1 shardings
      .def_init([](const py::list& tile_assignment,
                   const py::list& group_assignment,
                   const py::list& replication_groups, int sharding_type) {
        return ShardingUtil::CreateOpSharding(
            tile_assignment, group_assignment, replication_groups,
            ShardingUtil::ShardingType(sharding_type));
+      })
+      // Constructor for V2 shardings.
+      .def_init([](const py::list& dims, const py::list& reshape_dims,
+                   const py::list& transpose_perm) {
+        return ShardingUtil::CreateIotaOpSharding(dims, reshape_dims,
+                                                  transpose_perm);
      });
 
  // Define the _XLAC.PjRtPlugin class.
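
For reference, a sketch of how the new three-argument overload is reached from Python (it mirrors the get_op_sharding_v2 call added later in this commit; the concrete values are illustrative):

```python
import torch_xla

# dims, reshape_dims, transpose_perm -> ShardingUtil::CreateIotaOpSharding.
# With an identity transpose this should describe the iota tiling the commit
# message writes as devices=[2,1,4]<=[8].
v2_sharding = torch_xla._XLAC.OpSharding([2, 1, 4], [2, 1, 4], [0, 1, 2])
```

The existing four-argument form (tile_assignment, group_assignment, replication_groups, sharding_type) still resolves to the V1 constructor.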

torch_xla/csrc/runtime/BUILD

Lines changed: 1 addition & 0 deletions
@@ -366,6 +366,7 @@ cc_library(
         "@xla//xla/mlir_hlo:all_passes",
         "@xla//xla/hlo/translate/hlo_to_mhlo:hlo_to_mlir_hlo",
         "@xla//xla/hlo/translate/mhlo_to_hlo:mlir_hlo_to_hlo",
+        "@xla//xla/service/spmd/shardy/stablehlo_round_trip:stablehlo_import",
     ],
 )
 
torch_xla/csrc/runtime/pjrt_computation_client.cpp

Lines changed: 4 additions & 0 deletions
@@ -14,6 +14,7 @@
 #include "torch_xla/csrc/runtime/env_vars.h"
 #include "torch_xla/csrc/runtime/pjrt_registry.h"
 #include "torch_xla/csrc/runtime/stablehlo_helper.h"
+#include "torch_xla/csrc/runtime/sys_util.h"
 #include "torch_xla/csrc/runtime/tensor_source.h"
 #include "torch_xla/csrc/runtime/tf_logging.h"
 #include "torch_xla/csrc/runtime/util.h"
@@ -638,6 +639,9 @@ std::vector<ComputationClient::ComputationPtr> PjRtComputationClient::Compile(
     mlir::ModuleOp mlir_module =
         mlir::ModuleOp::create(mlir::UnknownLoc::get(&context));
     ConvertHloToStableHlo(instance.computation.mutable_proto(), &mlir_module);
+    if (runtime::sys_util::GetEnvBool("CONVERT_SHLO_TO_SHARDY", false)) {
+      ConvertStableHloToSdy(&mlir_module);
+    }
     executable = util::RaisePythonValueErrorOnFailure([&] {
       return fake_xla_compile_
                  ? fake_xla_compile_()
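
Because the flag is read with GetEnvBool inside Compile, it has to be present in the environment before the first graph compilation. A minimal sketch (the computation itself is arbitrary):

```python
import os
os.environ["CONVERT_SHLO_TO_SHARDY"] = "1"  # checked again here, at compile time

import torch
import torch_xla.core.xla_model as xm

x = torch.ones(4, 4, device=xm.xla_device())
y = x @ x
# mark_step forces compilation; with the flag set, the StableHLO module is
# converted to a Shardy (SDY) module inside PjRtComputationClient::Compile.
xm.mark_step()
```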

torch_xla/csrc/runtime/stablehlo_helper.cpp

Lines changed: 10 additions & 0 deletions
@@ -18,6 +18,7 @@
 #include "xla/hlo/translate/hlo_to_mhlo/hlo_to_mlir_hlo.h"
 #include "xla/hlo/translate/mhlo_to_hlo/mlir_hlo_to_hlo.h"
 #include "xla/mlir_hlo/mhlo/transforms/passes.h"
+#include "xla/service/spmd/shardy/stablehlo_round_trip/stablehlo_import.h"
 
 namespace torch_xla {
 
@@ -89,6 +90,7 @@ static absl::Status mhloToStablehloHelper(mlir::ModuleOp* mlir_module,
       torch_xla::runtime::CreateRemoveXlaMarkTensorOpsPass());
   pm.addNestedPass<mlir::func::FuncOp>(mlir::createCanonicalizerPass());
   pm.addNestedPass<mlir::func::FuncOp>(mlir::createCSEPass());
+
   if (!mlir::succeeded(pm.run(*mlir_module))) {
     return absl::Status(
         absl::StatusCode::kInternal,
@@ -111,6 +113,14 @@ void ConvertHloToStableHlo(const xla::HloModuleProto* proto,
       << getHloModuleStr(proto);
 }
 
+void ConvertStableHloToSdy(mlir::ModuleOp* mlir_module) {
+  mlir::PassManager pm(mlir_module->getContext());
+  xla::sdy::addStablehloImportPipeline(pm, false, false);
+  if (!mlir::succeeded(pm.run(*mlir_module))) {
+    XLA_ERROR() << "StableHLO -> SDY conversion failed.\n";
+  }
+}
+
 std::string hloToStablehlo(const xla::HloModuleProto* proto,
                            bool emit_bytecode) {
   mlir::MLIRContext context;

torch_xla/csrc/runtime/stablehlo_helper.h

Lines changed: 2 additions & 0 deletions
@@ -13,6 +13,8 @@ namespace torch_xla {
 std::string hloToStablehlo(const xla::HloModuleProto* proto,
                            bool emit_bytecode);
 
+void ConvertStableHloToSdy(mlir::ModuleOp* mlir_module);
+
 void ConvertHloToStableHlo(const xla::HloModuleProto* proto,
                            mlir::ModuleOp* mlir_module);
 
torch_xla/csrc/xla_sharding_util.cpp

Lines changed: 17 additions & 0 deletions
@@ -218,6 +218,23 @@ bool ShardingUtil::EqualOpShardings(const xla::OpSharding& a,
   return xla::protobuf_util::HaveSameSerialization(a, b);
 }
 
+xla::OpSharding ShardingUtil::CreateIotaOpSharding(
+    const py::list& dims, const py::list& reshape_dims,
+    const py::list& transpose_perm) {
+  auto dims_vec = dims.cast<std::vector<int64_t>>();
+  auto reshape_dims_vec = reshape_dims.cast<std::vector<int64_t>>();
+  auto transpose_perm_vec = transpose_perm.cast<std::vector<int>>();
+  std::vector<xla::OpSharding::Type> subgroup_types;
+  if (dims_vec.size() > transpose_perm.size()) {
+    subgroup_types.push_back(xla::OpSharding::REPLICATED);
+  }
+  return xla::HloSharding::Subgroup(
+             xla::TileAssignment(dims_vec, reshape_dims_vec,
+                                 transpose_perm_vec),
+             subgroup_types)
+      .ToProto();
+}
+
 xla::OpSharding ShardingUtil::CreateOpSharding(
     const py::list& tile_assignment, const py::list& group_assignment,
     const py::list& replication_groups, ShardingType sharding_type) {
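
The REPLICATED branch above is easiest to see with a direct constructor call (a sketch; the values are illustrative and not taken from the commit):

```python
import torch_xla

# dims has one more entry than transpose_perm, so CreateIotaOpSharding appends a
# REPLICATED subgroup type: the trailing tile dimension (size 4) becomes the
# replication subgroup, i.e. a partially replicated (PARTIAL) sharding.
partial = torch_xla._XLAC.OpSharding([2, 1, 4], [2, 4], [0, 1])
```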

torch_xla/csrc/xla_sharding_util.h

Lines changed: 5 additions & 0 deletions
@@ -51,6 +51,11 @@ class ShardingUtil {
                                          const py::list& group_assignment,
                                          const py::list& replication_groups,
                                          ShardingType sharding_type);
+  // Creates an xla::OpSharding for TILED and PARTIAL types using the
+  // HloShardingV2 system.
+  static xla::OpSharding CreateIotaOpSharding(const py::list& dims,
+                                              const py::list& reshape_dims,
+                                              const py::list& transpose_perm);
 
   // Returns the shape of the resulting shards of `tensor` after applying
   // `sharding`. This assumes the shards will be padded to ensure they all

torch_xla/distributed/spmd/xla_sharding.py

Lines changed: 66 additions & 1 deletion
@@ -1,6 +1,7 @@
 import collections
 from collections.abc import Generator, MutableMapping
 import math
+import os
 from collections import OrderedDict, defaultdict
 from dataclasses import dataclass, field
 import torch
@@ -118,9 +119,18 @@ def get_axis_name_idx(self, name: str) -> int:
       return None
     return self.axis_names.index(name)
 
+  def _validate_translated_partition_spec(self, partition_spec: tuple):
+    flat_specs = np.hstack([d for d in partition_spec])
+    specs = [d for d in flat_specs if d is not None]
+    assert all(d >= 0 and d < len(self.mesh_shape) for d in specs), \
+        f"partition_spec ({partition_spec}) contains out of bound index into mesh_shape."
+    assert len(specs) == len(np.unique(specs)), \
+        f"Each device mesh dimension should appear at most once in partition_spec {partition_spec}."
+
   @functools.lru_cache(maxsize=None)
   def _get_op_sharding_args(self, partition_spec: PartitionSpec):
     partition_spec = _translate_named_partition_spec(self, partition_spec)
+    self._validate_translated_partition_spec(partition_spec)
     flat_specs = np.hstack([d for d in partition_spec])
     specs = [d for d in flat_specs if d is not None]
     assert all(d >= 0 and d < len(self.mesh_shape) for d in specs), \
@@ -142,6 +152,57 @@ def _get_op_sharding_args(self, partition_spec: PartitionSpec):
     sharding_type = int(sharding_type)
     return tile_assignment, group_assignment, replication_groups, sharding_type
 
+  @functools.lru_cache(maxsize=None)
+  def _get_op_sharding_args_v2(self, partition_spec: PartitionSpec):
+    """
+    Returns the appropriate dims, reshape_dims, and transpose_perm for the given partition spec.
+    """
+    partition_spec = _translate_named_partition_spec(self, partition_spec)
+    self._validate_translated_partition_spec(partition_spec)
+
+    dims = []
+    used_axes = OrderedDict()
+    for axis in partition_spec:
+      if isinstance(axis, tuple):
+        dim_size = 1
+        for i in axis:
+          assert i is not None, "None not allowed within tuple"
+          dim_size *= self.mesh_shape[i]
+          used_axes[i] = True
+        dims.append(dim_size)
+      elif axis is not None:
+        assert isinstance(axis, int), "Axis must be an int or a tuple of ints"
+        dims.append(self.mesh_shape[axis])
+        used_axes[axis] = True
+      else:
+        # Replicated mesh axis
+        dims.append(1)
+
+    transpose_perm = [k for k in used_axes.keys()]
+    for i in range(len(self.mesh_shape)):
+      if i not in used_axes:
+        dims.append(self.mesh_shape[i])
+        transpose_perm.append(i)
+    reshape_dims = list(self.mesh_shape)
+
+    return dims, reshape_dims, transpose_perm
+
+  @functools.lru_cache(maxsize=None)
+  def get_op_sharding_v2(
+      self, partition_spec: PartitionSpec) -> torch_xla._XLAC.OpSharding:
+    """
+    Return the OpSharding for the given partition spec using V2 annotations.
+    """
+    if len(partition_spec) == 0:
+      return torch_xla._XLAC.OpSharding([], [], [], ShardingType.REPLICATED)
+    sharding_type = _get_sharding_type(partition_spec, self.size())
+    if sharding_type not in (ShardingType.TILED, ShardingType.PARTIAL):
+      return torch_xla._XLAC.OpSharding([], [], [0], sharding_type)
+
+    dims, reshape_dims, transpose_perm = self._get_op_sharding_args_v2(
+        partition_spec)
+    return torch_xla._XLAC.OpSharding(dims, reshape_dims, transpose_perm)
+
   @functools.lru_cache(maxsize=None)
   def get_op_sharding(
       self, partition_spec: PartitionSpec) -> torch_xla._XLAC.OpSharding:
@@ -157,6 +218,7 @@ def get_op_sharding(
 
     tile_assignment, group_assignment, replication_groups, sharding_type = self._get_op_sharding_args(
         partition_spec)
+
     return torch_xla._XLAC.OpSharding(tile_assignment, group_assignment,
                                       replication_groups, sharding_type)
 
@@ -653,7 +715,10 @@ def mark_sharding(t: Union[torch.Tensor, XLAShardedTensor], mesh: Mesh,
     t.shard_(NamedSharding(jmesh, P(*partition_spec)))
     return t
 
-  op_sharding = mesh.get_op_sharding(partition_spec)
+  if os.environ.get('CONVERT_SHLO_TO_SHARDY', False):
+    op_sharding = mesh.get_op_sharding_v2(partition_spec)
+  else:
+    op_sharding = mesh.get_op_sharding(partition_spec)
   annotate_func = torch_xla._XLAC._xla_mark_sharding
   annotate_func(unwrap_sharded_tensor(t), op_sharding)
   # Pass mesh and partition spec information for DTensor compatibility
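
To make the V2 argument construction concrete, a worked example for a hypothetical 2x4 mesh (assuming an 8-device SPMD runtime); the tuples in the comments follow directly from the _get_op_sharding_args_v2 logic above:

```python
import torch_xla.distributed.spmd as xs

# Hypothetical mesh: mesh_shape = (2, 4), axis 0 is 'data', axis 1 is 'model'.
mesh = xs.Mesh(list(range(8)), (2, 4), ('data', 'model'))

# ('data', 'model')    -> dims=[2, 4],    reshape_dims=[2, 4], transpose_perm=[0, 1]
# ('model', 'data')    -> dims=[4, 2],    reshape_dims=[2, 4], transpose_perm=[1, 0]
# (('data', 'model'),) -> dims=[8],       reshape_dims=[2, 4], transpose_perm=[0, 1]
# ('data', None)       -> dims=[2, 1, 4], reshape_dims=[2, 4], transpose_perm=[0, 1]
#   The unused 'model' axis is appended to dims; since len(dims) now exceeds
#   len(transpose_perm), CreateIotaOpSharding adds a REPLICATED subgroup,
#   i.e. a PARTIAL sharding.
dims, reshape_dims, transpose_perm = mesh._get_op_sharding_args_v2(('data', None))
```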
