22 commits
924045e
black
NeoLegends Jan 15, 2025
bb03813
Merge branch 'master' into moritz-shard-mgpu
NeoLegends Feb 4, 2025
9410000
add sharding test
NeoLegends Feb 4, 2025
3991edb
Merge branch 'master' into moritz-shard-mgpu
NeoLegends Feb 7, 2025
c0bce2b
initialize num_shards and shard_index in kwargs_update_from_config
NeoLegends Feb 10, 2025
6261afc
sharding parameters cannot be None anymore
NeoLegends Feb 10, 2025
f6a8942
we no longer need the @properties
NeoLegends Feb 10, 2025
c3ad46e
black
NeoLegends Feb 10, 2025
c3b2c04
Merge branch 'master' into moritz-shard-mgpu
NeoLegends Feb 21, 2025
485b6f4
Merge branch 'master' into moritz-shard-mgpu
NeoLegends Feb 27, 2025
653969c
Merge branch 'master' into moritz-shard-mgpu
NeoLegends Mar 5, 2025
7d753f5
take torch num_workers into account for sharding
NeoLegends Mar 5, 2025
2607de2
set sharding config in torch data pipe
NeoLegends Mar 5, 2025
5a14a5e
MultiProcDataset: support sharding on `"sharding_method": "dedicated"`
NeoLegends Mar 5, 2025
8d56a65
Fix missing assignment of distrib_shard_files
NeoLegends Mar 7, 2025
55d244d
fix sharding when distrib_shard_files is set and dataset_distribution…
NeoLegends Mar 7, 2025
90353a9
Merge branch 'master' into moritz-shard-mgpu
NeoLegends May 6, 2025
1011ace
black
NeoLegends May 6, 2025
3b28635
Merge branch 'master' into moritz-shard-mgpu
NeoLegends May 13, 2025
a3e6ad5
fix lints
NeoLegends May 20, 2025
713fde9
Merge branch 'master' into moritz-shard-mgpu
NeoLegends May 20, 2025
148191d
Merge branch 'master' into moritz-shard-mgpu
NeoLegends Jul 14, 2025
81 changes: 67 additions & 14 deletions returnn/datasets/basic.py
@@ -19,7 +19,7 @@
import numpy
import functools
import typing
from typing import TYPE_CHECKING, Optional, Any, Union, Type, Dict, Sequence, List, Callable
from typing import TYPE_CHECKING, Optional, Any, Tuple, Union, Type, Dict, Sequence, List, Callable

from returnn.log import log
from returnn.engine.batch import Batch, BatchSetGenerator
@@ -111,8 +111,8 @@ def __init__(
min_chunk_size=0,
chunking_variance=0,
estimated_num_seqs=None,
_num_shards=1,
_shard_index=0,
_num_shards=None,
_shard_index=None,
):
"""
:param str name: e.g. "train" or "eval"
@@ -136,8 +136,8 @@ def __init__(
:param str|None seq_order_seq_lens_file: for seq order, use the seq length given by this file
:param int shuffle_frames_of_nseqs: shuffles the frames. not always supported
:param None|int estimated_num_seqs: for progress reporting in case the real num_seqs is unknown
:param int _num_shards: number of shards the data is split into
:param int _shard_index: local shard index, when sharding is enabled
:param int|None _num_shards: number of shards the data is split into
:param int|None _shard_index: local shard index, when sharding is enabled
"""
self.name = name or ("dataset_id%s" % id(self))
self.lock = None # type: Optional[RLock] # Used when manipulating our data potentially from multiple threads.
@@ -171,7 +171,7 @@ def __init__(
self._chunking = chunking
self.chunk_size, self.chunk_step, self.custom_chunking_func = self._parse_chunking(chunking)
self._context_window = context_window
assert 0 <= _shard_index < _num_shards
assert (_shard_index is None and _num_shards is None) or 0 <= _shard_index < _num_shards
self._num_shards = _num_shards
self._shard_index = _shard_index
if isinstance(context_window, (tuple, list)):
@@ -249,6 +249,59 @@ def __reduce__(self):
state = {attr: getattr(self, attr) for attr in ["epoch", "zpad"]}
return Dataset._create_from_reduce, (self.__class__, kwargs, state)

@staticmethod
def _get_rank_and_size() -> Tuple[int, int]:
"""
:return: tuple (rank, size): the global rank and size for distributed trainings
"""
from returnn.config import get_global_config

config = get_global_config(raise_exception=False)
if not config:
return 0, 1
if config.typed_value("torch_distributed") is not None:
import returnn.torch.distributed

ctx = returnn.torch.distributed.get_ctx(config=config)
return ctx.rank(), ctx.size()
elif config.is_true("use_horovod"):
assert config.bool("use_tensorflow", False) or config.value("backend", "").startswith("tensorflow")

import returnn.tf.horovod

ctx = returnn.tf.horovod.get_ctx(config=config)
return ctx.rank(), ctx.size()
else:
return 0, 1

@staticmethod
def _get_default_shard_config():
"""
:return: default shard index and number of shards based on the global config
"""
from returnn.config import get_global_config

config = get_global_config(raise_exception=False)

Member:

I don't like that we access the global config here. I know this follows similar code to _get_default_random_seed_offset, but I don't like it there either. Why is this needed? This should come from outside, or not? Specifically at the place where we call init_dataset, e.g. in the __main__ module. There we also call Dataset.kwargs_update_from_config.

Also, the code is wrong. Distributed training is only one possible source which defines/influences the shard index and the number of shards. There are others, for example MultiProcDataset, or the PyTorch DataLoader num_workers.

Member Author (@NeoLegends), Feb 10, 2025:

> MultiProcDataset

This already sets num_shards and the shard index for its children. The code was always designed such that it only looks at the global config if no value has been set yet. But I agree, this is better now.

> PyTorch DataLoader num_workers

I think this is actually not that trivial to implement, because the torch engine is already given initialized datasets, and it is difficult to change the sharding config after a dataset has been initialized. So factoring in the PyTorch num_workers would need to happen during the initial dataset initialization, which mixes PyTorch code with data initialization code a bit, and I feel that is going to be messy. Do you know a good way? Maybe this is fine after all.

Member Author (@NeoLegends):

I think we cannot achieve both

> But e.g. there could be a test with PyTorch DataLoader num_workers=2 which checks that all data from the dataset was properly covered.

and

> I know this follows similar code as _get_default_random_seed_offset but I also don't like it there. Why is this needed? This should come from outside, or not?

because of

> I think this is actually not that trivial to implement because the torch engine already is given initialized datasets and it's difficult to change the sharding config after having initialized a dataset

However, I think it is worth having proper support for torch_dataloader_opts = {"num_workers": n} with n > 1, because it makes multi-process data loading much simpler for the end user, and this feature can replace MultiProcDataset for simple use cases. So I think I need to revert the changes where num_shards and shard_index are set from the outside, and rather fetch them from inside the dataset when they are needed. At that point the torch worker info is also available, which means we can take it into account properly.

Member:

> I think this is actually not that trivial to implement because the torch engine already is given initialized datasets and it's difficult to change the sharding config after having initialized a dataset.

Why difficult? Maybe we just need a clean dataset API for that, some setter function like set_num_shards_and_shard_idx. And then in ReturnnDatasetIterDataPipe.reset or so we just call that.

Member Author (@NeoLegends):

Hm, I was originally not a fan of the mutability of these properties, but it seems ok now.

Member:

I think we cannot avoid an API like set_num_shards_and_shard_idx because of how the PyTorch data pipeline works.
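
A minimal sketch of the setter idea from this thread, using hypothetical, simplified stand-in classes (set_num_shards_and_shard_idx and the reset hook are assumed names; this is not code from the PR, and the real RETURNN classes differ):

import torch.utils.data


class _ShardableDataset:
    """Stand-in for returnn.datasets.basic.Dataset; only the sharding part is sketched."""

    def __init__(self):
        self._num_shards = 1
        self._shard_index = 0

    def set_num_shards_and_shard_idx(self, num_shards: int, shard_index: int):
        """Hypothetical setter: lets the data pipeline override the sharding config after construction."""
        assert 0 <= shard_index < num_shards
        self._num_shards = num_shards
        self._shard_index = shard_index


class _DatasetIterPipe(torch.utils.data.IterableDataset):
    """Stand-in for ReturnnDatasetIterDataPipe; reset() would run whenever iteration (re)starts."""

    def __init__(self, dataset: _ShardableDataset):
        self._dataset = dataset

    def reset(self):
        # Fold the DataLoader worker id/count into the dataset's sharding config.
        worker_info = torch.utils.data.get_worker_info()
        if worker_info is not None:
            self._dataset.set_num_shards_and_shard_idx(
                num_shards=worker_info.num_workers, shard_index=worker_info.id
            )

    def __iter__(self):
        self.reset()
        return iter(())  # the real pipe would iterate over the dataset's shard here

In a distributed multi-GPU run, the worker count would presumably have to be combined with the distributed rank/size; the later commit "take torch num_workers into account for sharding" goes in that direction.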

if not config:
return 0, 1
dd_cfg = config.typed_value("dataset_distribution", "random_seed_offset")
assert dd_cfg in ["random_seed_offset", "shard"]
return Dataset._get_rank_and_size() if dd_cfg == "shard" else (0, 1)

@property
def num_shards(self) -> int:
""":return: number of shards the data is split into"""
if self._num_shards is None:
self._shard_index, self._num_shards = self._get_default_shard_config()
return self._num_shards

@property
def shard_index(self) -> int:
""":return: local shard index, when sharding is enabled"""
if self._shard_index is None:
self._shard_index, self._num_shards = self._get_default_shard_config()
return self._shard_index

@property
def random_seed_offset(self) -> int:
""":return: random seed offset for shuffling"""
@@ -258,10 +311,10 @@ def random_seed_offset(self) -> int:

def _uses_custom_distributed_sharding(self) -> bool:
"""
:return: if dataset has its own data sharding logic independent of TF/PT.
:return: if the dataset has its own data sharding logic independent of TF/PT.
Leads to a fixed random_seed_offset independent of the workers local rank.
"""
return False
return self.num_shards > 1

def _get_default_random_seed_offset(self):
"""
@@ -642,9 +695,9 @@ def get_seq_order_for_epoch(
seq_index = [
i for i in seq_index if (all_seq_tags[i] not in used_seq_tags, used_seq_tags.add(all_seq_tags[i]))[0]
]
if partition_epoch > 1 or self._num_shards > 1:
if partition_epoch > 1 or self.num_shards > 1:
seq_index = self._apply_partition_epoch_and_sharding(
seq_index, partition_epoch, epoch, self._num_shards, self._shard_index
seq_index, partition_epoch, epoch, self.num_shards, self.shard_index
)
if repeat_epoch > 1:
seq_index = list(seq_index) * repeat_epoch
@@ -736,8 +789,8 @@ def init_seq_order(self, epoch=None, seq_list=None, seq_order=None):
self.epoch = epoch
self.rnd_seq_drop = Random(self._get_random_seed_for_epoch(epoch=epoch))
assert (
self._num_shards == 1 or self.supports_sharding()
), f"{self}: does not support sharding, but got num_shards == {self._num_shards}"
self.num_shards == 1 or self.supports_sharding()
), f"{self}: does not support sharding, but got num_shards == {self.num_shards}"
return False

def finish_epoch(self, *, free_resources: bool = False):
@@ -1553,9 +1606,9 @@ def _dataset_extend_default_kwargs_from_parent_dataset(
default_kwargs = default_kwargs.copy() if default_kwargs else {}
default_kwargs.setdefault("random_seed_offset", parent_dataset.random_seed_offset)
# noinspection PyProtectedMember
default_kwargs.setdefault("_num_shards", parent_dataset._num_shards)
default_kwargs.setdefault("_num_shards", parent_dataset.num_shards)
# noinspection PyProtectedMember
default_kwargs.setdefault("_shard_index", parent_dataset._shard_index)
default_kwargs.setdefault("_shard_index", parent_dataset.shard_index)
return default_kwargs


85 changes: 31 additions & 54 deletions returnn/datasets/distrib_files.py
@@ -137,9 +137,8 @@
get_sub_epoch_dataset: Callable[[List[FileTree]], Dict[str, Any]],
preload_next_n_sub_epochs: int = 1,
buffer_size: int = 1,
distrib_shard_files: bool = False,
distrib_shard_files: Optional[bool] = None,
_meta_info_cache: Optional[Dict[str, Any]] = None,
_distrib_info: Optional[Dict[str, int]] = None,
**kwargs,
):
"""
@@ -148,10 +147,10 @@
:param get_sub_epoch_dataset: callable which returns a dataset dict for a given subset of files
:param preload_next_n_sub_epochs: how many sub epoch datasets to preload
:param buffer_size: buffer size for each worker, amount of seqs to prefetch
:param distrib_shard_files: set to true to shard the data across worker processes in
distributed training scenarios
:param distrib_shard_files: deprecated. Replaced by global config option ``dataset_distribution="shard"``.

Set to true to shard the data across worker processes in distributed training scenarios.

Member, commenting on lines +155 to +157:

I don't think this is necessary to mark it as deprecated.

Suggested change
:param distrib_shard_files: deprecated. Replaced by global config option ``dataset_distribution="shard"``.
Set to true to shard the data across worker processes in distributed training scenarios.
:param distrib_shard_files: deprecated. set to true to shard the data across worker processes in
distributed training scenarios

:param _meta_info_cache: for internal use
:param _distrib_info: for internal use
"""
super().__init__(**kwargs)
self.files = files
@@ -166,21 +165,13 @@
self._workers: Dict[int, _WorkerProcParent] = {} # epoch -> worker
self._files_order_cache: Dict[int, List[List[FileTree]]] = {} # full epoch (0-indexed) -> files order

self.distrib_shard_files = distrib_shard_files
if distrib_shard_files:
assert self._num_shards == 1 and self._shard_index == 0, ( # ensure defaults are set
f"{self}: Cannot use both dataset-sharding via properties _num_shards and _shard index "
f"and {self.__class__.__name__}'s own sharding implementation based on the trainings rank and size."
if distrib_shard_files is not None:
log.print_deprecation_warning(
f"{self.__class__.__name__}' `distrib_shard_files` config option is set. "
"Use global config option `dataset_distribution` instead "
"for the same behavior across more types of datasets."
)

Member, commenting on lines +175 to 179:

I'm not sure this is really necessary to mark this as deprecated.

Suggested change
log.print_deprecation_warning(
f"{self.__class__.__name__}' `distrib_shard_files` config option is set. "
"Use global config option `dataset_distribution` instead "
"for the same behavior across more types of datasets."
)

if _distrib_info:
# If we're in a child process `_get_rank_and_size()` no longer works,
# so we pass the info about the shards via a pickled property.
# See also Dataset.__reduce__.
self._shard_index = _distrib_info["_shard_index"]
self._num_shards = _distrib_info["_num_shards"]
else:
self._shard_index, self._num_shards = _get_rank_and_size()
assert 0 <= self._shard_index < self._num_shards
self._validate_global_shard_cfg(distrib_shard_files)

if _meta_info_cache:
# This allows to skip the lazy init in self.initialize().
@@ -204,10 +195,6 @@ def supports_sharding(self) -> bool:
""":return: whether the dataset supports sharding based on the seq_order"""
return True

@property
def _distrib_info(self):
return {"_num_shards": self._num_shards, "_shard_index": self._shard_index}

@property
def _meta_info_cache(self):
if not self.num_outputs:
@@ -220,9 +207,6 @@ def _meta_info_cache(self):
"file_sizes": self._file_sizes,
}

def _uses_custom_distributed_sharding(self) -> bool:
return self._num_shards > 1

def _lazy_init_num_outputs(self):
if self.num_outputs:
return
@@ -290,11 +274,11 @@ def init_seq_order(self, epoch: Optional[int] = None, seq_list=None, seq_order=None):
else:
raise ValueError(f"{self}: seq_ordering {self.seq_ordering!r} not supported")
file_bins = self._distribute_evenly_by_size(
num_bins=self._num_shards * self.partition_epoch,
num_bins=self.num_shards * self.partition_epoch,
file_sizes=self._file_sizes,
files_order=files_order_flat,
)
self_index_base = self.partition_epoch * self._shard_index
self_index_base = self.partition_epoch * self.shard_index
self_index_end = self_index_base + self.partition_epoch
self._files_order_cache[full_epoch_0idx_] = file_bins[self_index_base:self_index_end]

@@ -328,6 +312,10 @@ def _get_sub_dataset_dict(self, files: List[FileTree]) -> Dict[str, Any]:

dataset_dict = self.get_sub_epoch_dataset(files)
dataset_dict = extend_dataset_dict_from_parent_dataset(dataset_dict, parent_dataset=self)
# We shard by splitting the files list into shards; the sub datasets must not shard any further themselves
if self.num_shards > 1:
dataset_dict["_num_shards"] = 1
dataset_dict["_shard_index"] = 0

flat_sub_dset = tree.flatten_with_path(dataset_dict)

@@ -452,6 +440,21 @@ def get_data_keys(self) -> List[str]:
self._lazy_init_num_outputs()
return self._data_keys

@classmethod
def _validate_global_shard_cfg(cls, distrib_shard_files: bool):
from returnn.config import get_global_config

config = get_global_config(raise_exception=False)
if not config:
return

dd_cfg = config.typed_value("dataset_distribution", None)
if dd_cfg and (distrib_shard_files and dd_cfg != "shard") or (not distrib_shard_files and dd_cfg == "shard"):
raise ValueError(
f"{cls.__name__}: `distrib_shard_files` config ({distrib_shard_files}) mismatch "
f"with global config option `dataset_distribution` ({dd_cfg})."
)


def _get_key_for_file_tree(t: FileTree) -> str:
"""generates a deterministic key given a file tree"""
@@ -460,32 +463,6 @@ def _get_key_for_file_tree(t: FileTree) -> str:
return ":".join(tree.flatten(t))


def _get_rank_and_size() -> Tuple[int, int]:
"""
:return: tuple (rank, size): the global rank and size for distributed trainings
"""

from returnn.config import get_global_config

config = get_global_config(raise_exception=False)
if not config:
return 0, 1
if config.typed_value("torch_distributed") is not None:
import returnn.torch.distributed

ctx = returnn.torch.distributed.get_ctx(config=config)
return ctx.rank(), ctx.size()
elif config.is_true("use_horovod"):
assert config.bool("use_tensorflow", False) or config.value("backend", "").startswith("tensorflow")

import returnn.tf.horovod

ctx = returnn.tf.horovod.get_ctx(config=config)
return ctx.rank(), ctx.size()
else:
return 0, 1


class _WorkerProcParent:
def __init__(
self,
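
Taken together, the changes in this file move the sharding decision from the per-dataset distrib_shard_files flag to the global dataset_distribution option validated above. A sketch of the relevant lines in a user config; the option names (backend, torch_distributed, dataset_distribution) appear in this diff, while the concrete values are illustrative assumptions rather than code from the PR:

# Illustrative RETURNN config snippet, not taken from the PR.
backend = "torch"
torch_distributed = {}  # multi-GPU training; rank and size come from the torch distributed context

# Global switch for how data is distributed across ranks:
#   "random_seed_offset" (default): every rank iterates over all data, shuffled with a per-rank seed offset.
#   "shard": every rank only sees its own shard, e.g. DistributeFilesDataset splits its file list by rank.
dataset_distribution = "shard"

The old per-dataset flag distrib_shard_files on DistributeFilesDataset keeps working, but _validate_global_shard_cfg raises a ValueError if it disagrees with dataset_distribution.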
24 changes: 22 additions & 2 deletions tests/test_Dataset.py
@@ -561,10 +561,9 @@ def create_ogg_zip_txt_only_dataset(*, text: str = "hello world", seq_tag: str =


@contextlib.contextmanager
def create_ogg_zip_txt_only_dataset_mult_seqs(*, seed: int = 1, num_seqs: int = 100, max_seq_len: int = 100):
def create_ogg_zip_txt_only_dataset_mult_seqs_opts(*, seed: int = 1, num_seqs: int = 100, max_seq_len: int = 100):
"""create OggZipDataset"""
import zipfile
from returnn.datasets.audio import OggZipDataset

rnd = numpy.random.RandomState(seed)

@@ -593,6 +592,15 @@ def create_ogg_zip_txt_only_dataset_mult_seqs(*, seed: int = 1, num_seqs: int =
"audio": None,
"targets": {"class": "CharacterTargets", "vocab_file": tmp_vocab_file.name, "seq_postfix": []},
}
yield opts


@contextlib.contextmanager
def create_ogg_zip_txt_only_dataset_mult_seqs(*, seed: int = 1, num_seqs: int = 100, max_seq_len: int = 100):
"""create OggZipDataset"""
from returnn.datasets.audio import OggZipDataset

with create_ogg_zip_txt_only_dataset_mult_seqs_opts(seed=seed, num_seqs=num_seqs, max_seq_len=max_seq_len) as opts:
dataset = init_dataset(opts)
assert isinstance(dataset, OggZipDataset)
yield dataset
@@ -1212,6 +1220,18 @@ def _collect_single_seq(self, seq_idx: int) -> Optional[DatasetSeq]:
assert sub_ep == outer_epoch * multi_epoch + 1 and sub_seq_idx == 0


def test_dataset_sharding():
from returnn.datasets.audio import OggZipDataset

with create_ogg_zip_txt_only_dataset_mult_seqs_opts(num_seqs=10) as dataset_opts:

Member:

I think the test would be a bit nicer if num_seqs were uneven, i.e. not divisible by num_shards.

Suggested change
with create_ogg_zip_txt_only_dataset_mult_seqs_opts(num_seqs=10) as dataset_opts:
with create_ogg_zip_txt_only_dataset_mult_seqs_opts(num_seqs=11) as dataset_opts:

datasets = [init_dataset({**dataset_opts, "_num_shards": 2, "_shard_index": i}) for i in range(2)]
for dataset in datasets:
assert isinstance(dataset, OggZipDataset)
dataset.init_seq_order(epoch=1)
assert dataset.shard_index < dataset.num_shards == 2
assert dataset.num_seqs == 5


if __name__ == "__main__":
better_exchook.install()
if len(sys.argv) <= 1:
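
The review also suggests a test with PyTorch DataLoader num_workers=2 that checks all data from the dataset is covered exactly once. A self-contained sketch of that idea in plain torch (it does not use ReturnnDatasetIterDataPipe or the RETURNN test helpers; all names here are made up for illustration):

import torch.utils.data


class _ShardedRange(torch.utils.data.IterableDataset):
    """Toy iterable dataset that shards range(num_seqs) across DataLoader workers."""

    def __init__(self, num_seqs: int):
        self.num_seqs = num_seqs

    def __iter__(self):
        info = torch.utils.data.get_worker_info()
        num_shards = info.num_workers if info else 1
        shard_index = info.id if info else 0
        # Strided sharding for the sketch; RETURNN's _apply_partition_epoch_and_sharding may split differently.
        return iter(range(shard_index, self.num_seqs, num_shards))


def test_dataloader_sharding_covers_all_seqs():
    num_seqs = 11  # uneven on purpose, following the review suggestion above
    loader = torch.utils.data.DataLoader(_ShardedRange(num_seqs), num_workers=2, batch_size=None)
    seen = sorted(int(x) for x in loader)
    assert seen == list(range(num_seqs))  # every seq covered exactly once across both workers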