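tmp: remove CheckpointSaver (in-proc/out-of-proc savers and `in_proc_checkpoint` option) so FileCheckpointManager dumps state via TensorDumper directly; comment out shuffle/prefetch and set npc = 5 in the instruction dataset reader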

cbalioglu · cbalioglu · commit 90af6f378e59 · 2025-05-08T02:03:44.000Z
diff --git a/src/fairseq2/checkpoint/__init__.py b/src/fairseq2/checkpoint/__init__.py
@@ -15,16 +15,11 @@
     CheckpointNotFoundError as CheckpointNotFoundError,
 )
 from fairseq2.checkpoint._manager import CheckpointSaveError as CheckpointSaveError
-from fairseq2.checkpoint._manager import CheckpointSaver as CheckpointSaver
 from fairseq2.checkpoint._manager import CheckpointState as CheckpointState
 from fairseq2.checkpoint._manager import (
     CheckpointStateProcessor as CheckpointStateProcessor,
 )
 from fairseq2.checkpoint._manager import FileCheckpointManager as FileCheckpointManager
-from fairseq2.checkpoint._manager import InProcCheckpointSaver as InProcCheckpointSaver
-from fairseq2.checkpoint._manager import (
-    OutOfProcCheckpointSaver as OutOfProcCheckpointSaver,
-)
 from fairseq2.checkpoint._manager import Stateful as Stateful
 from fairseq2.checkpoint._metadata_provider import (
     CheckpointMetadataSaver as CheckpointMetadataSaver,
diff --git a/src/fairseq2/checkpoint/_manager.py b/src/fairseq2/checkpoint/_manager.py
@@ -10,16 +10,12 @@
 from collections.abc import Callable, Mapping, Set
 from concurrent.futures import Future
 from copy import deepcopy
-from multiprocessing.pool import Pool
 from os import scandir
 from pathlib import Path
-from pickle import PickleError
 from shutil import Error
-from signal import SIG_IGN, SIGINT, signal
-from typing import ClassVar, Protocol, TypeAlias, cast, final, runtime_checkable
+from typing import Protocol, TypeAlias, cast, final, runtime_checkable
 
 import torch
-import torch.multiprocessing as mp
 from torch import Tensor
 from typing_extensions import override
 
@@ -141,7 +137,7 @@ class FileCheckpointManager(CheckpointManager):
     _checkpoint_dir: Path
     _gangs: Gangs
     _file_system: FileSystem
-    _saver: CheckpointSaver
+    _tensor_dumper: TensorDumper
     _tensor_loader: TensorLoader
     _thread_pool: ThreadPool
     _save_op: Future[Callable[[], None]] | None
@@ -152,7 +148,7 @@ def __init__(
         checkpoint_dir: Path,
         gangs: Gangs,
         file_system: FileSystem,
-        saver: CheckpointSaver,
+        tensor_dumper: TensorDumper,
         tensor_loader: TensorLoader,
         thread_pool: ThreadPool,
     ) -> None:
@@ -167,8 +163,7 @@ def __init__(
 
         self._file_system = file_system
 
-        self._saver = saver
-
+        self._tensor_dumper = tensor_dumper
         self._tensor_loader = tensor_loader
 
         self._thread_pool = thread_pool
@@ -464,7 +459,7 @@ def _do_save_checkpoint(
         def save() -> Callable[[], None]:
             nonlocal state
 
-            self._saver.save(step_nr, state)
+            self._save_state_files(step_nr, state)
 
             del state
 
@@ -540,6 +535,15 @@ def move_to_host(item: object) -> object:
 
         return cast(dict[str, object], move_to_host(state_dict))
 
+    def _save_state_files(self, step_nr: int, state: CheckpointState) -> None:
+        for kind, (file, state_dict) in state.items():
+            try:
+                self._tensor_dumper.dump(state_dict, file)
+            except TensorDumpError as ex:
+                raise CheckpointSaveError(
+                    step_nr, f"The '{kind}' state of step {step_nr} cannot be saved to the '{ex.path}' file. See the nested exception for details."  # fmt: skip
+                ) from ex
+
     def _copy_cc(self, step_nr: int) -> None:
         gangs = self._gangs
 
@@ -983,103 +987,11 @@ def load_error() -> CheckpointError:
 
         return scores
 
-    @override
-    def close(self) -> None:
-        self._saver.close()
-
-
-class CheckpointSaver(Closable):
-    @abstractmethod
-    def save(self, step_nr: int, state: CheckpointState) -> None: ...
-
-
-@final
-class InProcCheckpointSaver(CheckpointSaver):
-    _tensor_dumper: TensorDumper
-
-    def __init__(self, tensor_dumper: TensorDumper) -> None:
-        self._tensor_dumper = tensor_dumper
-
-    @override
-    def save(self, step_nr: int, state: CheckpointState) -> None:
-        _save_state_files(self._tensor_dumper, step_nr, state)
-
     @override
     def close(self) -> None:
         pass
 
 
-@final
-class OutOfProcCheckpointSaver(CheckpointSaver):
-    _pool: Pool
-
-    def __init__(self, pool: Pool) -> None:
-        self._pool = pool
-
-    @staticmethod
-    def create(tensor_dumper: TensorDumper) -> OutOfProcCheckpointSaver:
-        mp.set_sharing_strategy("file_system")
-
-        ctx = mp.get_context("spawn")
-
-        # Do not allow the pool process to handle SIGINT. It will be gracefully
-        # closed when `close()` is called.
-        sig = signal(SIGINT, SIG_IGN)
-
-        try:
-            pool = ctx.Pool(1, _PoolProcess.init, (tensor_dumper,))
-        except (RuntimeError, ValueError, PickleError) as ex:
-            raise CheckpointError(
-                "The checkpoint process pool cannot be initialized. See the nested exception for details."  # fmt: skip
-            ) from ex
-        finally:
-            signal(SIGINT, sig)
-
-        return OutOfProcCheckpointSaver(pool)
-
-    @override
-    def save(self, step_nr: int, state: CheckpointState) -> None:
-        try:
-            self._pool.apply(_PoolProcess.save_state_files, (step_nr, state))
-        except RuntimeError as ex:
-            raise CheckpointError(
-                "The checkpoint process pool has failed to dispatch the save operation. See the nested exception for details."  # fmt: skip
-            ) from ex
-
-    @override
-    def close(self) -> None:
-        self._pool.close()
-
-        self._pool.join()
-
-
-class _PoolProcess:
-    _tensor_dumper: ClassVar[TensorDumper | None] = None
-
-    @staticmethod
-    def init(tensor_dumper: TensorDumper) -> None:
-        _PoolProcess._tensor_dumper = tensor_dumper
-
-    @staticmethod
-    def save_state_files(step_nr: int, state: CheckpointState) -> None:
-        if _PoolProcess._tensor_dumper is None:
-            raise InternalError("`_tensor_dumper` is `None`.")
-
-        _save_state_files(_PoolProcess._tensor_dumper, step_nr, state)
-
-
-def _save_state_files(
-    tensor_dumper: TensorDumper, step_nr: int, state: CheckpointState
-) -> None:
-    for kind, (file, state_dict) in state.items():
-        try:
-            tensor_dumper.dump(state_dict, file)
-        except TensorDumpError as ex:
-            raise CheckpointSaveError(
-                step_nr, f"The '{kind}' state of step {step_nr} cannot be saved to the '{ex.path}' file. See the nested exception for details."  # fmt: skip
-            ) from ex
-
-
 class CheckpointNotFoundError(Exception):
     step_nr: int
 
diff --git a/src/fairseq2/datasets/instruction.py b/src/fairseq2/datasets/instruction.py
@@ -128,7 +128,7 @@ def splits(self) -> set[str]:
 
 
 # TODO: FIX, INFER
-npc = 10
+npc = 5  # 10
 
 
 GENERIC_INSTRUCTION_DATASET_FAMILY: Final = "generic_instruction"
@@ -223,9 +223,9 @@ def create_reader(
             else:
                 builder = DataPipeline.concat(pipelines)
 
-        # Shuffle files. Must be consistent across all processes.
-        if options.example_shuffle_window != 1:
-            builder.shuffle(options.example_shuffle_window, seed=seed)
+        #        # Shuffle files. Must be consistent across all processes.
+        #        if options.example_shuffle_window != 1:
+        #            builder.shuffle(options.example_shuffle_window, seed=seed)
 
         seed += 1
 
@@ -286,9 +286,9 @@ def skip(example: dict[str, Any]) -> bool:
         else:
             raise NotSupportedError(f"`{batching}` is not supported.")
 
-        # Shuffle buckets.
-        if options.batch_shuffle_window != 1:
-            builder.shuffle(options.batch_shuffle_window, seed=seed)
+        ##        # Shuffle buckets.
+        #        if options.batch_shuffle_window != 1:
+        #            builder.shuffle(options.batch_shuffle_window, seed=seed)
 
         seed += 1
 
@@ -308,7 +308,7 @@ def skip(example: dict[str, Any]) -> bool:
             builder.take(options.max_num_batches)
 
         # Prefetch `num_prefetch` batches in background.
-        builder.prefetch(options.num_prefetch)
+        #        builder.prefetch(options.num_prefetch)
 
         # Wrap examples with `SequenceBatch`.
         def to_batch(example: dict[str, Any]) -> SequenceBatch:
diff --git a/src/fairseq2/recipes/common/_checkpoint.py b/src/fairseq2/recipes/common/_checkpoint.py
@@ -8,25 +8,15 @@
 
 from pathlib import Path
 
-from fairseq2.checkpoint import (
-    CheckpointManager,
-    CheckpointSaver,
-    FileCheckpointManager,
-    InProcCheckpointSaver,
-    OutOfProcCheckpointSaver,
-)
+from fairseq2.checkpoint import CheckpointManager, FileCheckpointManager
 from fairseq2.context import RuntimeContext
 from fairseq2.gang import Gangs
-from fairseq2.recipes.config import RegimeSection
 from fairseq2.utils.io import TorchTensorDumper, TorchTensorLoader
 from fairseq2.utils.threading import get_default_thread_pool
 
 
 def create_checkpoint_manager(
-    context: RuntimeContext,
-    regime_section: RegimeSection,
-    gangs: Gangs,
-    output_dir: Path,
+    context: RuntimeContext, gangs: Gangs, output_dir: Path
 ) -> CheckpointManager:
     checkpoint_dir = output_dir.joinpath("checkpoints")
 
@@ -35,15 +25,8 @@ def create_checkpoint_manager(
     tensor_loader = TorchTensorLoader(file_system)
     tensor_dumper = TorchTensorDumper(file_system)
 
-    saver: CheckpointSaver
-
-    if regime_section.in_proc_checkpoint:
-        saver = InProcCheckpointSaver(tensor_dumper)
-    else:
-        saver = OutOfProcCheckpointSaver.create(tensor_dumper)
-
     thread_pool = get_default_thread_pool()
 
     return FileCheckpointManager(
-        checkpoint_dir, gangs, file_system, saver, tensor_loader, thread_pool
+        checkpoint_dir, gangs, file_system, tensor_dumper, tensor_loader, thread_pool
     )
diff --git a/src/fairseq2/recipes/config.py b/src/fairseq2/recipes/config.py
@@ -266,12 +266,6 @@ class RegimeSection:
 
     keep_checkpoint_every_n_steps: int | None = None
 
-    in_proc_checkpoint: bool = False
-    """
-    If ``True``, saves checkpoints in a background thread instead of a child
-    process.
-    """
-
     publish_metrics_after_n_steps: int = 0
 
     publish_metrics_every_n_steps: int | None = None
diff --git a/src/fairseq2/recipes/lm/_instruction_finetune.py b/src/fairseq2/recipes/lm/_instruction_finetune.py
@@ -227,9 +227,7 @@ def load_instruction_finetuner(
 
     gangs = setup_training_gangs(context, config.gang, config.trainer)
 
-    checkpoint_manager = create_checkpoint_manager(
-        context, config.regime, gangs, output_dir
-    )
+    checkpoint_manager = create_checkpoint_manager(context, gangs, output_dir)
 
     seed = config.common.seed
 
diff --git a/src/fairseq2/recipes/lm/_preference_finetune/_recipe.py b/src/fairseq2/recipes/lm/_preference_finetune/_recipe.py
@@ -114,9 +114,7 @@ def load_po_finetuner(
 
     gangs = setup_training_gangs(context, config.gang, config.trainer)
 
-    checkpoint_manager = create_checkpoint_manager(
-        context, config.regime, gangs, output_dir
-    )
+    checkpoint_manager = create_checkpoint_manager(context, gangs, output_dir)
 
     seed = config.common.seed
 
diff --git a/src/fairseq2/recipes/lm/_train.py b/src/fairseq2/recipes/lm/_train.py
@@ -180,9 +180,7 @@ def load_lm_trainer(
 
     gangs = setup_training_gangs(context, config.gang, config.trainer)
 
-    checkpoint_manager = create_checkpoint_manager(
-        context, config.regime, gangs, output_dir
-    )
+    checkpoint_manager = create_checkpoint_manager(context, gangs, output_dir)
 
     seed = config.common.seed
 
diff --git a/src/fairseq2/recipes/mt/_train.py b/src/fairseq2/recipes/mt/_train.py
@@ -226,9 +226,7 @@ def load_mt_trainer(
 
     gangs = setup_training_gangs(context, config.gang, config.trainer)
 
-    checkpoint_manager = create_checkpoint_manager(
-        context, config.regime, gangs, output_dir
-    )
+    checkpoint_manager = create_checkpoint_manager(context, gangs, output_dir)
 
     seed = config.common.seed
 
diff --git a/src/fairseq2/recipes/wav2vec2/_train.py b/src/fairseq2/recipes/wav2vec2/_train.py
@@ -203,9 +203,7 @@ def load_wav2vec2_trainer(
 
     gangs = setup_training_gangs(context, config.gang, config.trainer)
 
-    checkpoint_manager = create_checkpoint_manager(
-        context, config.regime, gangs, output_dir
-    )
+    checkpoint_manager = create_checkpoint_manager(context, gangs, output_dir)
 
     seed = config.common.seed
 
diff --git a/src/fairseq2/recipes/wav2vec2/asr/_train.py b/src/fairseq2/recipes/wav2vec2/asr/_train.py
@@ -222,9 +222,7 @@ def load_wav2vec2_asr_trainer(
 
     gangs = setup_training_gangs(context, config.gang, config.trainer)
 
-    checkpoint_manager = create_checkpoint_manager(
-        context, config.regime, gangs, output_dir
-    )
+    checkpoint_manager = create_checkpoint_manager(context, gangs, output_dir)
 
     seed = config.common.seed