
Commit 8830e58

cyyever and S1ro1 authored
Fix typos (#3753)
* Fix typos
* Fix: style

Signed-off-by: cyy <cyyever@outlook.com>
Co-authored-by: S1ro1 <matej.sirovatka@gmail.com>
1 parent 40ebb4b commit 8830e58

File tree: 15 files changed, +47 -47 lines changed


src/accelerate/accelerator.py

Lines changed: 8 additions & 8 deletions
@@ -1327,7 +1327,7 @@ def join_uneven_inputs(self, joinables, even_batches=None):
 
         <Tip warning={true}>
 
-        Overidding `even_batches` will not affect iterable-style data loaders.
+        Overriding `even_batches` will not affect iterable-style data loaders.
 
         </Tip>
 
@@ -1363,7 +1363,7 @@ def join_uneven_inputs(self, joinables, even_batches=None):
 
                 if iterable_dl_seen:
                     warnings.warn(
-                        "Overridding even_batches is only supported for map-style datasets, yet some dataloaders given were iterable"
+                        "Overriding even_batches is only supported for map-style datasets, yet some dataloaders given were iterable"
                     )
             else:
                 even_batches = self.even_batches
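For context on the corrected docstring: a minimal sketch of how `join_uneven_inputs` and the `even_batches` override are typically used. The toy model, optimizer, and dataset below are illustrative and not part of this commit.

import torch
from torch.utils.data import DataLoader, TensorDataset
from accelerate import Accelerator

accelerator = Accelerator()
model = torch.nn.Linear(1, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
dataloader = DataLoader(TensorDataset(torch.randn(17, 1)), batch_size=4)  # last batch is short

model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)

# Temporarily override even_batches for this block. Per the Tip fixed above, the
# override only affects map-style datasets; iterable-style dataloaders trigger the
# warning shown in the second hunk instead.
with accelerator.join_uneven_inputs([model], even_batches=False):
    for (batch,) in dataloader:
        loss = model(batch).mean()
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()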
@@ -1542,7 +1542,7 @@ def prepare(self, *args, device_placement=None):
             and self.state.use_ipex
         ):
             logger.warning(
-                "You are using lower version of PyTorch(< 2.7.0) with ipex acceleration on Intel CPU or XPU, Intel has upstreamed most of the optimizations into stock PyTorch from 2.7.0, we enourage you to install the latest stock PyTorch and enjoy the out-of-experience on Intel CPU/XPU."
+                "You are using lower version of PyTorch(< 2.7.0) with ipex acceleration on Intel CPU or XPU, Intel has upstreamed most of the optimizations into stock PyTorch from 2.7.0, we encourage you to install the latest stock PyTorch and enjoy the out-of-experience on Intel CPU/XPU."
             )
             args = self._prepare_ipex(*args)
         if self.parallelism_config and self.parallelism_config.tp_enabled:
@@ -1672,7 +1672,7 @@ def _prepare_fsdp2(self, *args):
         else:
             model = torch.compile(model, **self.state.dynamo_plugin.to_kwargs())
 
-        # Get old params and canonicalize - we cannonicalize to have the mapping easy
+        # Get old params and canonicalize - we canonicalize to have the mapping easy
         old_named_params = fsdp2_canonicalize_names(self._get_named_parameters(*tuple(result), drop_refs=True))
 
         # Swap the optimizer parameters with empty, so `fully_shard` after will not allocate too much memory
@@ -2888,7 +2888,7 @@ def clip_grad_norm_(self, parameters, max_norm, norm_type=2):
             while isinstance(opt, AcceleratedOptimizer):
                 opt = opt.optimizer
             gradients = xm._fetch_gradients(opt)
-            # Use xm.all_reduce to perform an in-place all-reduce. Recusrsive all-reduce each tensor
+            # Use xm.all_reduce to perform an in-place all-reduce. Recursive all-reduce each tensor
             # one by one in self.reduce is non-inplace.
             xm.all_reduce("sum", gradients, scale=1.0 / self.num_processes)
             # Set is_xla_gradients_synced to True to avoid all-reduce twice in the AcceleratedOptimizer step.
@@ -3047,7 +3047,7 @@ def reduce(self, tensor, reduction="sum", scale=1.0):
             reduction (`str`, *optional*, defaults to "sum"):
                 A reduction type, can be one of 'sum', 'mean', or 'none'. If 'none', will not perform any operation.
             scale (`float`, *optional*, defaults to 1.0):
-                A default scaling value to be applied after the reduce, only valied on XLA.
+                A default scaling value to be applied after the reduce, only valid on XLA.
 
         Returns:
             `torch.Tensor`, or a nested tuple/list/dictionary of `torch.Tensor`:
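A short usage sketch of `Accelerator.reduce`, reflecting the corrected `scale` description; the values are illustrative.

import torch
from accelerate import Accelerator

accelerator = Accelerator()
# Each process contributes its own rank; reduce combines the values across processes.
value = torch.tensor([float(accelerator.process_index)], device=accelerator.device)
total = accelerator.reduce(value, reduction="sum")
mean = accelerator.reduce(value, reduction="mean")
# As the fixed docstring notes, `scale` is only applied on XLA backends.
scaled = accelerator.reduce(value, reduction="sum", scale=0.5)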
@@ -3339,7 +3339,7 @@ def save_model(
 
         Arguments:
             model: (`torch.nn.Module`):
-                Model to be saved. The model can be wrapped or unwraped.
+                Model to be saved. The model can be wrapped or unwrapped.
             save_directory (`str` or `os.PathLike`):
                 Directory to which to save. Will be created if it doesn't exist.
             max_shard_size (`int` or `str`, *optional*, defaults to `"10GB"`):
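A minimal sketch of `save_model` with a prepared (wrapped) model, matching the corrected docstring; the directory name and shard size below are arbitrary.

import torch
from accelerate import Accelerator

accelerator = Accelerator()
model = accelerator.prepare(torch.nn.Linear(8, 2))  # wrapped or unwrapped both work

# Writes the state dict (sharded if needed) into the directory, creating it if missing.
accelerator.save_model(model, "my_checkpoint", max_shard_size="2GB")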
@@ -3450,7 +3450,7 @@ def register_save_state_pre_hook(self, hook: Callable[..., None]) -> hooks.Remov
 
         `hook(models: list[torch.nn.Module], weights: list[dict[str, torch.Tensor]], input_dir: str) -> None`
 
-        The `models` argument are the models as saved in the accelerator state under `accelerator._models`, `weigths`
+        The `models` argument are the models as saved in the accelerator state under `accelerator._models`, `weights`
         argument are the state dicts of the `models`, and the `input_dir` argument is the `input_dir` argument passed
         to [`Accelerator.load_state`].
 
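A sketch of registering a save-state pre hook with the signature quoted above; the hook name, the `.bias` key filter, and the checkpoint directory are hypothetical.

import torch
from accelerate import Accelerator

accelerator = Accelerator()
model = accelerator.prepare(torch.nn.Linear(4, 4))

def drop_bias_entries(models, weights, directory):
    # `models` mirrors `accelerator._models`, `weights` holds their state dicts;
    # editing `weights` in place changes what gets written to disk.
    for state_dict in weights:
        for key in [k for k in state_dict if k.endswith(".bias")]:
            del state_dict[key]

handle = accelerator.register_save_state_pre_hook(drop_bias_entries)
accelerator.save_state("checkpoint_dir")  # hook runs before the weights are serialized
handle.remove()                           # the returned handle detaches the hook again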

src/accelerate/commands/config/update.py

Lines changed: 1 addition & 1 deletion
@@ -60,4 +60,4 @@ def update_command_parser(parser, parents):
 
 def update_config_command(args):
     config_file = update_config(args)
-    print(f"Sucessfully updated the configuration file at {config_file}.")
+    print(f"Successfully updated the configuration file at {config_file}.")

src/accelerate/commands/launch.py

Lines changed: 3 additions & 3 deletions
@@ -493,13 +493,13 @@ def launch_command_parser(subparsers=None):
         "--deepspeed_exclusion_filter",
         default=None,
         type=str,
-        help="DeepSpeed exclusion filter string when using mutli-node setup.",
+        help="DeepSpeed exclusion filter string when using multi-node setup.",
     )
     deepspeed_args.add_argument(
         "--deepspeed_inclusion_filter",
         default=None,
         type=str,
-        help="DeepSpeed inclusion filter string when using mutli-node setup.",
+        help="DeepSpeed inclusion filter string when using multi-node setup.",
     )
     deepspeed_args.add_argument(
         "--deepspeed_multinode_launcher",
@@ -585,7 +585,7 @@ def launch_command_parser(subparsers=None):
         "--fsdp_use_orig_params",
         default="true",
         type=str,
-        help="If True, allows non-uniform `requires_grad` during init, which means support for interspersed frozen and trainable paramteres."
+        help="If True, allows non-uniform `requires_grad` during init, which means support for interspersed frozen and trainable parameters."
         " (useful only when `use_fsdp` flag is passed).",
     )
     fsdp_args.add_argument(

src/accelerate/commands/to_fsdp2.py

Lines changed: 1 addition & 1 deletion
@@ -89,7 +89,7 @@ def convert_config_to_fsdp2(config: dict) -> dict:
     new_fsdp_config = {}
 
     if fsdp_config.get("fsdp_version", 1) == 2:
-        logger.warning("Config already specfies FSDP2, skipping conversion...")
+        logger.warning("Config already specifies FSDP2, skipping conversion...")
         logger.warning(
             "If the config doesn't use new argument names, change `fsdp_version` to `1` and rerun the command."
         )

src/accelerate/data_loader.py

Lines changed: 7 additions & 7 deletions
@@ -75,7 +75,7 @@ class SeedableRandomSampler(RandomSampler):
     Same as a random sampler, except that in `__iter__` a seed can be used.
 
     Needed specifically in distributed cases, when the random generator for each GPU needs to start from the same seed
-    and be fully reproducable on multiple iterations.
+    and be fully reproducible on multiple iterations.
 
     If a custom `generator` is passed, it will rely on its initial seed as well as the current iteration it is on
     (stored in `self.epoch`).
@@ -408,7 +408,7 @@ def end(self):
 class DataLoaderAdapter:
     """
     A class which wraps around a PyTorch `DataLoader` (or variants of it) to be used with the `Accelerator`. For
-    compatability reasons, this class inherits from the class it wraps around, so it can be used as a drop-in.
+    compatibility reasons, this class inherits from the class it wraps around, so it can be used as a drop-in.
     """
 
     def __init__(self, dataset, use_stateful_dataloader=False, batch_sampler=None, **kwargs):
@@ -451,8 +451,8 @@ def load_state_dict(self, state_dict):
     @property
     def __class__(self):
         """
-        In order to maintain backwards compatability with other code, we need to ensure `isinstance(obj, DataLoader)`
-        returs true. This is because some downstream code assumes that the `DataLoader` is the base class of the
+        In order to maintain backwards compatibility with other code, we need to ensure `isinstance(obj, DataLoader)`
+        returns true. This is because some downstream code assumes that the `DataLoader` is the base class of the
         object.
         """
         return self.base_dataloader.__class__
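The corrected docstring describes why `DataLoaderAdapter` reports the wrapped loader's class. A stripped-down illustration of just that `__class__` trick; `ToyAdapter` is hypothetical and omits the real adapter's inheritance and full delegation.

import torch
from torch.utils.data import DataLoader, TensorDataset

class ToyAdapter:
    def __init__(self, base_dataloader):
        self.base_dataloader = base_dataloader

    @property
    def __class__(self):
        # isinstance() falls back to an object's __class__ attribute, so downstream
        # `isinstance(obj, DataLoader)` checks keep passing for the wrapper.
        return self.base_dataloader.__class__

    def __iter__(self):
        yield from self.base_dataloader

loader = DataLoader(TensorDataset(torch.arange(8.0)), batch_size=4)
wrapped = ToyAdapter(loader)
assert isinstance(wrapped, DataLoader)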
@@ -763,12 +763,12 @@ def __init__(
 
         # if a device mesh is provided extract each dimension (dp, fsdp, tp)
         # device mesh may hold any number of dimensions, however,
-        # below code is for targetted support for dp, fsdp and tp
+        # below code is for targeted support for dp, fsdp and tp
 
         # device mesh will be used only if there is tp involved
         # or any multi-dimensional parallelism involving tp
         # (dp, tp) (fsdp, tp) (dp, fsdp, tp)
-        # otherwise the default behavour not using device mesh should be sufficient
+        # otherwise the default behaviour not using device mesh should be sufficient
         # since multi dimensional parallelism devoid of tp would anyway need
         # different batches for each process irrespective of dp or fsdp
         self.submesh_tp = None
@@ -1063,7 +1063,7 @@ def prepare_data_loader(
             ignored otherwise.
         use_seedable_sampler (`bool`, *optional*, defaults to `False`):
             Whether to use the [`~data_loader.SeedableRandomSampler`] instead of a `RandomSampler` for better
-            reproducability. Comes at a cost of potentially different performances due to different shuffling
+            reproducibility. Comes at a cost of potentially different performances due to different shuffling
             algorithms but ensures results will be the *exact* same. Should be paired with `set_seed()` at every
             `self.set_epoch`
         data_seed (`int`, *optional*, defaults to `None`):
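A sketch of the `use_seedable_sampler` flag on `prepare_data_loader`, paired with `set_seed()` as the corrected docstring recommends; the dataset and seed are arbitrary.

import torch
from torch.utils.data import DataLoader, TensorDataset
from accelerate.data_loader import prepare_data_loader
from accelerate.utils import set_seed

dataset = TensorDataset(torch.arange(100.0))
loader = DataLoader(dataset, shuffle=True, batch_size=8)

set_seed(42)  # pair with the seedable sampler so shuffling is reproducible across runs
prepared = prepare_data_loader(loader, use_seedable_sampler=True)

first_epoch = [batch[0].tolist() for batch in prepared]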

src/accelerate/state.py

Lines changed: 2 additions & 2 deletions
@@ -195,7 +195,7 @@ def __init__(self, cpu: bool = False, **kwargs):
             original_backend = kwargs.pop("backend", None)
             backend, distributed_type = self._prepare_backend(cpu, use_sagemaker_dp, original_backend)
             if original_backend is not None and backend != original_backend:
-                raise ValueError(f"Your assigned backend {original_backend} is not avaliable, please use {backend}")
+                raise ValueError(f"Your assigned backend {original_backend} is not available, please use {backend}")
             self.backend = backend
             self.distributed_type = distributed_type
             use_deepspeed = False
@@ -1084,7 +1084,7 @@ def destroy_process_group(self, group=None):
         """
        Destroys the process group. If one is not specified, the default process group is destroyed.
 
-        If `self.fork_lauched` is `True` and `group` is `None`, nothing happens.
+        If `self.fork_launched` is `True` and `group` is `None`, nothing happens.
         """
         PartialState().destroy_process_group(group)
 

src/accelerate/test_utils/scripts/test_ops.py

Lines changed: 3 additions & 3 deletions
@@ -50,7 +50,7 @@ def test_gather_object(state):
     assert gathered_obj == list(range(state.num_processes)), f"{gathered_obj} != {list(range(state.num_processes))}"
 
 
-def test_gather_non_contigous(state):
+def test_gather_non_contiguous(state):
     # Skip this test because the 'is_contiguous' function of XLA tensor always returns True.
     if state.distributed_type == DistributedType.XLA:
         return
@@ -160,8 +160,8 @@ def main():
     test_gather(state)
     state.print("testing gather_object")
     test_gather_object(state)
-    state.print("testing gather non-contigous")
-    test_gather_non_contigous(state)
+    state.print("testing gather non-contiguous")
+    test_gather_non_contiguous(state)
     state.print("testing broadcast")
     test_broadcast(state)
     state.print("testing pad_across_processes")
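For reference, a sketch of what the renamed test presumably exercises: gathering a non-contiguous tensor (shapes and values are illustrative; on XLA the contiguity assert would not hold, which is why the test skips there).

import torch
from accelerate import Accelerator

accelerator = Accelerator()
# A transpose yields a non-contiguous view; gather should handle it anyway.
tensor = torch.arange(12.0, device=accelerator.device).reshape(3, 4).T
assert not tensor.is_contiguous()
gathered = accelerator.gather(tensor)  # shape: (4 * num_processes, 3)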

src/accelerate/test_utils/testing.py

Lines changed: 2 additions & 2 deletions
@@ -598,7 +598,7 @@ def require_torchdata_stateful_dataloader(test_case):
 def run_first(test_case):
     """
     Decorator marking a test with order(1). When pytest-order plugin is installed, tests marked with this decorator are
-    garanteed to run first.
+    guaranteed to run first.
 
     This is especially useful in some test settings like on a Gaudi instance where a Gaudi device can only be used by a
     single process at a time. So we make sure all tests that run in a subprocess are launched first, to avoid device
@@ -617,7 +617,7 @@ def run_first(test_case):
 class TempDirTestCase(unittest.TestCase):
     """
     A TestCase class that keeps a single `tempfile.TemporaryDirectory` open for the duration of the class, wipes its
-    data at the start of a test, and then destroyes it at the end of the TestCase.
+    data at the start of a test, and then destroys it at the end of the TestCase.
 
     Useful for when a class or API requires a single constant folder throughout it's use, such as Weights and Biases
 

src/accelerate/tracking.py

Lines changed: 1 addition & 1 deletion
@@ -111,7 +111,7 @@ class GeneralTracker:
     (`bool`): Whether the logger requires a directory to store their logs. `tracker` (`object`): Should return internal
     tracking mechanism used by a tracker class (such as the `run` for wandb)
 
-    Implementations can also include a `main_process_only` (`bool`) attribute to toggle if relevent logging, init, and
+    Implementations can also include a `main_process_only` (`bool`) attribute to toggle if relevant logging, init, and
     other functions should occur on the main process or across all processes (by default will use `True`)
     """
 

src/accelerate/utils/bnb.py

Lines changed: 1 addition & 1 deletion
@@ -314,7 +314,7 @@ def _replace_with_bnb_layers(
     """
     Private method that wraps the recursion for module replacement.
 
-    Returns the converted model and a boolean that indicates if the conversion has been successfull or not.
+    Returns the converted model and a boolean that indicates if the conversion has been successful or not.
     """
     # bitsandbytes will initialize CUDA on import, so it needs to be imported lazily
     import bitsandbytes as bnb

0 commit comments
