[Typing] Fix pyrefly-ignore in train.py (#2282)

fegin · web-flow · commit f0955d0fd7a8 · 2026-01-28T10:33:58.000-08:00
The main goal is to fix the pyrefly ignores in train.py. Since train.py
uses many components, this PR also fixes some of those components.

Trainer.model_parts should be `list[ModelProtocol]`, but that change would
affect many files, so it will be done in a separate PR.

Some ignores were already unnecessary; it is unclear why they were not
removed earlier using `pyrefly check --remove-unused-ignores`.

Total pyrefly ignores removed: 19
diff --git a/torchtitan/components/dataloader.py b/torchtitan/components/dataloader.py
@@ -9,7 +9,9 @@
 import inspect
 import pickle
 from abc import ABC, abstractmethod
-from typing import Any
+from typing import Any, Iterator
+
+import torch
 
 from torch.distributed.checkpoint.stateful import Stateful
 from torch.utils.data import IterableDataset
@@ -38,11 +40,10 @@ class BaseDataLoader(Stateful, ABC):
     """
 
     @abstractmethod
-    def __iter__(self):
+    def __iter__(self) -> Iterator[tuple[dict[str, torch.Tensor], torch.Tensor]]:
         ...
 
 
-# pyrefly: ignore [inconsistent-inheritance]
 class ParallelAwareDataloader(StatefulDataLoader, BaseDataLoader):
     """Dataloader that is aware of distributed data parallelism.
 
diff --git a/torchtitan/components/ft/manager.py b/torchtitan/components/ft/manager.py
@@ -7,15 +7,16 @@
 import importlib.util
 from contextlib import nullcontext
 from datetime import timedelta
-from typing import Callable, ContextManager, Optional, TYPE_CHECKING, Union
+from typing import Callable, cast, ContextManager, Optional, TYPE_CHECKING, Union
 
 import torch
 import torch.distributed as dist
 
 import torch.nn as nn
 from torch.distributed._composable.fsdp.fully_shard import FSDPModule
 from torch.distributed.distributed_c10d import ReduceOp
-from torchtitan.components.ft.config import FaultTolerance as FTConfig
+from torchtitan.components.ft.config import FaultTolerance as ExtendedFTConfig
+from torchtitan.config import FaultTolerance as FTConfig
 from torchtitan.tools.logging import logger
 
 if importlib.util.find_spec("torchft") is not None:
@@ -119,8 +120,9 @@ def maybe_semi_sync_training(
     """
     If TorchFT is enabled and the config is set, use semi_sync_method
     """
-    semi_sync_method = ft_config.semi_sync_method
-    if ft_config.enable and semi_sync_method is not None:
+    extend_ft_config = cast(ExtendedFTConfig, ft_config)
+    semi_sync_method = extend_ft_config.semi_sync_method
+    if extend_ft_config.enable and semi_sync_method is not None:
         from torchft import local_sgd
 
         assert (
@@ -131,7 +133,7 @@ def maybe_semi_sync_training(
         )
         if semi_sync_method.lower() == "diloco":
             if fragment_fn:
-                model_parts = fragment_fn(model, ft_config, n_layers)
+                model_parts = fragment_fn(model, extend_ft_config, n_layers)
             else:
                 model_parts = [model]
 
@@ -149,17 +151,17 @@ def maybe_semi_sync_training(
                 model_fragments=model_parts,
                 inner_optimizer=optimizer,
                 outer_optimizer=outer_optimizers,
-                sync_every=ft_config.sync_steps,
-                should_quantize=ft_config.should_quantize,
-                fragment_sync_delay=ft_config.fragment_sync_delay,
-                fragment_update_alpha=ft_config.fragment_update_alpha,
+                sync_every=extend_ft_config.sync_steps,
+                should_quantize=extend_ft_config.should_quantize,
+                fragment_sync_delay=extend_ft_config.fragment_sync_delay,
+                fragment_update_alpha=extend_ft_config.fragment_update_alpha,
             )
         elif semi_sync_method.lower() == "local_sgd":
             return local_sgd.LocalSGD(
                 manager=ft_manager._manager,
                 model=model,
                 optimizer=optimizer,
-                sync_every=ft_config.sync_steps,
+                sync_every=extend_ft_config.sync_steps,
             )
         else:
             raise ValueError(
diff --git a/torchtitan/components/metrics.py b/torchtitan/components/metrics.py
@@ -18,7 +18,7 @@
 from torchtitan.distributed import ParallelDims
 from torchtitan.tools import utils
 from torchtitan.tools.logging import logger
-from torchtitan.tools.utils import Color, device_module, device_type
+from torchtitan.tools.utils import Color, device_module, device_type, NoColor
 
 if TYPE_CHECKING:
     from torchtitan.protocols import BaseModelArgs
@@ -195,7 +195,7 @@ def close(self) -> None:
 
 
 def ensure_pp_loss_visible(
-    parallel_dims: ParallelDims, job_config: JobConfig, color: Color
+    parallel_dims: ParallelDims, job_config: JobConfig, color: Color | NoColor
 ) -> None:
     """
     Ensures that the loss is visible on the console for pipeline-parallel training.
diff --git a/torchtitan/components/validate.py b/torchtitan/components/validate.py
@@ -29,7 +29,7 @@ class BaseValidator:
     def __init__(self, job_config: JobConfig):
         self.job_config = job_config
 
-    def validate(self, model_parts: list[nn.Module]) -> dict[str, float]:
+    def validate(self, model_parts: list[nn.Module], step: int) -> None:
         raise NotImplementedError("validate method not implemented")
 
     def should_validate(self, step: int) -> bool:
@@ -154,7 +154,6 @@ def post_dataloading_process(
         return inputs, labels, extra_inputs, extra_kwargs
 
     @torch.no_grad()
-    # pyrefly: ignore [bad-override]
     def validate(
         self,
         model_parts: list[nn.Module],
@@ -170,7 +169,6 @@ def validate(
         device_type = utils.device_type
         num_steps = 0
 
-        # pyrefly: ignore [not-iterable]
         for input_dict, labels in self.validation_dataloader:
             if (
                 self.job_config.validation.steps != -1
@@ -190,7 +188,6 @@ def validate(
 
             # Count valid tokens for this batch
             local_valid_tokens = torch.tensor(0, dtype=torch.int64, device=device_type)
-            # pyrefly: ignore [missing-attribute]
             local_valid_tokens += (labels != IGNORE_INDEX).sum()
 
             # All-reduce token count across DP ranks to get global token count
diff --git a/torchtitan/models/attention.py b/torchtitan/models/attention.py
@@ -68,13 +68,10 @@ def forward(
         max_q = attention_masks.max_q
         max_k = attention_masks.max_k
 
-        # pyrefly: ignore [no-matching-overload]
         xq_packed = xq.transpose(1, 2).flatten(0, 1)  # (bs * seqlen, n_heads, head_dim)
-        # pyrefly: ignore [no-matching-overload]
         xk_packed = xk.transpose(1, 2).flatten(
             0, 1
         )  # (bs * seqlen, n_kv_heads, head_dim)
-        # pyrefly: ignore [no-matching-overload]
         xv_packed = xv.transpose(1, 2).flatten(
             0, 1
         )  # (bs * seqlen, n_kv_heads, head_dim)
diff --git a/torchtitan/models/flux/train.py b/torchtitan/models/flux/train.py
@@ -138,7 +138,6 @@ def forward_backward_step(
             # pyrefly: ignore [bad-assignment]
             global_valid_tokens = dist_utils.dist_sum(local_valid_tokens, batch_mesh)
         else:
-            # pyrefly: ignore [bad-assignment]
             global_valid_tokens = local_valid_tokens.float()
 
         # Keep these variables local to shorten the code as these are
diff --git a/torchtitan/models/flux/validate.py b/torchtitan/models/flux/validate.py
@@ -127,7 +127,6 @@ def validate(
         device_type = dist_utils.device_type
         num_steps = 0
 
-        # pyrefly: ignore [not-iterable]
         for input_dict, labels in self.validation_dataloader:
             if (
                 self.job_config.validation.steps != -1
@@ -139,6 +138,7 @@ def validate(
             if not isinstance(prompt, list):
                 prompt = [prompt]
             for p in prompt:
+                assert isinstance(p, str), f"prompt must be a string, got {type(p)}"
                 if save_img_count != -1 and save_img_count <= 0:
                     break
                 image = generate_image(
diff --git a/torchtitan/train.py b/torchtitan/train.py
@@ -10,7 +10,7 @@
 import os
 import time
 from datetime import timedelta
-from typing import Any, Iterable
+from typing import Any, cast, Iterable, Iterator
 
 import torch
 import torch.distributed.checkpoint.stateful
@@ -28,6 +28,7 @@
 from torchtitan.config import ConfigManager, JobConfig, TORCH_DTYPE_MAP
 from torchtitan.distributed import ParallelDims, utils as dist_utils
 from torchtitan.distributed.context_parallel import prepare_context_parallel_input
+from torchtitan.protocols import ModelProtocol
 from torchtitan.protocols.model_converter import build_model_converters
 from torchtitan.tools import utils
 from torchtitan.tools.logging import init_logger, logger
@@ -46,6 +47,8 @@ class Trainer(torch.distributed.checkpoint.stateful.Stateful):
     # swappable training components in TrainSpec
     tokenizer: train_spec_module.BaseTokenizer | None
     dataloader: train_spec_module.BaseDataLoader
+    # TODO: we should make this list[ModelProtocol] but this will affect many components.
+    # will do this in a separate PR
     model_parts: list[torch.nn.Module]
     loss_fn: train_spec_module.LossFunction
     optimizers: train_spec_module.OptimizersContainer
@@ -97,7 +100,6 @@ def __init__(self, job_config: JobConfig):
         else:
             batch_degree, batch_rank = 1, 0
 
-        # pyrefly: ignore [bad-argument-type]
         self.ft_manager = FTManager(job_config.fault_tolerance)
         batch_degree, batch_rank = self.ft_manager.get_dp_info(batch_degree, batch_rank)
 
@@ -173,12 +175,13 @@ def __init__(self, job_config: JobConfig):
         )
 
         # move sharded model to CPU/GPU and initialize weights via DTensor
+        buffer_device: torch.device | None
         if job_config.checkpoint.create_seed_checkpoint:
             init_device = "cpu"
             buffer_device = None
         elif job_config.training.enable_cpu_offload:
             init_device = "cpu"
-            buffer_device = device_type
+            buffer_device = torch.device(device_type)
         else:
             init_device = device_type
             buffer_device = None
@@ -239,21 +242,18 @@ def __init__(self, job_config: JobConfig):
             for m in self.model_parts:
                 m.to_empty(device=init_device)
                 with torch.no_grad():
-                    # pyrefly: ignore [not-callable]
-                    m.init_weights(buffer_device=buffer_device)
+                    cast(ModelProtocol, m).init_weights(buffer_device=buffer_device)
                 m.train()
 
             # confirm that user will be able to view loss metrics on the console
-            # pyrefly: ignore [bad-argument-type]
             ensure_pp_loss_visible(parallel_dims, job_config, color)
         else:
             # apply PT-D Tensor Parallel, activation checkpointing, torch.compile, Data Parallel
             model = self.train_spec.parallelize_fn(model, parallel_dims, job_config)
 
             model.to_empty(device=init_device)
             with torch.no_grad():
-                # pyrefly: ignore [not-callable]
-                model.init_weights(buffer_device=buffer_device)
+                cast(ModelProtocol, model).init_weights(buffer_device=buffer_device)
             model.train()
 
             self.model_parts = [model]
@@ -384,7 +384,7 @@ def init_distributed(self) -> ParallelDims:
 
     def batch_generator(
         self, data_iterable: Iterable[tuple[dict[str, torch.Tensor], torch.Tensor]]
-    ) -> Iterable[tuple[dict[str, torch.Tensor], torch.Tensor]]:
+    ) -> Iterator[tuple[dict[str, torch.Tensor], torch.Tensor]]:
         """Returns an iterator that processes batches from the data iterator.
 
         Note: Tensors are yielded on CPU. The caller is responsible for moving
@@ -457,8 +457,11 @@ def post_dataloading_process(
 
         attn_type = getattr(self.model_args, "attn_type", "sdpa")
         if attn_type in ["flex", "varlen"]:
-            # pyrefly: ignore [not-callable]
-            extra_kwargs["attention_masks"] = self.model_parts[0].get_attention_masks(
+            assert (
+                self.tokenizer is not None
+            ), "tokenizer is required for flex/varlen attention"
+            model = cast(ModelProtocol, self.model_parts[0])
+            extra_kwargs["attention_masks"] = model.get_attention_masks(
                 input_batch=inputs,
                 tokenizer=self.tokenizer,
                 extra_inputs=extra_inputs,
@@ -543,7 +546,7 @@ def forward_backward_step(
         return loss
 
     def train_step(
-        self, data_iterator: Iterable[tuple[dict[str, torch.Tensor], torch.Tensor]]
+        self, data_iterator: Iterator[tuple[dict[str, torch.Tensor], torch.Tensor]]
     ):
         self.optimizers.zero_grad()
         # Save the current step learning rate for logging
@@ -557,9 +560,7 @@ def train_step(
         microbatches = []
         local_valid_tokens = torch.tensor(0, dtype=torch.int64)
         for _microbatch in range(self.gradient_accumulation_steps):
-            # pyrefly: ignore [no-matching-overload]
             input_dict, labels = next(data_iterator)
-            # pyrefly: ignore [missing-attribute]
             local_valid_tokens += (labels != IGNORE_INDEX).sum()
             microbatches.append((input_dict, labels))
 
@@ -668,7 +669,6 @@ def train(self):
                 leaf_folder=leaf_folder,
             ) as memory_profiler,
             maybe_semi_sync_training(
-                # pyrefly: ignore [bad-argument-type]
                 job_config.fault_tolerance,
                 ft_manager=self.ft_manager,
                 model=self.model_parts[0],
@@ -685,7 +685,6 @@ def train(self):
                 ),
             ),
         ):
-            # pyrefly: ignore [bad-argument-type]
             data_iterator = self.batch_generator(self.dataloader)
             while self.should_continue_training():
                 self.step += 1
@@ -705,7 +704,6 @@ def train(self):
                     self.job_config.validation.enable
                     and self.validator.should_validate(self.step)
                 ):
-                    # pyrefly: ignore [bad-argument-count]
                     self.validator.validate(self.model_parts, self.step)
 
                 # signal the profiler that the next profiling step has started