 import torch._inductor.config
 import torch.distributed as dist

-from torchchat.distributed.utils import (
+from torchtune.models.convert_weights import meta_to_tune
+
+from torchtune.models.llama3_1._position_embeddings import Llama3ScaledRoPE
+
+from torchtune.models.llama3_2_vision._convert_weights import llama3_vision_meta_to_tune
+
+from torchtune.training import set_default_dtype
+
+from torchchat.distributed.logging_utils import SingletonLogger
+
+from torchchat.distributed.utils import (
     Color as color,
     CUDATrackTime,
-    init_distributed,
     GPUMemoryMonitor,
+    init_distributed,
 )
-from torchchat.distributed.logging_utils import SingletonLogger

 from torchchat.model import Model, ModelArgs, ModelType, Transformer, TransformerArgs
 from torchchat.model_config.model_config import resolve_model_config
 from torchchat.utils.quantize import quantize_model


-from torchtune.models.convert_weights import meta_to_tune
-
-from torchtune.models.llama3_1._position_embeddings import Llama3ScaledRoPE
-
-from torchtune.models.llama3_2_vision._convert_weights import llama3_vision_meta_to_tune
-
-from torchtune.training import set_default_dtype
-
-
 @dataclass
 class BuilderArgs:
     checkpoint_path: Optional[Union[Path, str]] = None
@@ -194,15 +194,19 @@ def from_args(cls, args: argparse.Namespace) -> "BuilderArgs":
         tp = getattr(args, "tp", 1)
         chpt_from = getattr(args, "chpt_from", "hf")
         sdp_backend_dict = {
-            'math': torch.nn.attention.SDPBackend.MATH,
-            'flash_attention': torch.nn.attention.SDPBackend.FLASH_ATTENTION,
-            'efficient_attention': torch.nn.attention.SDPBackend.EFFICIENT_ATTENTION,
-            'cudnn_attention': torch.nn.attention.SDPBackend.CUDNN_ATTENTION,
+            "math": torch.nn.attention.SDPBackend.MATH,
+            "flash_attention": torch.nn.attention.SDPBackend.FLASH_ATTENTION,
+            "efficient_attention": torch.nn.attention.SDPBackend.EFFICIENT_ATTENTION,
+            "cudnn_attention": torch.nn.attention.SDPBackend.CUDNN_ATTENTION,
         }
         attention_backend = sdp_backend_dict[args.attention_backend]
-        if args.device == "cpu" and (args.attention_backend == "efficient_attention"
-                                     or args.attention_backend == "cudnn_attention"):
-            print(f"Warning: {args.attention_backend} )
+        if args.device == "cpu" and (
+            args.attention_backend == "efficient_attention"
+            or args.attention_backend == "cudnn_attention"
+        ):
+            print(
+                f"Warning: {args.attention_backend} 
+            )
             attention_backend = torch.nn.attention.SDPBackend.MATH
         return cls(
             checkpoint_dir=checkpoint_dir,
@@ -321,7 +325,17 @@ def validate_model(
         if model is None:
             return

-        if sum([self.is_tiktoken, self.is_hf_tokenizer, self.is_sentencepiece, self.is_llama_3_2_mm]) != 1:
+        if (
+            sum(
+                [
+                    self.is_tiktoken,
+                    self.is_hf_tokenizer,
+                    self.is_sentencepiece,
+                    self.is_llama_3_2_mm,
+                ]
+            )
+            != 1
+        ):
             raise RuntimeError(f"no tokenizer was found at {self.tokenizer_path} )

         is_tiktoken = self.is_tiktoken
@@ -333,10 +347,10 @@ def validate_model(
         use_hf_tokenizer = model.config.use_hf_tokenizer
         use_other_tokenizer = not (use_tiktoken or use_hf_tokenizer)
         if (
-            (is_tiktoken and not use_tiktoken) or
-            (is_hf_tokenizer and not use_hf_tokenizer) or
-            (is_sentencepiece and not use_other_tokenizer) or
-            (is_llama_3_2_mm and not use_other_tokenizer)
+            (is_tiktoken and not use_tiktoken)
+            or (is_hf_tokenizer and not use_hf_tokenizer)
+            or (is_sentencepiece and not use_other_tokenizer)
+            or (is_llama_3_2_mm and not use_other_tokenizer)
         ):
             raise RuntimeError(
                 "model-specified tokenizer ({}) does not match provided tokenizer ({}) for {}".format(
@@ -534,6 +548,7 @@ def _load_model(builder_args: BuilderArgs) -> Model:
         # AOTI-compiled model will load its own weights.
         # Release weights here to avoid OOM
         import gc
+
         if hasattr(model, "model"):
             model.model = None
         gc.collect()
@@ -591,6 +606,7 @@ def _initialize_model(

             def do_nothing(max_batch_size, max_seq_length):
                 pass
+
             model.setup_caches = do_nothing

             model.forward = torch._export.aot_load(
@@ -628,6 +644,7 @@ def do_nothing(max_batch_size, max_seq_length):

             def do_nothing(max_batch_size, max_seq_length):
                 pass
+
             model.setup_caches = do_nothing

             model.forward = aoti_compiled_model
@@ -702,7 +719,9 @@ def do_nothing(max_batch_size, max_seq_length):
         logger = SingletonLogger.get_logger()

         gpu_memory_monitor = GPUMemoryMonitor("cuda")
-        logger.info(f"{color.yellow} {gpu_memory_monitor.get_device_info()}{color.reset} )
+        logger.info(
+            f"{color.yellow} {gpu_memory_monitor.get_device_info()}{color.reset} 
+        )

         # Model-level config
         if builder_args.params_table:
@@ -713,20 +732,16 @@ def do_nothing(max_batch_size, max_seq_length):
         config = TransformerArgs.from_params(model_config.transformer_args["text"])
         logger.info(f"Transformer Config: {config} )

-        #TODO: Move into head of file after solving circular import
-        from torchchat.distributed.checkpoint_utils import (
-            load_model_weights,
-            )
+        # TODO: Move into head of file after solving circular import
+        from torchchat.distributed.checkpoint_utils import load_model_weights

         # Validate pipeline degree
         assert config.n_layers % pp_degree == 0

         # Create device mesh
         device_mesh = dist.init_device_mesh(
-            "cuda",
-            (pp_degree, tp_degree),
-            mesh_dim_names=("pp", "tp")
-            )
+            "cuda", (pp_degree, tp_degree), mesh_dim_names=("pp", "tp")
+        )
         tp_mesh = device_mesh["tp"]
         pp_mesh = device_mesh["pp"]
         logger.info(f"Created device mesh: {device_mesh}\n{tp_mesh=} {pp_mesh=} )
@@ -755,7 +770,13 @@ def do_nothing(max_batch_size, max_seq_length):
         # Load weights
         logger.info(f"Loading weights for {pp_rank=} {device=} )
         with CUDATrackTime() as timer:
-            load_model_weights(model, builder_args.distribution_path, device, config, builder_args.chpt_from)
+            load_model_weights(
+                model,
+                builder_args.distribution_path,
+                device,
+                config,
+                builder_args.chpt_from,
+            )

         logger.info(
             f"{color.green}{timer.get_time()} {timer.unit} {rank}{color.reset} 
@@ -769,7 +790,7 @@ def do_nothing(max_batch_size, max_seq_length):
         # lanes.
         # TODO: bump up the lane count
         pipeline_lanes = 1
-        seqlen_prefill=1024
+        seqlen_prefill = 1024
         with device:
             model.setup_caches(1, seqlen_prefill, cache_lanes=pipeline_lanes)
