
Commit 9a35627

[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
1 parent e6889ed commit 9a35627

6 files changed (+67 -75 lines)


examples/pytorch/custom_handler_fp8_fsdp1n2_compile/handlers/fp8_training_handler.py

Lines changed: 4 additions & 4 deletions
@@ -2,7 +2,7 @@
 import logging
 import operator
 from dataclasses import dataclass
-from typing import Dict, List, Union
+from typing import Union
 
 import torch
 import torch.nn as nn
@@ -46,7 +46,7 @@ class FP8Config:
 class Float8TrainingHandler:
     """Handler for configuring models for FP8 training using torchao."""
 
-    def __init__(self, args: FP8Config, model_path: str, parallel_dims: Dict[str, bool]):
+    def __init__(self, args: FP8Config, model_path: str, parallel_dims: dict[str, bool]):
         """Initializes the handler for FP8 training and configuration.
 
         Args:
@@ -164,7 +164,7 @@ def convert_to_float8_training(self, model: nn.Module, module_filter_fn: callabl
             f"Swapped to Float8Linear layers with enable_fsdp_float8_all_gather={self.config.enable_fsdp_float8_all_gather}"
         )
 
-    def precompute_float8_dynamic_scale_for_fsdp(self, model: Union[nn.Module, List[nn.Module]]):
+    def precompute_float8_dynamic_scale_for_fsdp(self, model: Union[nn.Module, list[nn.Module]]):
         if not self.enable_fp8 or not self.precompute_scale:
             return
 
@@ -174,7 +174,7 @@ def precompute_float8_dynamic_scale_for_fsdp(self, model: Union[nn.Module, List[
         for m in models:
             precompute_float8_dynamic_scale_for_fsdp(m)
 
-    def sync_float8_amax_and_scale_history(self, model: Union[nn.Module, List[nn.Module]]):
+    def sync_float8_amax_and_scale_history(self, model: Union[nn.Module, list[nn.Module]]):
         if not self.enable_fp8 or not self.delayed_scaling:
             return
 
examples/pytorch/custom_handler_fp8_fsdp1n2_compile/handlers/fsdp2_handler.py

Lines changed: 4 additions & 4 deletions
@@ -75,13 +75,13 @@ def wrap_model(self, model: nn.Module):
         dp_mesh = self.device_mesh["data_parallel"]
         assert dp_mesh.size() > 1, "FSDP requires at least two devices."
 
-        fsdp_policy = dict(
-            mesh=dp_mesh,
-            mp_policy=self.MixedPrecisionPolicy(
+        fsdp_policy = {
+            "mesh": dp_mesh,
+            "mp_policy": self.MixedPrecisionPolicy(
                 param_dtype=torch.bfloat16,
                 reduce_dtype=torch.float32,
             ),
-        )
+        }
         if self.args.enable_cpu_offload:
             fsdp_policy["offload_policy"] = self.CPUOffloadPolicy()
 
examples/pytorch/custom_handler_fp8_fsdp1n2_compile/tests/test_fp8_training_handler.py

Lines changed: 11 additions & 13 deletions
@@ -1,6 +1,7 @@
 import unittest
 from unittest.mock import patch
 
+import pytest
 import torch.nn as nn
 from handlers.fp8_training_handler import Float8TrainingHandler, FP8Config
 from lightning.pytorch.demos import Transformer
@@ -38,33 +39,30 @@ def setUp(self):
     @patch("handlers.fp8_training_handler.is_sm89_or_later", return_value=True)
     def test_handler_initialization(self, mock_sm89):
         handler = Float8TrainingHandler(self.args, self.model_path, self.parallel_dims)
-        self.assertTrue(handler.enable_fp8)
-        self.assertFalse(handler.compile)
-        self.assertIsNotNone(handler.args)
-        self.assertIsNotNone(handler.parallel_dims)
+        assert handler.enable_fp8
+        assert not handler.compile
+        assert handler.args is not None
+        assert handler.parallel_dims is not None
 
     @patch("handlers.fp8_training_handler.is_sm89_or_later", return_value=True)
     def test_compile_flag(self, mock_sm89):
         self.args.enable_torch_compile = True
         handler = Float8TrainingHandler(self.args, self.model_path, self.parallel_dims)
-        self.assertTrue(handler.compile)
+        assert handler.compile
 
     @patch("handlers.fp8_training_handler.is_sm89_or_later", return_value=False)
     def test_handler_disabled_on_unsupported_hardware(self, mock_sm89):
         # Assert that the RuntimeError is raised
-        with self.assertRaises(RuntimeError) as context:
+        with pytest.raises(RuntimeError) as context:
             Float8TrainingHandler(self.args, self.model_path, self.parallel_dims)
 
         # Check that the error message matches the expected text
-        self.assertIn(
-            "Float8Linear operation is not supported on the current hardware.",
-            str(context.exception),
-        )
+        assert "Float8Linear operation is not supported on the current hardware." in str(context.exception)
 
     def test_handler_disabled_when_fp8_not_enabled(self):
         self.args.enable_fp8 = False
         handler = Float8TrainingHandler(self.args, self.model_path, self.parallel_dims)
-        self.assertFalse(handler.enable_fp8)
+        assert not handler.enable_fp8
 
     @patch("handlers.fp8_training_handler.is_sm89_or_later", return_value=True)
     def test_convert_to_float8_training(self, mock_sm89):
@@ -75,9 +73,9 @@ def test_convert_to_float8_training(self, mock_sm89):
         print(self.model)
         for module_name, module in self.model.named_modules():
             if any(proj in module_name for proj in ["w1", "w2", "w3"]):  # Float8Linear
-                self.assertIsInstance(module, Float8Linear, f"{module_name} should be Float8Linear")
+                assert isinstance(module, Float8Linear), f"{module_name} should be Float8Linear"
             elif isinstance(module, nn.Linear):
-                self.assertNotIsInstance(module, Float8Linear, f"{module_name} should not be Float8Linear")
+                assert not isinstance(module, Float8Linear), f"{module_name} should not be Float8Linear"
 
     @patch("handlers.fp8_training_handler.is_sm89_or_later", return_value=True)
     def test_precompute_float8_dynamic_scale_for_fsdp(self, mock_sm89):

examples/pytorch/custom_handler_fp8_fsdp1n2_compile/tests/test_fsdp2_handler.py

Lines changed: 12 additions & 11 deletions
@@ -1,6 +1,7 @@
 import unittest
 from unittest.mock import MagicMock, patch
 
+import pytest
 import torch.nn as nn
 from handlers.fsdp2_handler import FSDP2Config, FSDP2Handler
 
@@ -37,15 +38,15 @@ def setUp(self):
 
         class ModelWrapper(nn.Module):
             def __init__(self, model):
-                super(ModelWrapper, self).__init__()
+                super().__init__()
                 self.model = model  # The wrapped Transformer model
 
             def forward(self, *args, **kwargs):
                 return self.model(*args, **kwargs)
 
         class InnerModel(nn.Module):
            def __init__(self, num_layers, input_size, hidden_size):
-                super(InnerModel, self).__init__()
+                super().__init__()
                # Initialize a ModuleList to store the layers
                self.layers = nn.ModuleList()
                for _ in range(num_layers):
@@ -77,23 +78,23 @@ def test_wrap_model(self, mock_checkpoint_wrapper_func, mock_fully_shard_func):
         wrapped_model = handler.wrap_model(self.model)
 
         # Ensure fully_shard and checkpoint_wrapper are called
-        self.assertTrue(mock_fully_shard_func.called, "fully_shard was not called")
-        self.assertTrue(mock_checkpoint_wrapper_func.called, "checkpoint_wrapper was not called")
+        assert mock_fully_shard_func.called, "fully_shard was not called"
+        assert mock_checkpoint_wrapper_func.called, "checkpoint_wrapper was not called"
 
         # Verify that the model's layers have been wrapped
-        self.assertIsNotNone(wrapped_model, "wrapped_model is None")
+        assert wrapped_model is not None, "wrapped_model is None"
         mock_fully_shard_func.assert_called()
 
         # Ensure that checkpoint_wrapper is called for each layer
-        self.assertEqual(mock_checkpoint_wrapper_func.call_count, len(self.model.model.layers))
+        assert mock_checkpoint_wrapper_func.call_count == len(self.model.model.layers)
         # Ensure that fully_shard is called for each layer + full module
-        self.assertEqual(mock_fully_shard_func.call_count, len(self.model.model.layers) + 1)
+        assert mock_fully_shard_func.call_count == len(self.model.model.layers) + 1
 
     def test_wrap_model_with_single_device(self):
         # Simulate single device
         self.device_mesh["data_parallel"].size.return_value = 1
         handler = FSDP2Handler(self.args, self.device_mesh)
-        with self.assertRaises(AssertionError):
+        with pytest.raises(AssertionError):
             handler.wrap_model(self.model)
 
     @patch("torch.distributed._composable.fsdp.fully_shard", side_effect=mock_fully_shard)
@@ -103,8 +104,8 @@ def test_enable_cpu_offload(self, mock_fully_shard_func):
         handler.wrap_model(self.model)
         # Check if CPUOffloadPolicy is used
         args, kwargs = mock_fully_shard_func.call_args
-        self.assertIn("offload_policy", kwargs)
-        self.assertIsNotNone(kwargs["offload_policy"])
+        assert "offload_policy" in kwargs
+        assert kwargs["offload_policy"] is not None
 
     @patch("torch.distributed._composable.fsdp.fully_shard", side_effect=mock_fully_shard)
     @patch(
@@ -116,4 +117,4 @@ def test_diable_gradient_checkpointing(self, mock_checkpoint_wrapper_func, mock_
         handler = FSDP2Handler(self.args, self.device_mesh)
         handler.wrap_model(self.model)
         # Check if gradient checkpointing is disabled
-        self.assertFalse(mock_checkpoint_wrapper_func.called, "Error: checkpoint_wrapper was unexpectedly called.")
+        assert not mock_checkpoint_wrapper_func.called, "Error: checkpoint_wrapper was unexpectedly called."

examples/pytorch/custom_handler_fp8_fsdp1n2_compile/tests/test_torch_compile_handler.py

Lines changed: 3 additions & 11 deletions
@@ -43,11 +43,7 @@ def test_compile_transformer_encoder_layers(self, mock_compile):
         handler.compile_model(self.model)
 
         # Ensure torch.compile was called with the correct layer
-        self.assertEqual(
-            mock_compile.call_count,
-            self.num_layers,
-            f"Expected mock_compile to be called {self.num_layers} times",
-        )
+        assert mock_compile.call_count == self.num_layers, f"Expected mock_compile to be called {self.num_layers} times"
 
     def test_compile_disabled(self):
         handler = TorchCompileHandler(False, self.model_path)
@@ -74,9 +70,5 @@ def forward(self, x):
         handler.compile_model(model)
 
         # LlamaMLP inside NestedModel should be compiled
-        self.assertTrue(mock_compile.called)
-        self.assertEqual(
-            mock_compile.call_count,
-            self.num_layers,
-            f"Expected mock_compile to be called {self.num_layers} times",
-        )
+        assert mock_compile.called
+        assert mock_compile.call_count == self.num_layers, f"Expected mock_compile to be called {self.num_layers} times"

examples/pytorch/custom_handler_fp8_fsdp1n2_compile/train.py

Lines changed: 33 additions & 32 deletions
@@ -1,20 +1,21 @@
 import argparse
-from dataclasses import dataclass
 import logging
+from dataclasses import dataclass
 
-import torch.distributed as dist
 import lightning as L
 import torch
+import torch.distributed as dist
 import torch.nn as nn
 import torch.nn.functional as F
-from lightning.pytorch.demos import Transformer, WikiText2
+from lightning.pytorch.demos import WikiText2
 from lightning.pytorch.strategies import FSDPStrategy, ModelParallelStrategy
 from torch.distributed.fsdp import BackwardPrefetch, MixedPrecision
 from torch.utils.data import DataLoader
 
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 log = logging.getLogger(__name__)
 
+
 @dataclass
 class Args:
     vocab_size: int = 32000
@@ -24,56 +25,56 @@ class Args:
     enable_gradient_checkpointing: bool = False
     enable_fsdp2: bool = False
 
+
 class SimpleLayer(nn.Module):
     def __init__(self, hidden_size):
-        super(SimpleLayer, self).__init__()
+        super().__init__()
         self.linear = nn.Linear(hidden_size, hidden_size)
         self.activation = nn.ReLU()
 
     def forward(self, x):
         print(f"Input shape before Linear: {x.shape}")
         x = self.linear(x)
         print(f"Output shape after Linear: {x.shape}")
-        x = self.activation(x)
-        return x
+        return self.activation(x)
+
 
 class InnerModel(nn.Module):
     def __init__(self, num_layers, hidden_size, vocab_size=32000):
-        super(InnerModel, self).__init__()
+        super().__init__()
         # Embedding layer
         self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=hidden_size)
         # Initialize a ModuleList to store the intermediate layers
         self.layers = nn.ModuleList([SimpleLayer(hidden_size) for _ in range(num_layers)])
         self.lm_head = nn.Linear(hidden_size, vocab_size)
 
-
     def forward(self, x):
         x = self.embedding(x)
         # Pass the input through each layer sequentially
         for layer in self.layers:
             x = layer(x)
-        x = self.lm_head(x)
-        return x
+        return self.lm_head(x)
 
 
 class ModelWrapper(nn.Module):
     def __init__(self, model):
-        super(ModelWrapper, self).__init__()
+        super().__init__()
         self.model = model  # The wrapped Transformer model
 
     def forward(self, *args, **kwargs):
         return self.model(*args, **kwargs)
 
 
 class LanguageModel(L.LightningModule):
-    def __init__(self,
-                 vocab_size=32000,
-                 enable_fp8 = False,
-                 enable_fsdp2 = False,
-                 enable_torch_compile = False,
-                 enable_gradient_checkpointing = False,
-                 enable_cpu_offload = False
-                 ):
+    def __init__(
+        self,
+        vocab_size=32000,
+        enable_fp8=False,
+        enable_fsdp2=False,
+        enable_torch_compile=False,
+        enable_gradient_checkpointing=False,
+        enable_cpu_offload=False,
+    ):
         super().__init__()
         self.model = None
         self.vocab_size = vocab_size
@@ -83,15 +84,14 @@ def __init__(self,
         self.enable_gradient_checkpointing = enable_gradient_checkpointing
         self.enable_cpu_offload = enable_cpu_offload
         self.model_path = "dummy"  # placeholder
-        self.parallel_dims = {
-            "dp_shard_enabled": True if torch.cuda.device_count() > 1 else False
-        }  # only used for FP8 training
+        self.parallel_dims = {"dp_shard_enabled": torch.cuda.device_count() > 1}  # only used for FP8 training
 
     def log_model_stage(self, stage: str):
-        """
-        Logs the current state of the model with a description of the stage.
+        """Logs the current state of the model with a description of the stage.
+
         Args:
             stage (str): Description of the current model stage.
+
         """
         log.warning(f"Model at stage: {stage}\n{self.model}")
 
@@ -129,7 +129,7 @@ def configure_fsdp2(self):
 
     def configure_fp8(self):
         # Setup fp8 training, if enable_fp8 is false, it will create a fake handler
-        from handlers.fp8_training_handler import FP8Config, Float8TrainingHandler
+        from handlers.fp8_training_handler import Float8TrainingHandler, FP8Config
 
         fp8_config = FP8Config(
             enable_fp8=self.enable_fp8,
@@ -207,13 +207,14 @@ def train(args):
     dataset = WikiText2()
     train_dataloader = DataLoader(dataset, num_workers=8, batch_size=1)
 
-    model = LanguageModel(vocab_size=args.vocab_size,
-                          enable_fp8 = args.enable_fp8,
-                          enable_fsdp2 = args.enable_fsdp2,
-                          enable_torch_compile = args.enable_torch_compile,
-                          enable_gradient_checkpointing = args.enable_gradient_checkpointing,
-                          enable_cpu_offload = args.enable_cpu_offload,
-                          )
+    model = LanguageModel(
+        vocab_size=args.vocab_size,
+        enable_fp8=args.enable_fp8,
+        enable_fsdp2=args.enable_fsdp2,
+        enable_torch_compile=args.enable_torch_compile,
+        enable_gradient_checkpointing=args.enable_gradient_checkpointing,
+        enable_cpu_offload=args.enable_cpu_offload,
+    )
 
     if args.enable_fsdp2:
         strategy = ModelParallelStrategy(
