@@ -102,7 +102,7 @@ def _replace_linear_with_linear_8da4w_for_spin_quant(
 ):
     def filter_fn(child: torch.nn.Module, cur_fqn: str) -> bool:
         # Only replace linear layers where the checkpoint contains explicit scales
-        scales_key = f"{cur_fqn}.scale"
+        scales_key = f"{cur_fqn}.scales"
         if isinstance(child, nn.Linear) and scales_key in checkpoint:
             assert _check_linear_int4_k(child.in_features, group_size)
             assert checkpoint[f"{cur_fqn}.weight"].dtype == torch.int8
@@ -155,7 +155,7 @@ def _replace_output_linear_with_linear_int8_for_spinquant(
     dtype: torch.dtype,
 ):
     def filter_fn(child: torch.nn.Module, cur_fqn: str) -> bool:
-        scales_key = f"{cur_fqn}.scale"
+        scales_key = f"{cur_fqn}.scales"
         if (
             isinstance(child, nn.Linear)
             and scales_key in checkpoint
@@ -205,7 +205,7 @@ def _replace_embedding_with_quantized_group_embedding_for_spinquant(
 ):
     def filter_fn(child: torch.nn.Module, cur_fqn: str) -> bool:
         # Only replace embedding layers where the checkpoint contains explicit scales
-        scales_key = f"{cur_fqn}.scale"
+        scales_key = f"{cur_fqn}.scales"
         if isinstance(child, nn.Embedding) and scales_key in checkpoint:
             assert checkpoint[f"{cur_fqn}.weight"].dtype == torch.int8
             assert checkpoint[scales_key].dtype == torch.float32
@@ -250,59 +250,12 @@ def transform_embedding_for_spinquant(
 
 
 def sanitize_checkpoint_from_spinquant(
-    module: torch.nn.Module,
     checkpoint: Any,
-    linear_group_size: int,
-    embedding_group_size: Optional[int] = None,
 ):
     """
     Sanitize the SpinQuant checkpoint.
-        - Renames 'scale' to 'scales'
-        - Groups scales
-        - Removes 'o_weight'
         - Converts all tensors to contiguous format
+        - Squeeze all tensors
     """
-    keys_to_rename = []
-    keys_to_remove = []
-    for k, _ in checkpoint.items():
-        if k.endswith(".scale"):
-            new_key = k + "s"
-            keys_to_rename.append((k, new_key))
-        if k.endswith(".o_weight"):
-            keys_to_remove.append(k)
-
-    for old_key, new_key in keys_to_rename:
-        old_val = checkpoint.pop(old_key)
-        module_name = new_key[0 : new_key.rfind(".")]
-        sub_module = module.get_submodule(module_name)
-        assert sub_module is not None
-        assert (
-            isinstance(sub_module, Int8DynActInt4WeightLinear)
-            or isinstance(sub_module, QuantizedGroupEmbedding)
-            or isinstance(sub_module, Int8DynActInt8WeightLinear)
-        )
-        # Checkpoints with SpinQuant could come with two formats for scales:
-        # 1. scales is grouped by group size
-        # 2. scales is not grouped by group size
-        # We need to handle both cases here.
-        # TODO(lunwenh): remove this once we have a unified format for scales.
-        if isinstance(sub_module, Int8DynActInt4WeightLinear):
-            checkpoint[new_key] = (
-                old_val if linear_group_size == -1 else old_val[:, ::linear_group_size]
-            )
-        elif isinstance(sub_module, Int8DynActInt8WeightLinear):
-            checkpoint[new_key] = old_val[:, 0]
-        elif isinstance(sub_module, QuantizedGroupEmbedding):
-            if (
-                embedding_group_size is None or embedding_group_size == 0
-            ):  # Scales are not grouped
-                checkpoint[new_key] = old_val[:, 0]
-            elif embedding_group_size == -1:  # Scales are grouped by group size
-                checkpoint[new_key] = old_val
-            else:
-                checkpoint[new_key] = old_val[:, ::embedding_group_size]
-
-    for k in keys_to_remove:
-        checkpoint.pop(k)
     for k, v in checkpoint.items():
-        checkpoint[k] = v.contiguous()
+        checkpoint[k] = torch.squeeze(v.contiguous())
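For context, here is a minimal sketch of what the simplified sanitization loop now does to a checkpoint entry. The keys and tensor shapes are purely illustrative (not taken from a real SpinQuant export); the assumption is that per-channel scales arrive with a trailing singleton dimension that the new torch.squeeze call drops.

import torch

# Hypothetical checkpoint entries: an int8 weight plus per-channel scales
# stored as [out_features, 1] (illustrative shapes only).
checkpoint = {
    "layers.0.attention.wq.weight": torch.randint(-128, 127, (64, 64), dtype=torch.int8),
    "layers.0.attention.wq.scales": torch.rand(64, 1, dtype=torch.float32),
}

# Same transformation as the new loop: make each tensor contiguous,
# then remove singleton dimensions.
for k, v in checkpoint.items():
    checkpoint[k] = torch.squeeze(v.contiguous())

assert checkpoint["layers.0.attention.wq.scales"].shape == (64,)     # squeezed
assert checkpoint["layers.0.attention.wq.weight"].shape == (64, 64)  # unchanged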