Skip to content

Commit 0348243

Browse files
committed
Merge remote-tracking branch 'origin' into kylesayrs/sequential-onloading
2 parents 929f678 + 421bd61 commit 0348243

File tree

7 files changed

+55
-46
lines changed

7 files changed

+55
-46
lines changed

examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,15 +66,13 @@
6666
model=model,
6767
**oneshot_kwargs,
6868
stage="sparsity_stage",
69-
output_dir=output_dir,
7069
)
7170

7271
# Sparse finetune
7372
finetune_applied_model = train(
7473
model=oneshot_applied_model,
7574
**oneshot_kwargs,
7675
**training_kwargs,
77-
output_dir=output_dir,
7876
stage="finetuning_stage",
7977
)
8078

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ def localversion_func(version: ScmVersion) -> str:
124124
(
125125
"compressed-tensors==0.9.4"
126126
if BUILD_TYPE == "release"
127-
else "compressed-tensors>=0.9.5a2"
127+
else "compressed-tensors>=0.10.1a2"
128128
),
129129
],
130130
extras_require={

src/llmcompressor/pytorch/model_load/helpers.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,10 @@ def save_checkpoint(
4141
:param save_safetensors: save model checkpoint using safetensors file type
4242
:param save_compressed: save model checkpoint using compressed-tensors format
4343
"""
44+
from llmcompressor.transformers.sparsification.compressed_tensors_utils import (
45+
get_model_compressor, # avoid circular import
46+
)
47+
4448
# saving the model also saves the recipe
4549
model.save_pretrained(
4650
save_path,
@@ -51,6 +55,16 @@ def save_checkpoint(
5155
if processor is not None:
5256
processor.save_pretrained(save_path)
5357

58+
# saving the model modifies the model structure
59+
# as this is only a checkpoint, decompress model to enable future training/oneshot
60+
compressor = get_model_compressor(
61+
model=model,
62+
save_compressed=save_compressed,
63+
skip_sparsity_compression_stats=skip_sparsity_compression_stats,
64+
)
65+
if compressor is not None:
66+
compressor.decompress_model(model)
67+
5468

5569
def fallback_to_cpu(device: str) -> str:
5670
"""

src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py

Lines changed: 15 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import re
33
import weakref
44
from functools import wraps
5-
from typing import Dict, Optional
5+
from typing import Optional
66

77
import torch
88
import transformers
@@ -91,45 +91,27 @@ def save_pretrained_wrapper(
9191
# https://github.com/huggingface/transformers/pull/30488
9292
transformers.modeling_utils.dtype_byte_size = new_dtype_byte_size
9393

94-
# state_dict gets passed in as a kwarg for FSDP models
95-
state_dict = kwargs.pop("state_dict", None)
96-
if state_dict is None:
97-
logger.info("Fetching state_dict - this may take some time")
98-
state_dict = get_state_dict_offloaded_model(model)
99-
100-
logger.info("Fetching compressor")
94+
# compress model using compressor
10195
compressor = get_model_compressor(
10296
model=model,
10397
sparsity_config=sparsity_config,
10498
quantization_format=quantization_format,
10599
save_compressed=save_compressed,
106100
skip_sparsity_compression_stats=skip_sparsity_compression_stats,
107-
state_dict=state_dict,
108101
disable_sparse_compression=disable_sparse_compression,
109102
)
103+
if compressor is not None:
104+
compressor.compress_model(model)
105+
106+
# save (compressed) model structure
107+
original_save_pretrained.__get__(model, model_class)(
108+
save_directory,
109+
safe_serialization=safe_serialization,
110+
**kwargs,
111+
)
110112

111-
if compressor is None:
112-
# model is not compressed or quantized, save as normal
113-
original_save_pretrained_func = original_save_pretrained.__get__(
114-
model, model_class
115-
)
116-
original_save_pretrained_func(
117-
save_directory, state_dict=state_dict, **kwargs
118-
)
119-
return
120-
121-
# make sure we're on the main process when saving
122-
if state_dict is not None and len(state_dict) > 0:
123-
compressed_state_dict = compressor.compress(
124-
model, state_dict, show_progress=True
125-
)
126-
logger.info("Saving compressed model to disk")
127-
original_save_pretrained.__get__(model, model_class)(
128-
save_directory,
129-
state_dict=compressed_state_dict,
130-
safe_serialization=safe_serialization,
131-
**kwargs,
132-
)
113+
# update config to reflect compression
114+
if compressor is not None:
133115
compressor.update_config(save_directory)
134116

135117
# update existing recipe
@@ -197,7 +179,6 @@ def get_model_compressor(
197179
quantization_format: Optional[str] = None,
198180
save_compressed: bool = True,
199181
skip_sparsity_compression_stats: bool = True,
200-
state_dict: Optional[Dict] = None,
201182
disable_sparse_compression: bool = False,
202183
):
203184
"""
@@ -211,12 +192,8 @@ def get_model_compressor(
211192
:param save_compressed: boolean representing to save in a compressed
212193
format
213194
:param skip_sparsity_compression_stats: bool allowing compression stats on std out
214-
:param state_dict: state_dict of the model
215195
:param disable_sparse_compression: bool to skip sparse compression
216196
"""
217-
# find offloaded state dict if none is provided
218-
if state_dict is None:
219-
state_dict = get_state_dict_offloaded_model(model)
220197

221198
if sparsity_config is None:
222199
"""
@@ -244,6 +221,8 @@ def get_model_compressor(
244221
)
245222
sparsity_config = None
246223
else:
224+
state_dict = get_state_dict_offloaded_model(model)
225+
247226
sparsity_config = SparsityConfigMetadata.from_pretrained(
248227
model,
249228
state_dict=state_dict,

tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77
from parameterized import parameterized_class
88
from transformers import AutoConfig
99

10+
from llmcompressor.transformers.sparsification.compressed_tensors_utils import (
11+
get_model_compressor,
12+
)
1013
from tests.testing_utils import parse_params, requires_gpu
1114

1215
CONFIGS_DIRECTORY = "tests/llmcompressor/transformers/finetune/finetune_oneshot_configs"
@@ -34,17 +37,21 @@ def _test_oneshot_and_finetune(self):
3437
output_dir=self.output,
3538
)
3639

37-
train_args = dict(
38-
num_train_epochs=self.num_train_epochs,
39-
precision="bfloat16",
40-
bf16=True,
41-
)
4240
oneshot_model = oneshot(
4341
model=self.model,
4442
**oneshot_args,
4543
stage="test_oneshot_stage",
4644
)
4745

46+
compressor = get_model_compressor(model=oneshot_model, save_compressed=True)
47+
if compressor is not None:
48+
compressor.decompress_model(oneshot_model)
49+
50+
train_args = dict(
51+
num_train_epochs=self.num_train_epochs,
52+
precision="bfloat16",
53+
bf16=True,
54+
)
4855
train(
4956
model=oneshot_model,
5057
**oneshot_args,

tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune_with_tokenizer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,6 @@ def test_oneshot_and_finetune_with_tokenizer(self):
5555
concatenate_data=concatenate_data,
5656
splits=splits,
5757
tokenizer=tokenizer,
58-
output_dir=self.output,
5958
)
6059

6160
oneshot_model = oneshot(
@@ -70,6 +69,7 @@ def test_oneshot_and_finetune_with_tokenizer(self):
7069
max_steps=max_steps,
7170
stage="test_train_stage",
7271
**model_and_data_kwargs,
72+
output_dir=self.output,
7373
)
7474

7575
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(

tests/llmcompressor/transformers/obcq/test_obcq_completion.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def labeled_dataloader(self, dataset_name, model_name):
3535
dataset_manager = TextGenerationDataset.load_from_registry(
3636
dataset_args.dataset,
3737
dataset_args=dataset_args,
38-
split="train",
38+
split=f"train[:{self.num_samples}]",
3939
processor=tokenizer,
4040
)
4141
calib_dataset = dataset_manager()
@@ -51,10 +51,14 @@ def _test_oneshot_completion(self, model_name: str = None):
5151
from llmcompressor import oneshot
5252
from llmcompressor.pytorch.model_load.helpers import get_session_model
5353
from llmcompressor.pytorch.utils import tensors_to_device
54+
from llmcompressor.transformers.sparsification.compressed_tensors_utils import (
55+
get_model_compressor, # avoid circular import
56+
)
5457

5558
oneshot(
5659
model=self.model,
5760
dataset=self.dataset,
61+
splits={"calibration": f"train[:{self.num_samples}]"},
5862
oneshot_device=self.device,
5963
recipe=self.recipe,
6064
max_seq_length=512,
@@ -65,6 +69,13 @@ def _test_oneshot_completion(self, model_name: str = None):
6569
)
6670

6771
first_tiny_model = get_session_model()
72+
compressor = get_model_compressor(
73+
model=first_tiny_model,
74+
save_compressed=True,
75+
skip_sparsity_compression_stats=False,
76+
)
77+
if compressor is not None:
78+
compressor.decompress_model(first_tiny_model)
6879

6980
dataset = "open_platypus"
7081

0 commit comments

Comments
 (0)