
Commit 6fdcdb1

Merge remote-tracking branch 'origin' into kylesayrs/sequential-onloading
2 parents fab6fe1 + 5375f18

File tree

10 files changed: +59 -31 lines


examples/quantization_w8a8_fp8/gemma2_example.py

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@
 # 3) Apply quantization and save in compressed-tensors format.
 oneshot(model=model, recipe=recipe, tokenizer=tokenizer)

-# Save to disk in compressed-tensors format.
+# 4) Save to disk in compressed-tensors format.
 SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 tokenizer.save_pretrained(SAVE_DIR)
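
For orientation, the full flow these numbered comments refer to looks roughly as follows; a minimal sketch, assuming the FP8-dynamic recipe used elsewhere in these examples (the model ID and import paths are illustrative and may differ by llmcompressor version):

    from transformers import AutoModelForCausalLM, AutoTokenizer

    from llmcompressor import oneshot
    from llmcompressor.modifiers.quantization import QuantizationModifier

    # 1) Load model and tokenizer (model ID illustrative).
    MODEL_ID = "google/gemma-2-2b-it"
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

    # 2) Configure the quantization recipe (FP8 dynamic, lm_head excluded).
    recipe = QuantizationModifier(targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])

    # 3) Apply quantization.
    oneshot(model=model, recipe=recipe, tokenizer=tokenizer)

    # 4) Save to disk in compressed-tensors format.
    SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
    model.save_pretrained(SAVE_DIR, save_compressed=True)
    tokenizer.save_pretrained(SAVE_DIR)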

examples/quantization_w8a8_fp8/llama3.2_vision_example.py

Lines changed: 1 addition & 4 deletions
@@ -20,10 +20,7 @@
 )

 # Apply quantization.
-oneshot(
-    model=model,
-    recipe=recipe,
-)
+oneshot(model=model, recipe=recipe)

 # Save to disk in compressed-tensors format.
 SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"

examples/quantizing_moe/deepseek_moe_w4a16.py

Lines changed: 5 additions & 4 deletions
@@ -59,9 +59,6 @@ def tokenize(sample):
 # list so they remain at full precision
 recipe = "deepseek_recipe_w4a16.yaml"

-SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16"
-
-
 oneshot(
     model=model,
     dataset=ds,
@@ -70,7 +67,6 @@ def tokenize(sample):
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
     save_compressed=True,
     trust_remote_code_model=True,
-    output_dir=SAVE_DIR,
 )

 # Confirm generations of the quantized model look sane.
@@ -87,6 +83,11 @@ def tokenize(sample):
     "deepseek models with transformers >= 4.48"
 )

+# Save to disk in compressed-tensors format.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
+

 # Run the model on vLLM
 try:
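
The same refactor repeats in the MoE examples that follow: `oneshot` no longer receives `output_dir` (and, where present, `save_compressed`), and saving becomes an explicit step after calibration. A minimal sketch of the resulting save-and-reload round trip, using only names that appear in these diffs:

    # Explicit save after calibration, replacing oneshot(..., output_dir=SAVE_DIR).
    SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16"
    model.save_pretrained(SAVE_DIR, save_compressed=True)  # compressed-tensors format
    tokenizer.save_pretrained(SAVE_DIR)

    # The saved checkpoint reloads like any Hugging Face model.
    model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto")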

examples/quantizing_moe/deepseek_moe_w8a8_fp8.py

Lines changed: 5 additions & 4 deletions
@@ -66,19 +66,20 @@ def tokenize(sample):
     ),
 ]

-SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8"
-
 oneshot(
     model=model,
     dataset=ds,
     recipe=recipe,
     max_seq_length=MAX_SEQUENCE_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
     trust_remote_code_model=True,
-    save_compressed=True,
-    output_dir=SAVE_DIR,
 )

+# Save to disk in compressed-tensors format.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
+
 # Load model after saving
 model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto")

examples/quantizing_moe/deepseek_moe_w8a8_int8.py

Lines changed: 5 additions & 4 deletions
@@ -67,19 +67,20 @@ def tokenize(sample):
     ),
 ]

-SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8"
-
 oneshot(
     model=model,
     dataset=ds,
     recipe=recipe,
     max_seq_length=MAX_SEQUENCE_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
     trust_remote_code_model=True,
-    save_compressed=True,
-    output_dir=SAVE_DIR,
 )

+# Save to disk in compressed-tensors format.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
+
 # Load model after saving
 model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto")

examples/quantizing_moe/mixtral_moe_w8a8_fp8.py

Lines changed: 6 additions & 7 deletions
@@ -19,15 +19,11 @@
 MAX_SEQ_LENGTH = 2048
 NUM_CALIBRATION_SAMPLES = 512

-# Save location of quantized model
-SAVE_DIR = f"{MODEL_ID.split('/')[-1]}-FP8"
-SAVE_COMPRESSED = True
-
+# Recipe
 layers_to_ignore: List[str] = [
     "lm_head",
     "re:.*block_sparse_moe.gate",  # does not quantize well
 ]
-
 recipe = QuantizationModifier(scheme="FP8", targets="Linear", ignore=layers_to_ignore)


@@ -39,10 +35,13 @@
     recipe=recipe,
     max_seq_length=MAX_SEQ_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
-    save_compressed=SAVE_COMPRESSED,
-    output_dir=SAVE_DIR,
 )

+# Save to disk in compressed-tensors format.
+SAVE_DIR = f"{MODEL_ID.split('/')[-1]}-FP8"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
+
 # Load model after saving
 model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto")

examples/quantizing_moe/qwen_moe_w4a16.py

Lines changed: 5 additions & 4 deletions
@@ -59,9 +59,6 @@ def tokenize(sample):
     ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"],
 )

-SAVE_DIR = MODEL_ID.split("/")[1] + "-quantized.w4a16"
-
-
 oneshot(
     model=model,
     dataset=ds,
@@ -70,9 +67,13 @@ def tokenize(sample):
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
     save_compressed=True,
     trust_remote_code_model=True,
-    output_dir=SAVE_DIR,
 )

+# Save to disk in compressed-tensors format.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-quantized.w4a16"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
+
 # Load model after saving
 model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto")

src/llmcompressor/entrypoints/oneshot.py

Lines changed: 1 addition & 2 deletions
@@ -3,6 +3,7 @@
 from typing import Optional

 import torch
+from accelerate.hooks import remove_hook_from_module
 from compressed_tensors.utils import force_cpu_offload
 from loguru import logger
 from torch.utils.data import DataLoader
@@ -14,8 +15,6 @@
 from llmcompressor.entrypoints.utils import post_process, pre_process
 from llmcompressor.pipelines import CalibrationPipeline

-from accelerate.hooks import remove_hook_from_module
-
 __all__ = ["Oneshot", "oneshot"]
src/llmcompressor/entrypoints/utils.py

Lines changed: 1 addition & 1 deletion
@@ -3,6 +3,7 @@
 from pathlib import PosixPath
 from typing import Optional, Tuple

+from accelerate.hooks import remove_hook_from_module
 from loguru import logger
 from torch.nn import Module
 from transformers import (
@@ -27,7 +28,6 @@
 )
 from llmcompressor.typing import Processor
 from llmcompressor.utils.fsdp.helpers import is_fsdp_model
-from accelerate.hooks import remove_hook_from_module


 def pre_process(model_args: "ModelArguments"):

src/llmcompressor/utils/module.py

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+from typing import Callable, Union
+
+import tqdm
+from torch.nn import Module
+
+
+def module_bfs(
+    module: Module,
+    func: Callable[[Module], Module],
+    pre: bool = True,
+    progress: Union[bool, tqdm.tqdm] = False,
+) -> Module:
+    if progress is True:
+        total = len(list(module.modules()))
+        progress = tqdm.tqdm(total=total)
+
+    if pre:
+        module = func(module)
+
+    for name, child in list(module.named_children()):
+        module.add_module(name, module_bfs(child, func, pre, progress))
+
+    if not pre:
+        module = func(module)
+
+    if isinstance(progress, tqdm.tqdm):
+        progress.update(1)
+
+    return module
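
A hedged usage sketch of the new helper: `module_bfs` recursively applies `func` to every module in the tree, re-attaching each (possibly replaced) child via `add_module`; `pre` selects whether a module is visited before or after its children, and `progress=True` shows a tqdm bar. The transform below is illustrative, not part of this commit:

    from torch.nn import Linear, Module, Sequential

    from llmcompressor.utils.module import module_bfs

    def freeze_linear(module: Module) -> Module:
        # Illustrative transform: freeze Linear layers; return others unchanged.
        if isinstance(module, Linear):
            module.requires_grad_(False)
        return module

    model = Sequential(Linear(8, 8), Sequential(Linear(8, 4)))
    model = module_bfs(model, freeze_linear, pre=True, progress=True)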
