
Commit e628e13

[Tracing] Onload sequential ancestors (#1690)
## Background ##

The idea of tracing is to capture all of the events and operations that happen during a model's execution. However, some operations are too complex to capture in the graph. One of these skipped operations is the onloading/offloading of weights, because this operation is too complicated to be captured by torch dynamo.

```
     parent
     |     \
seq_target  weight
```

The problem is that, if we skip onloading/offloading during graph capture, weights which appear in the graph might not be onloaded at execution time. This isn't a problem for weights at an equal or higher granularity than the sequential targets, since those weights belong to modules which are called by the graph (and therefore trigger onloading hooks).

```
hidden_states = ...
... = self.decoder_layer(hidden_states)  # module call triggers onloading hooks
```

```
hidden_states = ...
hidden_states = hidden_states + self.bias  # self.bias might not be onloaded;
                                           # we never captured the onloading hook in the graph
```

### Big Code ###

The GPT Big Code model is the first model we've seen like this, where a parameter is directly attached to a module which is a caller (ancestor) of the sequential targets. It has this extra, [directly attached bias parameter](https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py#L416-L418). A sketch of how such ancestors can be identified is shown below, after this message.

```
  GPTBigCodeModel
     |        \
GPTBigCodeBlock  bias
```

## Purpose ##

* Support models which have call graph ancestors with directly attached parameters

## Changes ##

* Change the policy for these kinds of modules: instead of skipping the tracing of their internals, simply onload them, since they are unlikely to have a large number of direct parameters.

## Testing ##

* Added Big Code to the tracing tests
* Ran the Big Code example script to completion

<details><summary>example_script.py</summary>

```python3
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import oneshot

# Load model
model_stub = "ibm-granite/granite-20b-code-instruct-8k"
model_name = model_stub.split("/")[-1]

num_samples = 1024
max_seq_len = 8192

tokenizer = AutoTokenizer.from_pretrained(model_stub)
model = AutoModelForCausalLM.from_pretrained(
    model_stub,
    device_map="auto",
    torch_dtype="auto",
)


def preprocess_fn(example):
    return {"text": example["text"]}


ds = load_dataset("neuralmagic/LLM_compression_calibration", split="train")
ds = ds.map(preprocess_fn)


def tokenize(sample):
    return tokenizer(
        sample["text"],
        padding=False,
        truncation=False,
        add_special_tokens=True,
    )


ds = ds.map(tokenize, remove_columns=ds.column_names)

# Configure the quantization algorithm and scheme
recipe = GPTQModifier(
    targets="Linear",
    scheme="W4A16",
    ignore=["lm_head"],
    sequential_targets=["GPTBigCodeBlock"],
    dampening_frac=0.01,
)

# Apply quantization
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=max_seq_len,
    num_calibration_samples=num_samples,
)

# Save to disk in compressed-tensors format
save_path = model_name + "-quantized.w4a16"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print(f"Model and tokenizer saved to: {save_path}")
```

</details>

---------

Signed-off-by: Kyle Sayers <[email protected]>
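For illustration only (not part of this commit), here is a minimal sketch of how call graph ancestors with directly attached parameters might be detected. The helper name is hypothetical, and the module tree is used as a stand-in for the traced call graph:

```python
from typing import Set

import torch


def ancestors_with_direct_params(
    model: torch.nn.Module, sequential_targets: Set[str]
) -> Set[torch.nn.Module]:
    """Hypothetical helper: find modules that contain a sequential target
    somewhere below them (ancestors) and also own parameters directly,
    like GPTBigCodeModel and its bias."""
    ancestors = set()
    for module in model.modules():
        # does this module contain a sequential target somewhere below it?
        contains_target = any(
            submodule.__class__.__name__ in sequential_targets
            for submodule in module.modules()
            if submodule is not module
        )
        # recurse=False only yields parameters registered on this module itself
        has_direct_params = next(module.parameters(recurse=False), None) is not None
        if contains_target and has_direct_params:
            ancestors.add(module)
    return ancestors
```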
1 parent dddc2f8 commit e628e13

File tree

3 files changed: +17 −19 lines

src/llmcompressor/modifiers/awq/mappings.py

Lines changed: 3 additions & 12 deletions
```diff
@@ -133,18 +133,9 @@ class AWQMapping:
 ]
 
 _bloom_mappings = [
-    AWQMapping(
-        "re:.*input_layernorm$",
-        ["re:.*query_key_value$"]
-    ),
-    AWQMapping(
-        "re:.*post_attention_layernorm$",
-        ["re:.*dense_h_to_4h$"]
-    ),
-    AWQMapping(
-        "re:.*gelu_impl$",
-        ["re:.*dense_4h_to_h$"]
-    ),
+    AWQMapping("re:.*input_layernorm$", ["re:.*query_key_value$"]),
+    AWQMapping("re:.*post_attention_layernorm$", ["re:.*dense_h_to_4h$"]),
+    AWQMapping("re:.*gelu_impl$", ["re:.*dense_4h_to_h$"]),
     # Note: AutoAWQ excludes this mapping, based on researcher's post in
     # https://github.com/mit-han-lab/llm-awq/issues/2#issuecomment-1606297469
     # AWQMapping(
```

src/llmcompressor/pipelines/sequential/helpers.py

Lines changed: 6 additions & 6 deletions
```diff
@@ -5,6 +5,7 @@
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Set, Tuple
 
 import torch
+from accelerate.hooks import remove_hook_from_module
 from compressed_tensors.quantization import find_name_or_class_matches
 from compressed_tensors.utils import (
     has_offloaded_params,
@@ -177,13 +178,12 @@ def __init__(self, ancestors: Set[Module], offloaded: Set[Module]):
 
         # check unlikely case that ancestors have direct params which are offloaded
         offloaded_ancestors = offloaded & ancestors
-        if offloaded_ancestors:
-            names = set(module.__class__.__name__ for module in offloaded_ancestors)
+        for ancestor in offloaded_ancestors:
+            remove_hook_from_module(ancestor, recurse=False)
+            self.offloaded.remove(ancestor)
             logger.warning(
-                "The following modules are call graph ancestors of sequential targets,"
-                f"but also contain offloaded modules: {names}.\n"
-                "These modules will not be traced, and any sequential target children "
-                "will be executed jointly, which may lead to OOM errors"
+                f"Direct parameters attached to {ancestor.__class__.__name__} have "
+                "been onloaded in order to ensure safe graph capture and execution"
             )
 
     def create_arg(self, a: Any) -> Argument:
```
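For reference, a standalone sketch of the onloading policy above (assuming accelerate-style offloading hooks; the helper name and surrounding context are assumptions, not part of the codebase):

```python
from typing import Set

from accelerate.hooks import remove_hook_from_module
from loguru import logger
from torch.nn import Module


def onload_ancestor_params(ancestors: Set[Module], offloaded: Set[Module]) -> Set[Module]:
    """Hypothetical helper mirroring the change above: onload call graph
    ancestors that have offloaded direct parameters so graph capture and
    execution see real tensors."""
    for ancestor in offloaded & ancestors:
        # recurse=False leaves offloading in place for child modules (e.g. the
        # sequential targets), so only the ancestor's direct params are affected
        remove_hook_from_module(ancestor, recurse=False)
        logger.warning(
            f"Direct parameters attached to {ancestor.__class__.__name__} have "
            "been onloaded in order to ensure safe graph capture and execution"
        )
    # return the remaining offloaded modules
    return offloaded - ancestors
```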

tests/llmcompressor/transformers/tracing/test_models.py

Lines changed: 8 additions & 1 deletion
```diff
@@ -42,7 +42,14 @@
             "text",
             [],
         ),
-        ("RedHatAI/DeepSeek-V3-BF16", AutoModelForCausalLM, None, "text", []),
+        (
+            "ibm-granite/granite-20b-code-instruct-8k",
+            AutoModelForCausalLM,
+            None,
+            "text",
+            [],
+        ),
+        ("unsloth/DeepSeek-R1-0528-BF16", AutoModelForCausalLM, None, "text", []),
         # --- vision ---
         (
             "HuggingFaceM4/Idefics3-8B-Llama3",
```
