Commit a64a777

use remove_dispatch
Signed-off-by: Kyle Sayers <[email protected]>
1 parent 06bb661 · commit a64a777

4 files changed: 15 additions, 12 deletions


src/llmcompressor/entrypoints/README.md
Lines changed: 2 additions & 7 deletions

@@ -29,9 +29,7 @@ from llmcompressor.modifiers.quantization import QuantizationModifier
 MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 
 # Load the model
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID, device_map="auto", torch_dtype="auto"
-)
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
 # Load the tokenizer
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

@@ -204,9 +202,7 @@ MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 oneshot_output_dir = "./oneshot_model"
 
 # Load the model
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID, device_map="auto", torch_dtype="auto"
-)
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
 # Load the tokenizer
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

@@ -226,7 +222,6 @@ from llmcompressor import create_session, train
 # Student model
 model = AutoModelForCausalLM.from_pretrained(
     oneshot_output_dir,
-    device_map="auto",
     quantization_config=CompressedTensorsConfig(run_compressed=False),
 )
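Both README snippets drop device_map="auto" from from_pretrained: the model is now loaded undispatched, and llmcompressor handles device placement itself. A minimal sketch of how the updated loading pattern fits into a oneshot run; the recipe and output path shown here are illustrative, not part of this diff:

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from transformers import AutoModelForCausalLM

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

# Load undispatched; device placement is no longer the caller's job
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")

# Illustrative recipe; oneshot dispatches the model internally
recipe = QuantizationModifier(targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
oneshot(model=model, recipe=recipe, output_dir="./oneshot_model")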

src/llmcompressor/entrypoints/utils.py
Lines changed: 4 additions & 0 deletions

@@ -3,6 +3,7 @@
 from pathlib import PosixPath
 from typing import Optional, Tuple
 
+from compressed_tensors.utils import remove_dispatch
 from loguru import logger
 from torch.nn import Module
 from transformers import (

@@ -84,6 +85,9 @@ def post_process(
     Raises:
         ValueError: If saving fails due to an invalid `output_dir` or other issues.
     """
+    # remove any existing dispatches
+    remove_dispatch(model_args.model)
+
     if model_args is not None and output_dir is not None:
         if recipe_args is not None and getattr(recipe_args, "stage", None) is not None:
             output_dir = os.path.join(output_dir, recipe_args.stage)
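With this hunk, post_process tears down any existing dispatch before saving, so checkpoints are written from an un-dispatched model. remove_dispatch comes from compressed-tensors; as a rough sketch, it stands in for the accelerate-level teardown this commit removes elsewhere (this is not the compressed-tensors implementation, just the behavior being replaced):

from accelerate.hooks import remove_hook_from_module
from torch.nn import Module

def remove_dispatch_equivalent(model: Module) -> Module:
    # Recursively strip accelerate dispatch hooks (e.g. AlignDevicesHook)
    # so the model returns to a plain, un-dispatched state before saving
    # or re-dispatching
    remove_hook_from_module(model, recurse=True)
    return model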

src/llmcompressor/pipelines/sequential/helpers.py
Lines changed: 6 additions & 3 deletions

@@ -5,9 +5,12 @@
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set
 
 import torch
-from accelerate.hooks import remove_hook_from_module
 from compressed_tensors.quantization import find_name_or_class_matches
-from compressed_tensors.utils import has_offloaded_params, offloaded_dispatch
+from compressed_tensors.utils import (
+    has_offloaded_params,
+    offloaded_dispatch,
+    remove_dispatch,
+)
 from loguru import logger
 from torch.fx import Graph, GraphModule, Node
 from torch.fx.graph import PythonCode

@@ -520,7 +523,7 @@ def dispatch_for_sequential(model: PreTrainedModel) -> PreTrainedModel:
     :param model: model to dispatch
     :return: dispatched model
     """
-    remove_hook_from_module(model, recurse=True)
+    remove_dispatch(model)
 
     if torch.cuda.is_available():
         offloaded_dispatch(model, execution_device=torch.device("cuda:0"))
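Putting the hunk together, dispatch_for_sequential now resets placement with remove_dispatch before re-dispatching with CPU offloading onto a single GPU. A sketch reconstructed from the context above; the behavior when CUDA is unavailable is not shown in the hunk, so falling through to return the undispatched model is an assumption:

import torch
from compressed_tensors.utils import offloaded_dispatch, remove_dispatch
from transformers import PreTrainedModel

def dispatch_for_sequential(model: PreTrainedModel) -> PreTrainedModel:
    # Clear any existing dispatch (previously remove_hook_from_module)
    remove_dispatch(model)

    if torch.cuda.is_available():
        # Keep weights offloaded, executing each layer on the first GPU
        offloaded_dispatch(model, execution_device=torch.device("cuda:0"))

    # Assumption: without CUDA, the model is simply left undispatched
    return model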

src/llmcompressor/utils/dev.py
Lines changed: 3 additions & 2 deletions

@@ -6,8 +6,8 @@
 
 import torch
 from accelerate import dispatch_model, infer_auto_device_map
-from accelerate.hooks import remove_hook_from_module
 from accelerate.utils import get_balanced_memory
+from compressed_tensors.utils import remove_dispatch
 from huggingface_hub import snapshot_download
 from safetensors.torch import save_file
 from transformers import AutoModelForCausalLM, PreTrainedModel

@@ -124,7 +124,8 @@ def dispatch_for_generation(model: PreTrainedModel) -> PreTrainedModel:
     :param model: model to dispatch
     :return: model which is dispatched
     """
-    remove_hook_from_module(model, recurse=True)
+    remove_dispatch(model)
+
     max_memory = get_balanced_memory(
         model,
         dtype=model.dtype,
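dispatch_for_generation gets the same treatment: remove_dispatch replaces the direct accelerate call before a fresh device map is computed. The hunk ends inside the get_balanced_memory call, so everything after it in this sketch (inferring a device map and dispatching) is an assumption based on the dispatch_model and infer_auto_device_map imports at the top of the file:

from accelerate import dispatch_model, infer_auto_device_map
from accelerate.utils import get_balanced_memory
from compressed_tensors.utils import remove_dispatch
from transformers import PreTrainedModel

def dispatch_for_generation(model: PreTrainedModel) -> PreTrainedModel:
    # Tear down any previous dispatch before computing a fresh device map
    remove_dispatch(model)

    # Balance the model's memory footprint across available devices
    max_memory = get_balanced_memory(model, dtype=model.dtype)

    # Assumption: the remainder is not shown in the hunk; a typical
    # accelerate flow infers a device map and dispatches with it
    device_map = infer_auto_device_map(model, max_memory=max_memory, dtype=model.dtype)
    return dispatch_model(model, device_map=device_map)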
