Commit f4e484d

[Performance] Sequential onloading (#1263)
# Sequential Onloading #

<p align="center"><img width="403" alt="Screenshot 2025-06-05 at 22 53 01" src="https://github.com/user-attachments/assets/ffd610ac-c511-4dc1-b858-b0ed2bf95193" /></p>

```
(25/33): Calibrating:   0%|          | 0/512 [00:00<?, ?it/s]
<class 'transformers.models.llama.modeling_llama.LlamaRMSNorm'>.weight -> cuda
<class 'torch.nn.modules.linear.Linear'>.weight -> cuda
<class 'torch.nn.modules.linear.Linear'>.weight_scale -> cuda
<class 'torch.nn.modules.linear.Linear'>.weight_zero_point -> cuda
...
(25/33): Calibrating: 100%|█████| 512/512 [00:23<00:00, 21.91it/s]
2025-06-03T17:29:15.536963-0400 | compress_modules | INFO - Quantizing model.layers.24.self_attn.q_proj using 512 samples
2025-06-03T17:29:17.328720-0400 | compress | METRIC - time 1.79s
2025-06-03T17:29:17.329265-0400 | compress | METRIC - error 8948.54
2025-06-03T17:29:17.329781-0400 | compress | METRIC - GPU 0 | usage: 5.41% | total memory: 85 GB
2025-06-03T17:29:17.330248-0400 | compress | METRIC - Compressed module size: 33.947648 MB
...
(25/33): Propagating: 100%|█████| 512/512 [00:03<00:00, 131.16it/s]
<class 'transformers.models.llama.modeling_llama.LlamaRMSNorm'>.weight -> meta
<class 'torch.nn.modules.linear.Linear'>.weight -> meta
<class 'torch.nn.modules.linear.Linear'>.weight_scale -> meta
<class 'torch.nn.modules.linear.Linear'>.weight_zero_point -> meta
...
```

## Purpose ##

* Reduce hardware requirements for calibrating large models
* Reduce runtime caused by excess device movement when calibrating offloaded models

## Prerequisites ##

* neuralmagic/compressed-tensors#354
* neuralmagic/compressed-tensors#355
* neuralmagic/compressed-tensors#356
* neuralmagic/compressed-tensors#357

## Related Issues ##

* Resolves #1383
* Resolves #1228
* Resolves #1122
* Resolves #1078
* Resolves #1216
* Resolves #1483

## Changes ##

* Keep layer parameters onloaded during the entire sequential calibration + compression + propagation step
  * This is achieved through the `keep_onload_context`, which disables offloading until the context is exited
* Dispatch the model within each calibration pipeline
  * The sequential pipeline offloads the model to CPU and executes on the first cuda device
* Deprecate passing `sequential_targets` via modifiers; instead, prefer passing it via the `oneshot` argument (see the sketch below)
* Use the sequential pipeline as the default pipeline (the basic pipeline is never used)
* Dispatch the model before sample generation
  * The model is dispatched exactly as it would be if it was loaded with `device_map="auto"`
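As a rough illustration of that deprecation (not taken from the diff itself), a minimal sketch of the preferred call; the `sequential_targets` keyword name and the `"LlamaDecoderLayer"` target are assumptions that mirror the deprecated modifier field, and the model and dataset are placeholders borrowed from the examples:

```python
from transformers import AutoModelForCausalLM

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier

# Model is loaded on CPU; the sequential pipeline handles device placement
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct", torch_dtype="auto"
)

# Previously: GPTQModifier(..., sequential_targets=[...]) -- now deprecated
recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])

oneshot(
    model=model,
    dataset="ultrachat_200k",
    recipe=recipe,
    max_seq_length=2048,
    num_calibration_samples=512,
    # preferred location for sequential targets (kwarg name assumed here)
    sequential_targets=["LlamaDecoderLayer"],
)
```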
### Examples ###

* Models are loaded onto CPU before oneshot (rather than being dispatched across GPUs)
* The model is reloaded from disk in order to re-dispatch onto the "auto" device map
  * In my opinion, this is a better flow anyway, since models can raise errors or take a very long time during generation, which can cause the entire compression job to go to waste
  * The alternative is to either call `accelerate.remove_hooks(model)` and `accelerate.dispatch_model(model)` before generating, or get rid of sample generation entirely. One of these may be required if `compressed_linear` isn't reliable enough to add to our examples

<details><summary>New example script</summary>

```python3
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import oneshot
from llmcompressor.utils.dev import dispatch_for_generation

# Load model (on cpu)
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")  # model is loaded on cpu
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Define recipe
recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])

# Apply oneshot (model execution device is set to cuda, model stays on cpu)
oneshot(
    model=model,
    dataset="ultrachat_200k",
    recipe=recipe,
    max_seq_length=2048,
    num_calibration_samples=512,
)

# Perform sample generation
print("\n\n")
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")

# Save to disk in compressed format
SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
```

</details>

## Testing ##

* Calibrated and GPTQ-compressed one layer of Deepseek-V3 with a single H100 in 50 seconds
  * A 4.5x improvement over the original 236 seconds
  * Peak memory of ~40 GB, which can be further reduced by increasing the granularity of sequential targets
* Not offloading activations did not result in a performance improvement
* TODO: Test that all example models can be reloaded and run

---------

Signed-off-by: Kyle Sayers <[email protected]>
Signed-off-by: Brian Dellabetta <[email protected]>
Co-authored-by: Brian Dellabetta <[email protected]>
1 parent ca00edd commit f4e484d

72 files changed: +346 −890 lines

examples/awq/README.md

Lines changed: 1 addition & 5 deletions

````diff
@@ -18,11 +18,7 @@ recipe = [
 To use your own model, start with an existing example change the `model_id` to match your own model stub.
 ```python
 model_id = "path/to/your/model"
-model = AutoModelForCausalLM.from_pretrained(
-    model_id,
-    device_map="auto",
-    torch_dtype="auto",
-)
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
 ```

 ## Adding Mappings ##
````

examples/awq/llama_example.py

Lines changed: 1 addition & 3 deletions

````diff
@@ -7,9 +7,7 @@
 # Select model and load it.
 MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID, device_map="auto", torch_dtype="auto"
-)
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

 # Select calibration dataset.
````

examples/awq/qwen3_moe_example.py

Lines changed: 3 additions & 3 deletions

````diff
@@ -3,13 +3,12 @@

 from llmcompressor import oneshot
 from llmcompressor.modifiers.awq import AWQModifier
+from llmcompressor.utils import dispatch_for_generation

 # Select model and load it.
 MODEL_ID = "Qwen/Qwen3-30B-A3B"

-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID, device_map="auto", torch_dtype="auto"
-)
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

 # Select calibration dataset.
@@ -71,6 +70,7 @@ def tokenize(sample):
 # Confirm generations of the quantized model look sane.
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
 input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
````
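Distilled from the hunks above, a minimal sketch of the new dispatch-before-generation pattern; the `oneshot` calibration step is elided, and the surrounding setup mirrors this example:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.utils import dispatch_for_generation

MODEL_ID = "Qwen/Qwen3-30B-A3B"
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")  # stays on CPU
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# ... oneshot(...) calibration would run here, with the model kept on CPU ...

# Dispatch across available GPUs (as if loaded with device_map="auto") before sampling
dispatch_for_generation(model)
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))
```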

examples/big_models_with_accelerate/README.md

Lines changed: 0 additions & 95 deletions
This file was deleted.

examples/big_models_with_accelerate/cpu_offloading_fp8.py

Lines changed: 0 additions & 26 deletions
This file was deleted.

examples/big_models_with_accelerate/mult_gpus_int8_device_map.py

Lines changed: 0 additions & 81 deletions
This file was deleted.

examples/big_models_with_accelerate/multi_gpu_int8.py

Lines changed: 0 additions & 78 deletions
This file was deleted.
Lines changed: 12 additions & 0 deletions (new file)

````diff
@@ -0,0 +1,12 @@
+## Big Modeling with Sequential Onloading ##
+### What is Sequential Onloading? ###
+Sequential onloading is a memory-efficient approach for compressing large language models (LLMs) using only a single GPU. Instead of loading the entire model into memory—which can easily require hundreds of gigabytes—this method loads and compresses one layer at a time. The outputs are offloaded before the next layer is processed, dramatically reducing peak memory usage while maintaining high compression fidelity.
+
+<p align="center">
+    <img src="assets/sequential_onloading.png"/>
+</p>
+
+For more information, see the [RedHat AI blog post](https://developers.redhat.com/articles/2025/05/09/llm-compressor-optimize-llms-low-latency-deployments#generalizing_to_multimodal_and_moe_architectures) or the [LLM Compressor Office Hours Recording](https://www.youtube.com/watch?v=GrhuqQDmBk8).
+
+### Using Sequential Onloading ###
+Sequential onloading is enabled by default within LLM Compressor. To disable sequential onloading, add the `pipeline="basic"` argument to the LLM Compressor `oneshot` function call.
````
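As a rough illustration of that last line, a minimal sketch of opting out of sequential onloading via `pipeline="basic"`; the model, recipe, and dataset are placeholders borrowed from the other examples:

```python
from transformers import AutoModelForCausalLM

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct", torch_dtype="auto"
)
recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])

# Sequential onloading is the default; pass pipeline="basic" to opt out
oneshot(
    model=model,
    dataset="ultrachat_200k",
    recipe=recipe,
    max_seq_length=2048,
    num_calibration_samples=512,
    pipeline="basic",
)
```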
assets/sequential_onloading.png — 69.5 KB (new image file)

examples/multimodal_audio/README.md

Lines changed: 1 addition & 5 deletions

````diff
@@ -21,11 +21,7 @@ This directory contains example scripts for quantizing a variety of audio langua
 To use your own multimodal modal, start with an existing example change the `model_id` to match your own model stub.
 ```python3
 model_id = "path/to/your/model"
-model = AutoModelForCausalLM.from_pretrained(
-    model_id,
-    device_map="auto",
-    torch_dtype="auto",
-)
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
 ```

 ## Customizing GPTQModifier Parameters ##
````
