
Commit a389d14 (1 parent: 8769b85)

big modeling example readme

Signed-off-by: Kyle Sayers <[email protected]>

6 files changed: +13 −186 lines

examples/big_models_with_accelerate/cpu_offloading_fp8.py

Lines changed: 0 additions & 26 deletions
This file was deleted.

examples/big_models_with_accelerate/mult_gpus_int8_device_map.py

Lines changed: 0 additions & 81 deletions
This file was deleted.

examples/big_models_with_accelerate/multi_gpu_int8.py

Lines changed: 0 additions & 78 deletions
This file was deleted.
examples/big_models_with_accelerate/README.md (new file)

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
## Big Modeling with Sequential Onloading ##

### What is Sequential Onloading? ###
Sequential onloading is a memory-efficient approach for compressing large language models (LLMs) using only a single GPU. Instead of loading the entire model into memory—which can easily require hundreds of gigabytes—this method loads and compresses one layer at a time. The outputs are offloaded before the next layer is processed, dramatically reducing peak memory usage while maintaining high compression fidelity.
<p align="center">
    <img src="assets/sequential_onloading.png"/>
</p>

For more information, see the [RedHat AI blog post](https://developers.redhat.com/articles/2025/05/09/llm-compressor-optimize-llms-low-latency-deployments#generalizing_to_multimodal_and_moe_architectures) or the [LLM Compressor Office Hours Recording](https://www.youtube.com/watch?v=GrhuqQDmBk8).
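To make the layerwise loop concrete, here is a minimal illustrative sketch. It is not LLM Compressor's internal implementation; `layers`, `batches`, and `compress_layer` are hypothetical stand-ins for the model's decoder layers, the calibration activations, and the quantization step:

```python
import torch

def compress_sequentially(layers, batches, compress_layer, device="cuda"):
    """Compress one layer at a time, keeping a single layer on the GPU."""
    for layer in layers:
        layer.to(device)  # onload only this layer to the GPU
        with torch.no_grad():
            # Propagate calibration activations through the layer; outputs
            # become the next layer's inputs and are offloaded to CPU
            # immediately, so peak GPU memory stays at one layer's worth.
            batches = [layer(x.to(device)).cpu() for x in batches]
        compress_layer(layer)  # quantize/compress this layer in place
        layer.to("cpu")        # offload before touching the next layer
    return batches
```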
10+
11+
### Using Sequential Onloading ###
12+
Sequential onloading is enabled by default within LLM Compressor. To disable sequential onloading, add the `pipeline="basic"` argument to the LLM Compressor `oneshot` function call.
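A minimal sketch of such a call; the model id, recipe, and dataset below are placeholder choices for illustration, and note that with `pipeline="basic"` the whole model must fit in device memory at once:

```python
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier

# Placeholder W4A16 GPTQ recipe; any oneshot recipe works the same way.
recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])

oneshot(
    model="meta-llama/Llama-3.3-70B-Instruct",
    dataset="open_platypus",
    recipe=recipe,
    max_seq_length=2048,
    num_calibration_samples=512,
    pipeline="basic",  # disables the default sequential-onloading pipeline
)
```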
examples/big_models_with_accelerate/assets/sequential_onloading.png (new image, 69.5 KB)

examples/quantization_w4a16/llama3_example.py

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
  from llmcompressor.utils.dev import dispatch_for_generation
 
  # Select model and load it.
- model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
+ model_id = "meta-llama/Llama-3.3-70B-Instruct"
  model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
  tokenizer = AutoTokenizer.from_pretrained(model_id)
