Draft
Changes from 3 commits
58 changes: 58 additions & 0 deletions docs/steps/choosing-algo.md
@@ -24,6 +24,64 @@ Weight and activation quantization is best for maximum throughput on modern hard
!!! note
AWQ and GPTQ are typically used for weight-only quantization but can also be applied to weight and activation quantization workflows.

### AWQ details

The AWQ recipe uses the `AWQModifier`, which adjusts model scales ahead of weight quantization:

```python
recipe = [
AWQModifier(ignore=["lm_head"], scheme="W4A16_ASYM", targets=["Linear"]),
]
```

AWQ requires layer mappings to identify where to apply activation-aware scaling. Mappings for common model families are built in, but you can supply your own via the `mappings` argument. For example, the Llama mapping looks like:

```python
[
AWQMapping("re:.*input_layernorm", ["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"]),
AWQMapping("re:.*v_proj", ["re:.*o_proj"]),
AWQMapping("re:.*post_attention_layernorm", ["re:.*gate_proj", "re:.*up_proj"]),
AWQMapping("re:.*up_proj", ["re:.*down_proj"]),
]
```

!!! note
Mappings define which layers get smoothed, while `targets` and `ignore` define which layers get quantized. A layer in the `ignore` list that is matched by a mapping will still be smoothed but not quantized.

To add support for a new model family, contribute mappings to the [mappings registry](/src/llmcompressor/modifiers/awq/mappings.py).

### AutoRound details

AutoRound introduces three trainable parameters (V, α, and β) to optimize rounding values and clipping ranges during quantization. It processes each decoder layer sequentially using block-wise output reconstruction error as the training objective.

**When to use AutoRound:**

- **INT4 for large models (≈30B+):** Performance comparable to other PTQ methods; accuracy drop is generally minimal at this scale.
- **INT4 for small-to-medium models:** Likely to deliver higher accuracy than other PTQ methods.
- **Sub-4-bit (INT2/INT3):** Shows 10–20% absolute accuracy improvements over PTQ methods, matching QAT performance at 1–2 orders of magnitude lower tuning cost.
- **New data types (MXFP4/NVFP4):** Consistently outperforms RTN in accuracy for emerging floating-point formats.

**Key parameters:**

| Parameter | Description | Default |
|-----------|-------------|---------|
| `scheme` | Quantization scheme (e.g. `W4A16`, `W8A16`) | — |
| `iters` | Tuning iterations per block | 200 |
| `batch_size` | Batch size for calibration | 8 |
| `lr` | Learning rate; auto-set to `1.0/iters` if `None` | `None` |

**Recommended configurations:**

| Mode | Batch Size | Iters | Seq Length | Samples | Speed | Memory | Accuracy |
|------|------------|-------|------------|---------|-------|--------|----------|
| `default` | 8 | 200 | 2048 | 128 | Fast | Medium | Good |
| `best` | 8 | 1000 | 2048 | 512 | Slow | High | Best |
| `light` | 8 | 50 | 2048 | 128 | Fastest | Medium | Slight drop |
| `fast` | 4 | 200 | 512 | 128 | Fastest | Low | Good |

!!! note
AutoRound currently supports WNA16, NVFP4, and W8A8-FP8 quantization schemes. Support for additional schemes is planned; follow progress in the [RFC](https://github.com/vllm-project/llm-compressor/issues/1968).

## KV cache and attention quantization

KV cache quantization reduces memory usage for long context inference:
80 changes: 0 additions & 80 deletions examples/autoround/README.md

This file was deleted.

47 changes: 0 additions & 47 deletions examples/awq/README.md

This file was deleted.

61 changes: 0 additions & 61 deletions examples/awq/RESULTS.md

This file was deleted.

80 changes: 6 additions & 74 deletions ...autoround/quantization_w4a4_fp4/README.md → ...odels/llama3/w4a4_fp4/autoround/README.md
100755 → 100644
@@ -4,29 +4,15 @@

AutoRound introduces three trainable parameters (V, α, and β) to optimize rounding values and clipping ranges during quantization. The method processes each decoder layer sequentially, using block-wise output reconstruction error as the training objective to fine-tune these parameters. This approach combines the efficiency of post-training quantization with the adaptability of parameter tuning, delivering robust compression for large language models while maintaining strong performance.

## Installation

To get started, install:

```bash
git clone https://github.com/vllm-project/llm-compressor.git
cd llm-compressor
pip install -e .
```

## Quickstart

The example includes end-to-end scripts for applying the AutoRound quantization algorithm.

### Llama 3.1 Example
## Llama 3.1 Example

```bash
python3 llama3.1_example.py
```

The resulting model `Meta-Llama-3.1-8B-Instruct-NVFP4-AutoRound` is ready to be loaded into vLLM.

#### Evaluate Accuracy
### Evaluate Accuracy

With the model created, we can now load and run it in vLLM (after installing vLLM).

@@ -47,86 +33,32 @@ lm_eval --model vllm \
--batch_size 'auto'
```

##### meta-llama/Meta-Llama-3.1-8B-Instruct
#### meta-llama/Meta-Llama-3.1-8B-Instruct
|Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr|
|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
|gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.7710|± |0.0116|
| | |strict-match | 5|exact_match|↑ |0.7043|± |0.0126|

##### Meta-Llama-3.1-8B-Instruct-NVFP4 (QuantizationModifier)
#### Meta-Llama-3.1-8B-Instruct-NVFP4 (QuantizationModifier)
|Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr|
|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
|gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.7248|± |0.0123|
| | |strict-match | 5|exact_match|↑ |0.6611|± |0.0130|


##### Meta-Llama-3.1-8B-Instruct-NVFP4-AutoRound (AutoRoundModifier, iters=0)
#### Meta-Llama-3.1-8B-Instruct-NVFP4-AutoRound (AutoRoundModifier, iters=0)
|Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr|
|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
|gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.7362|± |0.0121|
| | |strict-match | 5|exact_match|↑ |0.6702|± |0.0129|

##### Meta-Llama-3.1-8B-Instruct-NVFP4-AutoRound (AutoRoundModifier, iters=200)
#### Meta-Llama-3.1-8B-Instruct-NVFP4-AutoRound (AutoRoundModifier, iters=200)
|Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr|
|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
|gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.7210|± |0.0124|
| | |strict-match | 5|exact_match|↑ |0.6945|± |0.0127|

> Note: quantized model accuracy may vary slightly due to nondeterminism.

### Qwen3-VL Example

```bash
python3 qwen3_vl_example.py
```

The resulting model `Qwen3-VL-8B-Instruct-NVFP4-AutoRound` is ready to be loaded into vLLM.

#### Evaluate Accuracy

Run the following to test accuracy on GSM-8K and ChartQA:

```bash
lm_eval --model vllm-vlm \
--model_args pretrained="./Qwen3-VL-8B-Instruct-NVFP4-AutoRound",add_bos_token=true \
--tasks gsm8k \
--num_fewshot 5 \
--batch_size 'auto'

lm_eval --model vllm-vlm \
--model_args pretrained="./Qwen3-VL-8B-Instruct-NVFP4-AutoRound",add_bos_token=true \
--tasks chartqa \
--batch_size 'auto' \
--apply_chat_template
```

##### Qwen/Qwen3-VL-8B-Instruct (Baseline)
|Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr|
|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
|gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.8628|± |0.0095|
| | |strict-match | 5|exact_match|↑ |0.8453|± |0.0100|

| Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
|-------|------:|------|-----:|-----------------|---|-----:|---|-----:|
|chartqa| 0|none | 0|anywhere_accuracy|↑ |0.7908|± |0.0081|
| | |none | 0|exact_match |↑ |0.5592|± |0.0099|
| | |none | 0|relaxed_accuracy |↑ |0.7696|± |0.0084|


##### Qwen3-VL-8B-Instruct-NVFP4-AutoRound (AutoRoundModifier, iters=200)
|Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr|
|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
|gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.8415|± |0.0101|
| | |strict-match | 5|exact_match|↑ |0.8408|± |0.0101|

| Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
|-------|------:|------|-----:|-----------------|---|-----:|---|-----:|
|chartqa| 0|none | 0|anywhere_accuracy|↑ |0.8220|± |0.0077|
| | |none | 0|exact_match |↑ |0.5748|± |0.0099|
| | |none | 0|relaxed_accuracy |↑ |0.8044|± |0.0079|

> Note: quantized model accuracy may vary slightly due to nondeterminism.

### Questions or Feature Requests?

Please open up an issue on [vllm-project/llm-compressor](https://github.com/vllm-project/llm-compressor) or [intel/auto-round](https://github.com/intel/auto-round).