and [Intel/DeepSeek-R1-0528-q2ks-mixed-AutoRound](https://huggingface.co/Intel/DeepSeek-R1-0528-q2ks-mixed-AutoRound). **A more advanced algorithm** tailored for specific configurations may be available in
v0.7.1.

[2025/05] AutoRound has been integrated into **vLLM**. You can now run models in the AutoRound format directly with
vLLM versions later than v0.8.5.post1.
Please change to `auto-round-mllm` for visual-language models (VLMs) quantization.

```bash
auto-round \
    --model Qwen/Qwen3-0.6B \
    --scheme "W4A16" \
    --format "auto_round" \
    --output_dir ./tmp_autoround
```
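To make the scheme concrete: a `W4A16` recipe with group size 128 stores each block of 128 weights as 4-bit integers plus one scale and zero point per group, while activations stay in 16-bit floats. The sketch below is a plain round-to-nearest illustration of that storage layout, not AutoRound's tuned algorithm, and the helper functions are hypothetical names invented for this example.

```python
# Illustrative sketch of W4A16 group-wise storage (NOT AutoRound's algorithm):
# round-to-nearest asymmetric 4-bit quantization with one scale/zero per group.
import random

def quantize_group(weights, bits=4):
    """Quantize one group of float weights to asymmetric ints + scale/zero."""
    qmax = (1 << bits) - 1                      # 15 for 4-bit
    lo, hi = min(weights), max(weights)
    scale = (hi - lo) / qmax or 1.0             # guard against a flat group
    zero = round(-lo / scale)                   # integer zero point
    q = [max(0, min(qmax, round(w / scale) + zero)) for w in weights]
    return q, scale, zero

def dequantize_group(q, scale, zero):
    return [(v - zero) * scale for v in q]

# One 128-element group, matching --group_size 128
random.seed(0)
w = [random.gauss(0, 0.02) for _ in range(128)]
q, s, z = quantize_group(w)
w_hat = dequantize_group(q, s, z)
err = max(abs(a - b) for a, b in zip(w, w_hat))
assert all(0 <= v <= 15 for v in q)             # every weight fits in 4 bits
assert err <= s                                 # error bounded by one quant step
```

Per-group scales keep the worst-case reconstruction error proportional to each group's own value range, which is why smaller group sizes generally trade more stored metadata for better accuracy.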

We offer two additional recipes, `auto-round-best` and `auto-round-light`, designed for optimal accuracy and improved speed, respectively. Details are as follows.

<details>
<summary>Other Recipes</summary>

```bash
# Best accuracy, 3X slower; --low_gpu_mem_usage saves ~20 GB but is ~30% slower
auto-round-best \
    --model Qwen/Qwen3-0.6B \
    --scheme "W4A16" \
    --low_gpu_mem_usage
```

```bash
# 2-3X speedup; slight accuracy drop at W4, larger drop at W2
auto-round-light \
    --model Qwen/Qwen3-0.6B \
    --scheme "W4A16"
```

<!-- ```bash
# Fast and low memory, 2-3X speedup, slight accuracy drop at W4G128
auto-round-fast \
    --model Qwen/Qwen3-0.6B \
    --bits 4 \
    --group_size 128 \
ar = AutoRound(model_name_or_path, scheme="W4A16")

# Faster quantization (2–3× speedup) with slight accuracy drop at W4G128.
# ar = AutoRound(model_name_or_path, nsamples=128, iters=50, lr=5e-3)

# Supported formats: "auto_round" (default), "auto_gptq", "auto_awq", "llm_compressor", "gguf:q4_k_m", etc.
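For intuition on why tuned rounding can beat plain round-to-nearest, the toy sketch below brute-forces the per-weight round-up/round-down choice to minimize a layer's output error on a single calibration sample. This only sketches the idea: the actual library learns a continuous rounding offset with signed gradient descent over calibration data, and none of these helper names come from its API.

```python
# Toy sketch: choosing rounding directions to minimize output error,
# rather than rounding each weight to its nearest grid point in isolation.
import itertools, math, random

random.seed(1)
s = 0.05                                            # fixed quantization step
w = [random.uniform(-0.5, 0.5) for _ in range(8)]   # one weight row
x = [random.uniform(-1.0, 1.0) for _ in range(8)]   # one calibration sample

def layer_err(q):
    """|x . w - x . (s*q)|: output error of the quantized row on sample x."""
    return abs(sum(xi * (wi - s * qi) for xi, wi, qi in zip(x, w, q)))

# Round-to-nearest baseline
q_rtn = [round(wi / s) for wi in w]

# "Tuned" rounding: pick floor or ceil per weight to minimize the OUTPUT error
best_err, best_q = min(
    (layer_err(q), q)
    for q in itertools.product(*[(math.floor(wi / s), math.ceil(wi / s)) for wi in w])
)
# RTN is one of the candidate choices, so tuned rounding is never worse
assert best_err <= layer_err(q_rtn)
```

Because round-to-nearest is always among the candidate up/down assignments, optimizing the choice over calibration data can only reduce the layer's output error, which is the intuition behind tuning the rounding instead of fixing it.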