1
1
import os
2
2
import shutil
3
3
4
+ import pytest
4
5
import safetensors .torch
5
6
from transformers import AutoTokenizer
6
7
7
8
from auto_fp8 import AutoFP8ForCausalLM , BaseQuantizeConfig
8
9
10
# Model IDs exercised by the parametrized quantization tests below
# (each test derives its output dir from the part after the final "/").
MODELS = [
    "facebook/opt-125m",
    "Qwen/Qwen2-0.5B-Instruct",
]
9
14
10
- def test_dynamic_quantization ():
11
- model_id = "facebook/opt-125m"
12
- quantized_model_dir = "opt-125m -fp8-dynamic"
15
+ @ pytest . mark . parametrize ( "model_id" , MODELS )
16
+ def test_dynamic_quantization ( model_id ):
17
+ quantized_model_dir = model_id . split ( "/" )[ - 1 ] + " -fp8-dynamic"
13
18
14
19
quantize_config = BaseQuantizeConfig (
15
20
quant_method = "fp8" , activation_scheme = "dynamic"
@@ -30,9 +35,9 @@ def test_dynamic_quantization():
30
35
assert model_size < target_size
31
36
32
37
33
- def test_static_quantization ():
34
- model_id = "facebook/opt-125m"
35
- quantized_model_dir = "opt-125m -fp8-static"
38
+ @ pytest . mark . parametrize ( "model_id" , MODELS )
39
+ def test_static_quantization ( model_id ):
40
+ quantized_model_dir = model_id . split ( "/" )[ - 1 ] + " -fp8-static"
36
41
37
42
tokenizer = AutoTokenizer .from_pretrained (model_id , use_fast = True )
38
43
examples = ["auto-fp8 is an easy-to-use model quantization library" ]
@@ -54,10 +59,9 @@ def test_static_quantization():
54
59
target_size = 160 * (1024 * 1024 )
55
60
assert model_size < target_size
56
61
57
-
58
- def test_kv_cache_static_quantization ():
59
- model_id = "facebook/opt-125m"
60
- quantized_model_dir = "opt-125m-fp8-static-kv"
62
+ @pytest .mark .parametrize ("model_id" , MODELS )
63
+ def test_kv_cache_static_quantization (model_id ):
64
+ quantized_model_dir = model_id .split ("/" )[- 1 ] + "-fp8-static-kv"
61
65
62
66
tokenizer = AutoTokenizer .from_pretrained (model_id , use_fast = True )
63
67
examples = ["auto-fp8 is an easy-to-use model quantization library" ]
0 commit comments