8
8
from auto_fp8 import AutoFP8ForCausalLM , BaseQuantizeConfig
9
9
10
10
MODELS = [
11
- "facebook/opt-125m" ,
12
- "Qwen/Qwen2-0.5B-Instruct" ,
11
+ ( "facebook/opt-125m" , 160 ) ,
12
+ ( "Qwen/Qwen2-0.5B-Instruct" , 600 ) ,
13
13
]
14
14
15
- @pytest .mark .parametrize ("model_id" , MODELS )
16
- def test_dynamic_quantization (model_id ):
15
+ @pytest .mark .parametrize ("model_id,target_size " , MODELS )
16
+ def test_dynamic_quantization (model_id , target_size ):
17
17
quantized_model_dir = model_id .split ("/" )[- 1 ] + "-fp8-dynamic"
18
18
19
19
quantize_config = BaseQuantizeConfig (
@@ -30,13 +30,13 @@ def test_dynamic_quantization(model_id):
30
30
model_size = os .path .getsize (f"{ quantized_model_dir } /model.safetensors" )
31
31
shutil .rmtree (quantized_model_dir )
32
32
33
- # We expect the model to be < 160MB
34
- target_size = 160 * (1024 * 1024 )
33
+ # We expect the model to be smaller than the per-model target_size (given in MiB)
34
+ target_size = target_size * (1024 * 1024 )
35
35
assert model_size < target_size
36
36
37
37
38
- @pytest .mark .parametrize ("model_id" , MODELS )
39
- def test_static_quantization (model_id ):
38
+ @pytest .mark .parametrize ("model_id,target_size " , MODELS )
39
+ def test_static_quantization (model_id , target_size ):
40
40
quantized_model_dir = model_id .split ("/" )[- 1 ] + "-fp8-static"
41
41
42
42
tokenizer = AutoTokenizer .from_pretrained (model_id , use_fast = True )
@@ -56,11 +56,11 @@ def test_static_quantization(model_id):
56
56
shutil .rmtree (quantized_model_dir )
57
57
58
58
# We expect the model to be smaller than the per-model target_size (given in MiB)
59
- target_size = 160 * (1024 * 1024 )
59
+ target_size = target_size * (1024 * 1024 )
60
60
assert model_size < target_size
61
61
62
- @pytest .mark .parametrize ("model_id" , MODELS )
63
- def test_kv_cache_static_quantization (model_id ):
62
+ @pytest .mark .parametrize ("model_id,target_size " , MODELS )
63
+ def test_kv_cache_static_quantization (model_id , target_size ):
64
64
quantized_model_dir = model_id .split ("/" )[- 1 ] + "-fp8-static-kv"
65
65
66
66
tokenizer = AutoTokenizer .from_pretrained (model_id , use_fast = True )
@@ -94,5 +94,5 @@ def test_kv_cache_static_quantization(model_id):
94
94
shutil .rmtree (quantized_model_dir )
95
95
96
96
# We expect the model to be smaller than the per-model target_size (given in MiB)
97
- target_size = 160 * (1024 * 1024 )
97
+ target_size = target_size * (1024 * 1024 )
98
98
assert model_size < target_size
0 commit comments