 from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig

 MODELS = [
+<<<<<<< HEAD
 <<<<<<< HEAD
     ("facebook/opt-125m", 160),
     ("Qwen/Qwen2-0.5B-Instruct", 620),
@@ -32,10 +33,14 @@ def test_dynamic_quantization():
 =======
     "facebook/opt-125m",
     "Qwen/Qwen2-0.5B-Instruct",
+=======
+    ("facebook/opt-125m", 160),
+    ("Qwen/Qwen2-0.5B-Instruct", 600),
+>>>>>>> 415c0b7 (Add fixed target sizes)
 ]

-@pytest.mark.parametrize("model_id", MODELS)
-def test_dynamic_quantization(model_id):
+@pytest.mark.parametrize("model_id,target_size", MODELS)
+def test_dynamic_quantization(model_id, target_size):
     quantized_model_dir = model_id.split("/")[-1] + "-fp8-dynamic"
 >>>>>>> 2739d61 (Add Qwen test)

@@ -53,6 +58,7 @@ def test_dynamic_quantization(model_id):
     model_size = os.path.getsize(f"{quantized_model_dir}/model.safetensors")
     shutil.rmtree(quantized_model_dir)

+<<<<<<< HEAD
 <<<<<<< HEAD
     # We expect the quantized model to be a certain size
     target_size = target_size * (1024 * 1024)
@@ -76,6 +82,15 @@ def test_static_quantization():
 =======
 @pytest.mark.parametrize("model_id", MODELS)
 def test_static_quantization(model_id):
+=======
+    # We expect the model to be a certain size
+    target_size = target_size * (1024 * 1024)
+    assert model_size < target_size
+
+
+@pytest.mark.parametrize("model_id,target_size", MODELS)
+def test_static_quantization(model_id, target_size):
+>>>>>>> 415c0b7 (Add fixed target sizes)
     quantized_model_dir = model_id.split("/")[-1] + "-fp8-static"
 >>>>>>> 2739d61 (Add Qwen test)

@@ -95,6 +110,7 @@ def test_static_quantization(model_id):
     model_size = os.path.getsize(f"{quantized_model_dir}/model.safetensors")
     shutil.rmtree(quantized_model_dir)

+<<<<<<< HEAD
     # We expect the quantized model to be a certain size
     target_size = target_size * (1024 * 1024)
     assert model_size < target_size
@@ -134,11 +150,14 @@ def test_kv_cache_static_quantization(model_id, target_size):
     shutil.rmtree(quantized_model_dir)

     # We expect the quantized model to be a certain size
+=======
+    # We expect the model to be < 160MB
+>>>>>>> 415c0b7 (Add fixed target sizes)
     target_size = target_size * (1024 * 1024)
     assert model_size < target_size

-@pytest.mark.parametrize("model_id", MODELS)
-def test_kv_cache_static_quantization(model_id):
+@pytest.mark.parametrize("model_id,target_size", MODELS)
+def test_kv_cache_static_quantization(model_id, target_size):
     quantized_model_dir = model_id.split("/")[-1] + "-fp8-static-kv"

     tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
@@ -172,5 +191,5 @@ def test_kv_cache_static_quantization(model_id):
     shutil.rmtree(quantized_model_dir)

     # We expect the model to be < 160MB
-    target_size = 160 * (1024 * 1024)
+    target_size = target_size * (1024 * 1024)
     assert model_size < target_size
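
For reference, below is a minimal sketch of what test_dynamic_quantization looks like once both conflict layers are resolved in favor of the tuple-based MODELS list (taking HEAD's 620 MB Qwen budget; the 415c0b7 side uses 600). The diff elides the quantize-and-save middle of the test, so the BaseQuantizeConfig(...), AutoFP8ForCausalLM.from_pretrained(...), model.quantize(...), and model.save_quantized(...) calls here follow AutoFP8's documented usage and are assumptions, not lines from this diff:

import os
import shutil

import pytest
from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig

# (model_id, size budget in MB) pairs, matching the resolved MODELS list.
MODELS = [
    ("facebook/opt-125m", 160),
    ("Qwen/Qwen2-0.5B-Instruct", 620),
]


@pytest.mark.parametrize("model_id,target_size", MODELS)
def test_dynamic_quantization(model_id, target_size):
    quantized_model_dir = model_id.split("/")[-1] + "-fp8-dynamic"

    # Assumed middle section (elided by the diff): quantize with dynamic
    # activation scales and write model.safetensors to disk.
    quantize_config = BaseQuantizeConfig(
        quant_method="fp8", activation_scheme="dynamic"
    )
    model = AutoFP8ForCausalLM.from_pretrained(model_id, quantize_config)
    model.quantize([])  # dynamic scheme needs no calibration examples
    model.save_quantized(quantized_model_dir)

    # Shown in the diff: measure the checkpoint, clean up, and compare
    # against the per-model budget, converting MB to bytes.
    model_size = os.path.getsize(f"{quantized_model_dir}/model.safetensors")
    shutil.rmtree(quantized_model_dir)

    target_size = target_size * (1024 * 1024)
    assert model_size < target_size

Carrying the budget in the parametrize tuple keeps the assertion identical across all three tests and removes the hard-coded 160 * (1024 * 1024) that the last hunk deletes.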