1
1
import os
2
2
import shutil
3
3
4
+ < << << << HEAD
4
5
< << << << HEAD
5
6
import pytest
6
7
== == == =
7
8
>> >> >> > 3 ee9283 (Support calibrating kv cache scales )
9
+ == == == =
10
+ import pytest
11
+ > >> >> >> 2739 d61 (Add Qwen test )
8
12
import safetensors .torch
9
13
from transformers import AutoTokenizer
10
14
11
15
from auto_fp8 import AutoFP8ForCausalLM , BaseQuantizeConfig
12
16
13
17
MODELS = [
18
+ < << << << HEAD
14
19
("facebook/opt-125m" , 160 ),
15
20
("Qwen/Qwen2-0.5B-Instruct" , 620 ),
16
21
]
@@ -24,6 +29,15 @@ def test_dynamic_quantization():
24
29
model_id = "facebook/opt-125m"
25
30
quantized_model_dir = "opt-125m-fp8-dynamic"
26
31
> >> >> >> 3 ee9283 (Support calibrating kv cache scales )
32
+ == == == =
33
+ "facebook/opt-125m" ,
34
+ "Qwen/Qwen2-0.5B-Instruct" ,
35
+ ]
36
+
37
+ @pytest .mark .parametrize ("model_id" , MODELS )
38
+ def test_dynamic_quantization (model_id ):
39
+ quantized_model_dir = model_id .split ("/" )[- 1 ] + "-fp8-dynamic"
40
+ > >> >> >> 2739 d61 (Add Qwen test )
27
41
28
42
quantize_config = BaseQuantizeConfig (
29
43
quant_method = "fp8" , activation_scheme = "dynamic"
@@ -54,10 +68,16 @@ def test_static_quantization(model_id, target_size):
54
68
assert model_size < target_size
55
69
56
70
71
+ << << << < HEAD
57
72
def test_static_quantization ():
58
73
model_id = "facebook/opt-125m"
59
74
quantized_model_dir = "opt-125m-fp8-static"
60
75
> >> >> >> 3 ee9283 (Support calibrating kv cache scales )
76
+ == == == =
77
+ @pytest .mark .parametrize ("model_id" , MODELS )
78
+ def test_static_quantization (model_id ):
79
+ quantized_model_dir = model_id .split ("/" )[- 1 ] + "-fp8-static"
80
+ > >> >> >> 2739 d61 (Add Qwen test )
61
81
62
82
tokenizer = AutoTokenizer .from_pretrained (model_id , use_fast = True )
63
83
examples = ["auto-fp8 is an easy-to-use model quantization library" ]
@@ -117,10 +137,9 @@ def test_kv_cache_static_quantization(model_id, target_size):
117
137
target_size = target_size * (1024 * 1024 )
118
138
assert model_size < target_size
119
139
120
-
121
- def test_kv_cache_static_quantization ():
122
- model_id = "facebook/opt-125m"
123
- quantized_model_dir = "opt-125m-fp8-static-kv"
140
+ @pytest .mark .parametrize ("model_id" , MODELS )
141
+ def test_kv_cache_static_quantization (model_id ):
142
+ quantized_model_dir = model_id .split ("/" )[- 1 ] + "-fp8-static-kv"
124
143
125
144
tokenizer = AutoTokenizer .from_pretrained (model_id , use_fast = True )
126
145
examples = ["auto-fp8 is an easy-to-use model quantization library" ]
0 commit comments