8 files changed (+32, -71 lines)
File 1 of 8: AQLM model test

 """

 import pytest
-import torch

-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
-
-aqlm_not_supported = True
-
-if torch.cuda.is_available():
-    capability = torch.cuda.get_device_capability()
-    capability = capability[0] * 10 + capability[1]
-    aqlm_not_supported = (capability <
-                          QUANTIZATION_METHODS["aqlm"].get_min_capability())
+from tests.quantization.utils import is_quant_method_supported

 # In this test we hardcode prompts and generations for the model so we don't
 # need to require the AQLM package as a dependency

⋮

 ]


-@pytest.mark.skipif(aqlm_not_supported,
+@pytest.mark.skipif(not is_quant_method_supported("aqlm"),
                     reason="AQLM is not supported on this GPU type.")
 @pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"])
 @pytest.mark.parametrize("dtype", ["half"])
File 2 of 8: FP8 KV-cache model test

 import torch
 from transformers import AutoTokenizer

+from tests.quantization.utils import is_quant_method_supported
 from vllm import LLM, SamplingParams
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS

 os.environ["TOKENIZERS_PARALLELISM"] = "true"

⋮

 },
 }

-fp8_not_supported = True

-if torch.cuda.is_available():
-    capability = torch.cuda.get_device_capability()
-    capability = capability[0] * 10 + capability[1]
-    fp8_not_supported = (capability <
-                         QUANTIZATION_METHODS["fp8"].get_min_capability())
-
-
-@pytest.mark.skipif(fp8_not_supported,
+@pytest.mark.skipif(not is_quant_method_supported("fp8"),
                     reason="fp8 is not supported on this GPU type.")
 @pytest.mark.parametrize("model_name", MODELS)
 @pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
File 3 of 8: GPTQ Marlin model test

 import os

 import pytest
-import torch

-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from tests.quantization.utils import is_quant_method_supported
 from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT

 from .utils import check_logprobs_close

⋮

 MAX_MODEL_LEN = 1024

-gptq_marlin_not_supported = True
-
-if torch.cuda.is_available():
-    capability = torch.cuda.get_device_capability()
-    capability = capability[0] * 10 + capability[1]
-    gptq_marlin_not_supported = (
-        capability < QUANTIZATION_METHODS["gptq_marlin"].get_min_capability())
-
 MODELS = [
     # act_order==False, group_size=channelwise
     ("robertgshaw2/zephyr-7b-beta-channelwise-gptq", "main"),

⋮


 @pytest.mark.flaky(reruns=3)
-@pytest.mark.skipif(gptq_marlin_not_supported,
+@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
                     reason="gptq_marlin is not supported on this GPU type.")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half", "bfloat16"])
File 4 of 8: GPTQ Marlin 24 model test

 from dataclasses import dataclass

 import pytest
-import torch

 from tests.models.utils import check_logprobs_close
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
-
-marlin_not_supported = True
-
-if torch.cuda.is_available():
-    capability = torch.cuda.get_device_capability()
-    capability = capability[0] * 10 + capability[1]
-    marlin_not_supported = (
-        capability < QUANTIZATION_METHODS["marlin"].get_min_capability())
+from tests.quantization.utils import is_quant_method_supported


 @dataclass

@@ -47,7 +38,7 @@ class ModelPair:


 @pytest.mark.flaky(reruns=2)
-@pytest.mark.skipif(marlin_not_supported,
+@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin_24"),
                     reason="Marlin24 is not supported on this GPU type.")
 @pytest.mark.parametrize("model_pair", model_pairs)
 @pytest.mark.parametrize("dtype", ["half"])
File 5 of 8: Marlin model test

 from dataclasses import dataclass

 import pytest
-import torch

-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from tests.quantization.utils import is_quant_method_supported

 from .utils import check_logprobs_close

-marlin_not_supported = True
-
-if torch.cuda.is_available():
-    capability = torch.cuda.get_device_capability()
-    capability = capability[0] * 10 + capability[1]
-    marlin_not_supported = (
-        capability < QUANTIZATION_METHODS["marlin"].get_min_capability())
-

 @dataclass
 class ModelPair:

@@ -45,7 +36,7 @@ class ModelPair:


 @pytest.mark.flaky(reruns=2)
-@pytest.mark.skipif(marlin_not_supported,
+@pytest.mark.skipif(not is_quant_method_supported("marlin"),
                     reason="Marlin is not supported on this GPU type.")
 @pytest.mark.parametrize("model_pair", model_pairs)
 @pytest.mark.parametrize("dtype", ["half"])
File 6 of 8: bitsandbytes quantization test

 import pytest
 import torch

+from tests.quantization.utils import is_quant_method_supported
 from vllm import SamplingParams
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS

-capability = torch.cuda.get_device_capability()
-capability = capability[0] * 10 + capability[1]

-
-@pytest.mark.skipif(
-    capability < QUANTIZATION_METHODS['bitsandbytes'].get_min_capability(),
-    reason='bitsandbytes is not supported on this GPU type.')
+@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
+                    reason='bitsandbytes is not supported on this GPU type.')
 def test_load_bnb_model(vllm_runner) -> None:
     with vllm_runner('huggyllama/llama-7b',
                      quantization='bitsandbytes',
File 7 of 8: FP8 quantization test

 import pytest
 import torch

+from tests.quantization.utils import is_quant_method_supported
 from vllm._custom_ops import scaled_fp8_quant
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod

-capability = torch.cuda.get_device_capability()
-capability = capability[0] * 10 + capability[1]

-
-@pytest.mark.skipif(
-    capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
-    reason="FP8 is not supported on this GPU type.")
+@pytest.mark.skipif(not is_quant_method_supported("fp8"),
+                    reason="FP8 is not supported on this GPU type.")
 def test_load_fp16_model(vllm_runner) -> None:
     with vllm_runner("facebook/opt-125m", quantization="fp8") as llm:

@@ -25,9 +21,8 @@ def test_load_fp16_model(vllm_runner) -> None:
     assert fc1.weight.dtype == torch.float8_e4m3fn


-@pytest.mark.skipif(
-    capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
-    reason="FP8 is not supported on this GPU type.")
+@pytest.mark.skipif(not is_quant_method_supported("fp8"),
+                    reason="FP8 is not supported on this GPU type.")
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
 def test_scaled_fp8_quant(dtype) -> None:
File 8 of 8: new helper module, tests/quantization/utils.py

+import torch
+
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+
+
+def is_quant_method_supported(quant_method: str) -> bool:
+    # Currently, all quantization methods require Nvidia or AMD GPUs
+    if not torch.cuda.is_available():
+        return False
+
+    capability = torch.cuda.get_device_capability()
+    capability = capability[0] * 10 + capability[1]
+    return (capability >=
+            QUANTIZATION_METHODS[quant_method].get_min_capability())
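
For context, a minimal sketch of how a test consumes the new helper, following the same skipif pattern as the diffs above. The test name, prompt, and the use of the suite's vllm_runner fixture with its generate_greedy helper are illustrative assumptions, not part of this PR:

import pytest

from tests.quantization.utils import is_quant_method_supported


# Skip at collection time when the current GPU cannot run the requested
# quantization method (no CUDA device, or compute capability below the
# method's minimum).
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
                    reason="fp8 is not supported on this GPU type.")
def test_fp8_smoke(vllm_runner) -> None:
    # Illustrative body: load a small model with fp8 quantization and
    # generate a few greedy tokens.
    with vllm_runner("facebook/opt-125m", quantization="fp8") as llm:
        outputs = llm.generate_greedy(["Hello, my name is"], 8)
        assert outputs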