Commit 6e6e748

test(quant): add e2e tests for w4a8 quantization old and new versions

Signed-off-by: Anionex <[email protected]>

1 parent: 4bc3a3c

File tree: 1 file changed (+29 −4 lines)

tests/e2e/multicard/test_offline_inference_distributed.py

Lines changed: 29 additions & 4 deletions
@@ -30,11 +30,20 @@
 from tests.e2e.conftest import VllmRunner
 
 os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
 QWEN_DENSE_MODELS = [
     "vllm-ascend/Qwen3-8B-W8A8", "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8"
 ]
 
+QWEN_W4A8_OLD_VERSION_MODELS = [
+    "vllm-ascend/Qwen3-8B-W4A8",
+]
+
+QWEN_W4A8_NEW_VERSION_MODELS = [
+    "Anionex/Qwen3-1.7B-W4A8-V1",
+]
+
 DEEPSEEK_W4A8_MODELS = [
     "vllm-ascend/DeepSeek-V3-W4A8-Pruing",
     "vllm-ascend/DeepSeek-V3.1-W4A8-puring"
@@ -98,20 +107,36 @@ def test_models_distributed_Qwen3_W8A8():
         vllm_model.generate_greedy(example_prompts, max_tokens)
 
 
-def test_models_distributed_Qwen3_W4A8DYNAMIC():
-    example_prompts = [
+@pytest.mark.parametrize("model", QWEN_W4A8_OLD_VERSION_MODELS)
+def test_models_distributed_W4A8DYNAMIC_old_version(model):
+    prompts = [
         "Hello, my name is",
     ]
     max_tokens = 5
+    with VllmRunner(
+            snapshot_download(model),
+            max_model_len=8192,
+            dtype="auto",
+            tensor_parallel_size=2,
+            quantization="ascend",
+    ) as vllm_model:
+        vllm_model.generate_greedy(prompts, max_tokens)
 
+
+@pytest.mark.parametrize("model", QWEN_W4A8_NEW_VERSION_MODELS)
+def test_models_distributed_W4A8DYNAMIC_new_version(model):
+    prompts = [
+        "Hello, my name is",
+    ]
+    max_tokens = 5
     with VllmRunner(
-            snapshot_download("vllm-ascend/Qwen3-8B-W4A8"),
+            snapshot_download(model),
             max_model_len=8192,
             dtype="auto",
             tensor_parallel_size=2,
             quantization="ascend",
     ) as vllm_model:
-        vllm_model.generate_greedy(example_prompts, max_tokens)
+        vllm_model.generate_greedy(prompts, max_tokens)
 
 
 @pytest.mark.parametrize("model", DEEPSEEK_W4A8_MODELS)
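For reference, a minimal way to run only the two new tests locally might look like the following, assuming the repository's standard pytest e2e setup and a host with at least two Ascend NPUs (the runners use tensor_parallel_size=2); the -k keyword expression is illustrative and not part of this commit:

    pytest -s tests/e2e/multicard/test_offline_inference_distributed.py -k "W4A8DYNAMIC"

The keyword matches both test_models_distributed_W4A8DYNAMIC_old_version and test_models_distributed_W4A8DYNAMIC_new_version.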
