|
30 | 30 | from tests.e2e.conftest import VllmRunner
|
31 | 31 |
|
# Allocator tuning for the NPU runs in this module.
# NOTE(review): "max_split_size_mb:256" presumably limits block splitting to
# reduce fragmentation — confirm against the PyTorch NPU allocator docs.
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
# Start worker subprocesses with "spawn" rather than "fork".
# NOTE(review): likely needed so workers don't inherit an initialized device
# context from the parent — verify against vLLM's multiproc documentation.
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

# Dense Qwen checkpoints quantized as W8A8.
QWEN_DENSE_MODELS = [
    "vllm-ascend/Qwen3-8B-W8A8",
    "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8",
]

# W4A8-dynamic Qwen checkpoints in the older weight format.
QWEN_W4A8_OLD_VERSION_MODELS = ["vllm-ascend/Qwen3-8B-W4A8"]

# W4A8-dynamic Qwen checkpoints in the newer weight format.
QWEN_W4A8_NEW_VERSION_MODELS = ["Anionex/Qwen3-1.7B-W4A8-V1"]
| 46 | + |
38 | 47 | DEEPSEEK_W4A8_MODELS = [
|
39 | 48 | "vllm-ascend/DeepSeek-V3-W4A8-Pruing",
|
40 | 49 | "vllm-ascend/DeepSeek-V3.1-W4A8-puring"
|
@@ -98,20 +107,36 @@ def test_models_distributed_Qwen3_W8A8():
|
98 | 107 | vllm_model.generate_greedy(example_prompts, max_tokens)
|
99 | 108 |
|
100 | 109 |
|
@pytest.mark.parametrize("model", QWEN_W4A8_OLD_VERSION_MODELS)
def test_models_distributed_W4A8DYNAMIC_old_version(model):
    """Smoke-test greedy decoding for old-format W4A8-dynamic checkpoints.

    Downloads the checkpoint, brings it up under Ascend quantization with
    tensor parallelism of 2, and runs a short greedy generation to confirm
    the model loads and produces tokens without error.
    """
    example_prompts = ["Hello, my name is"]
    generation_budget = 5
    runner = VllmRunner(
        snapshot_download(model),
        max_model_len=8192,
        dtype="auto",
        tensor_parallel_size=2,
        quantization="ascend",
    )
    with runner as vllm_model:
        vllm_model.generate_greedy(example_prompts, generation_budget)
106 | 124 |
|
@pytest.mark.parametrize("model", QWEN_W4A8_NEW_VERSION_MODELS)
def test_models_distributed_W4A8DYNAMIC_new_version(model):
    """Smoke-test greedy decoding for new-format W4A8-dynamic checkpoints.

    Mirrors the old-version test: download the checkpoint, start it under
    Ascend quantization with tensor parallelism of 2, and run a short
    greedy generation to confirm basic end-to-end inference works.
    """
    example_prompts = ["Hello, my name is"]
    generation_budget = 5
    runner = VllmRunner(
        snapshot_download(model),
        max_model_len=8192,
        dtype="auto",
        tensor_parallel_size=2,
        quantization="ascend",
    )
    with runner as vllm_model:
        vllm_model.generate_greedy(example_prompts, generation_budget)
115 | 140 |
|
116 | 141 |
|
117 | 142 | @pytest.mark.parametrize("model", DEEPSEEK_W4A8_MODELS)
|
|
0 commit comments