# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests whether TPU Int8 computation is enabled correctly.

Run `pytest tests/quantization/test_tpu_int8.py`.
"""
import pytest

from vllm.model_executor.layers.linear import LinearBase
from vllm.model_executor.layers.quantization.tpu_int8 import (
    TPUInt8LinearMethod)
from vllm.platforms import current_platform

from ...models.registry import HF_EXAMPLE_MODELS

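# A single small instruct model keeps TPU compilation and generation fast.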
MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"]


@pytest.mark.skipif(not current_platform.is_tpu(),
                    reason="TPU Int8 is only enabled for TPUs.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [10])
@pytest.mark.parametrize(
    "hf_overrides",
    [
        # w8a8 dynamic activation
        {
            'quantization_config': {
                'quant_method': 'tpu_int8',
                'activation_scheme': 'dynamic'
            }
        }
    ])
def test_model_tpu_int8(vllm_runner, model: str, dtype: str, max_tokens: int,
                        hf_overrides: dict, monkeypatch) -> None:
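    # hf_overrides injects this quantization_config as if it were part of the
    # model's Hugging Face config, steering vLLM to the tpu_int8 quant method.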
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    model_info.check_transformers_version(on_fail="skip")

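    # Derive the activation-quantization flag we expect the engine to pick up
    # from the override ('dynamic' -> quantize activations at runtime).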
    activation_scheme = hf_overrides.get('quantization_config',
                                         {}).get('activation_scheme')
    quantize_activation = activation_scheme == 'dynamic'

    # Disable V1 multiprocessing so apply_model can reach the in-process model.
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
    # Prevent errors from re-initializing the XLA compilation cache.
    monkeypatch.setenv("VLLM_XLA_CACHE_PATH", "")

    prompts = [
        "A robot may not injure a human being",
        "It is only with the heart that one can see rightly;",
        "The greatest glory in living lies not in never falling,",
    ]
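    # Greedy continuations the quantized model is expected to produce for the
    # prompts above.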
    answers = [
        "or, being injured, not kill, except in",
        "without the heart, one can only see wrongly.",
        "but in rising every time we fall. - Nelson",
    ]

    with vllm_runner(model, dtype=dtype, hf_overrides=hf_overrides) as vllm:

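        # Every linear layer should now use TPUInt8LinearMethod with the
        # activation-quantization setting requested via hf_overrides.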
        def check_model(model):
            for name, module in model.named_modules():
                if not isinstance(module, LinearBase):
                    continue
                quant_method = module.quant_method
                assert isinstance(quant_method, TPUInt8LinearMethod)
                assert quant_method.quantize_activation == quantize_activation

        vllm.apply_model(check_model)
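        # End-to-end sanity check: int8 generation should still produce the
        # expected greedy continuations.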
        outputs = vllm.generate_greedy(prompts, max_tokens)
        for (_, output), answer in zip(outputs, answers):
            assert answer in output