docs/gptq at master · 22dimensions/docs · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
[ERROR] 2025-07-14-13:46:35 (PID:2353055, Device:-1, RankID:-1) ERR99999 UNKNOWN applicaiton exception
root@adc8197e07af:~# python3 qwen3.py
INFO 07-14 13:48:23 [importing.py:17] Triton not installed or not compatible; certain GPU-related functions will not be available.
WARNING 07-14 13:48:23 [importing.py:29] Triton is not installed. Using dummy decorators. Install it via `pip install triton` to enable kernel compilation.
INFO 07-14 13:48:24 [__init__.py:39] Available plugins for group vllm.platform_plugins:
INFO 07-14 13:48:24 [__init__.py:41] - ascend -> vllm_ascend:register
INFO 07-14 13:48:24 [__init__.py:44] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load.
INFO 07-14 13:48:24 [__init__.py:235] Platform plugin ascend is activated
WARNING 07-14 13:48:28 [_custom_ops.py:22] Failed to import from vllm._C with ModuleNotFoundError("No module named 'vllm._C'")
WARNING 07-14 13:48:32 [registry.py:401] Model architecture DeepSeekMTPModel is already registered, and will be overwritten by the new model class vllm_ascend.models.deepseek_mtp:CustomDeepSeekMTP.
WARNING 07-14 13:48:32 [registry.py:401] Model architecture Qwen2VLForConditionalGeneration is already registered, and will be overwritten by the new model class vllm_ascend.models.qwen2_vl:AscendQwen2VLForConditionalGeneration.
WARNING 07-14 13:48:32 [registry.py:401] Model architecture Qwen2_5_VLForConditionalGeneration is already registered, and will be overwritten by the new model class vllm_ascend.models.qwen2_5_vl:AscendQwen2_5_VLForConditionalGeneration.
WARNING 07-14 13:48:32 [registry.py:401] Model architecture DeepseekV2ForCausalLM is already registered, and will be overwritten by the new model class vllm_ascend.models.deepseek_v2:CustomDeepseekV2ForCausalLM.
WARNING 07-14 13:48:32 [registry.py:401] Model architecture DeepseekV3ForCausalLM is already registered, and will be overwritten by the new model class vllm_ascend.models.deepseek_v2:CustomDeepseekV3ForCausalLM.
WARNING 07-14 13:48:32 [registry.py:401] Model architecture Qwen3MoeForCausalLM is already registered, and will be overwritten by the new model class vllm_ascend.models.qwen3_moe:CustomQwen3MoeForCausalLM.
INFO 07-14 13:48:50 [config.py:823] This model supports multiple tasks: {'embed', 'generate', 'classify', 'score', 'reward'}. Defaulting to 'generate'.
WARNING 07-14 13:48:50 [config.py:931] ascend quantization is not fully optimized yet. The speed can be slower than non-quantized models.
INFO 07-14 13:48:50 [arg_utils.py:1653] npu is experimental on VLLM_USE_V1=1. Falling back to V0 Engine.
INFO 07-14 13:48:50 [config.py:1980] Disabled the custom all-reduce kernel because it is not supported on current platform.
INFO 07-14 13:48:50 [platform.py:168] Compilation disabled, using eager mode by default
INFO 07-14 13:48:50 [llm_engine.py:230] Initializing a V0 LLM engine (v0.9.1) with config: model='/root/.cache/modelscope/hub/models/Qwen/Qwen3-0.6B-GPTQ-Int8', speculative_config=None, tokenizer='/root/.cache/modelscope/hub/models/Qwen/Qwen3-0.6B-GPTQ-Int8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=True, quantization=ascend, enforce_eager=False, kv_cache_dtype=auto,  device_config=npu, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=/root/.cache/modelscope/hub/models/Qwen/Qwen3-0.6B-GPTQ-Int8, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=None, chunked_prefill_enabled=False, use_async_output_proc=True, pooler_config=None, compilation_config={"level":0,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":[],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{},"inductor_passes":{},"use_cudagraph":true,"cudagraph_num_of_warmups":0,"cudagraph_capture_sizes":[256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"max_capture_size":256,"local_cache_dir":null}, use_cached_outputs=False,
WARNING 07-14 13:48:51 [utils.py:2737] Methods add_prompt_adapter,cache_config,compilation_config,current_platform,list_prompt_adapters,load_config,pin_prompt_adapter,remove_prompt_adapter not implemented in <vllm_ascend.worker.worker.NPUWorker object at 0xfffdae2b5480>
INFO 07-14 13:48:57 [parallel_state.py:1065] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
INFO 07-14 13:48:57 [model_runner.py:995] Starting to load model /root/.cache/modelscope/hub/models/Qwen/Qwen3-0.6B-GPTQ-Int8...
[rank0]: Traceback (most recent call last):
[rank0]:   File "/root/qwen3.py", line 12, in <module>
[rank0]:     llm = LLM(model="/root/.cache/modelscope/hub/models/Qwen/Qwen3-0.6B-GPTQ-Int8",
[rank0]:   File "/vllm-workspace/vllm/vllm/entrypoints/llm.py", line 243, in __init__
[rank0]:     self.llm_engine = LLMEngine.from_engine_args(
[rank0]:   File "/vllm-workspace/vllm/vllm/engine/llm_engine.py", line 501, in from_engine_args
[rank0]:     return engine_cls.from_vllm_config(
[rank0]:   File "/vllm-workspace/vllm/vllm/engine/llm_engine.py", line 477, in from_vllm_config
[rank0]:     return cls(
[rank0]:   File "/vllm-workspace/vllm/vllm/engine/llm_engine.py", line 265, in __init__
[rank0]:     self.model_executor = executor_class(vllm_config=vllm_config)
[rank0]:   File "/vllm-workspace/vllm/vllm/executor/executor_base.py", line 53, in __init__
[rank0]:     self._init_executor()
[rank0]:   File "/vllm-workspace/vllm/vllm/executor/uniproc_executor.py", line 48, in _init_executor
[rank0]:     self.collective_rpc("load_model")
[rank0]:   File "/vllm-workspace/vllm/vllm/executor/uniproc_executor.py", line 57, in collective_rpc
[rank0]:     answer = run_method(self.driver_worker, method, args, kwargs)
[rank0]:   File "/vllm-workspace/vllm/vllm/utils.py", line 2671, in run_method
[rank0]:     return func(*args, **kwargs)
[rank0]:   File "/vllm-workspace/vllm-ascend/vllm_ascend/worker/worker.py", line 240, in load_model
[rank0]:     self.model_runner.load_model()
[rank0]:   File "/vllm-workspace/vllm-ascend/vllm_ascend/worker/model_runner.py", line 997, in load_model
[rank0]:     self.model = get_model(vllm_config=self.vllm_config)
[rank0]:   File "/vllm-workspace/vllm/vllm/model_executor/model_loader/__init__.py", line 59, in get_model
[rank0]:     return loader.load_model(vllm_config=vllm_config,
[rank0]:   File "/vllm-workspace/vllm/vllm/model_executor/model_loader/base_loader.py", line 38, in load_model
[rank0]:     model = initialize_model(vllm_config=vllm_config,
[rank0]:   File "/vllm-workspace/vllm/vllm/model_executor/model_loader/utils.py", line 62, in initialize_model
[rank0]:     return model_class(vllm_config=vllm_config, prefix=prefix)
[rank0]:   File "/vllm-workspace/vllm/vllm/model_executor/models/qwen3.py", line 271, in __init__
[rank0]:     self.model = Qwen3Model(vllm_config=vllm_config,
[rank0]:   File "/vllm-workspace/vllm/vllm/compilation/decorators.py", line 152, in __init__
[rank0]:     old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs)
[rank0]:   File "/vllm-workspace/vllm/vllm/model_executor/models/qwen3.py", line 243, in __init__
[rank0]:     super().__init__(vllm_config=vllm_config,
[rank0]:   File "/vllm-workspace/vllm/vllm/compilation/decorators.py", line 152, in __init__
[rank0]:     old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs)
[rank0]:   File "/vllm-workspace/vllm/vllm/model_executor/models/qwen2.py", line 336, in __init__
[rank0]:     self.start_layer, self.end_layer, self.layers = make_layers(
[rank0]:   File "/vllm-workspace/vllm/vllm/model_executor/models/utils.py", line 626, in make_layers
[rank0]:     [PPMissingLayer() for _ in range(start_layer)] + [
[rank0]:   File "/vllm-workspace/vllm/vllm/model_executor/models/utils.py", line 627, in <listcomp>
[rank0]:     maybe_offload_to_cpu(layer_fn(prefix=f"{prefix}.{idx}"))
[rank0]:   File "/vllm-workspace/vllm/vllm/model_executor/models/qwen2.py", line 338, in <lambda>
[rank0]:     lambda prefix: decoder_layer_type(config=config,
[rank0]:   File "/vllm-workspace/vllm/vllm/model_executor/models/qwen3.py", line 174, in __init__
[rank0]:     self.self_attn = Qwen3Attention(
[rank0]:   File "/vllm-workspace/vllm/vllm/model_executor/models/qwen3.py", line 93, in __init__
[rank0]:     self.qkv_proj = QKVParallelLinear(
[rank0]:   File "/vllm-workspace/vllm/vllm/model_executor/layers/linear.py", line 851, in __init__
[rank0]:     super().__init__(input_size=input_size,
[rank0]:   File "/vllm-workspace/vllm/vllm/model_executor/layers/linear.py", line 397, in __init__
[rank0]:     super().__init__(input_size,
[rank0]:   File "/vllm-workspace/vllm/vllm/model_executor/layers/linear.py", line 243, in __init__
[rank0]:     self.quant_method = quant_config.get_quant_method(self,
[rank0]:   File "/vllm-workspace/vllm-ascend/vllm_ascend/quantization/quant_config.py", line 92, in get_quant_method
[rank0]:     if self.is_layer_skipped_ascend(prefix,
[rank0]:   File "/vllm-workspace/vllm-ascend/vllm_ascend/quantization/quant_config.py", line 126, in is_layer_skipped_ascend
[rank0]:     is_shard_skipped = self.quant_description[shard_prefix +
[rank0]: KeyError: 'model.layers.0.self_attn.q_proj.weight'
[ERROR] 2025-07-14-13:49:01 (PID:2354345, Device:0, RankID:-1) ERR99999 UNKNOWN applicaiton exception
root@adc8197e07af:~#