Description
System Info
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Jun_13_19:16:58_PDT_2023
Cuda compilation tools, release 12.2, V12.2.91
Build cuda_12.2.r12.2/compiler.32965470_0
Name: transformers
Version: 4.57.3
Name: xllamacpp
Version: 0.2.6
Name: vllm
Version: 0.11.0
Python 3.12.2
Linux version 4.18.0-553.22.1.el8_10.x86_64 (mockbuild@iad1-prod-build001.bld.equ.rockylinux.org) (gcc version 8.5.0 20210514 (Red Hat 8.5.0-22)
Running Xinference with Docker?
- docker
- pip install
- installation from source
Version info
Name: xinference
Version: 1.16.0
Summary: Model Serving Made Easy
Home-page: https://github.com/xorbitsai/inference
Author: Qin Xuye
Author-email: qinxuye@xprobe.io
License: Apache License 2.0
Location: /home/jmchang/miniconda3/lib/python3.12/site-packages
Requires: aiohttp, aioprometheus, async-timeout, bcrypt, click, fastapi, gradio, huggingface-hub, modelscope, nvidia-ml-py, openai, peft, pillow, pydantic, pynvml, python-jose, requests, setproctitle, sse_starlette, tabulate, timm, torch, tqdm, typing_extensions, uvicorn, xoscar
Required-by:
The command used to start Xinference
xinference-local -H 0.0.0.0
Reproduction
1. Register the custom model via the register-model API with the following JSON:
{
"version": 2,
"context_length": 40960,
"model_name": "glm-4.7-awq",
"model_lang": [
"en",
"zh"
],
"model_ability": [
"generate",
"chat"
],
"model_description": "This is a custom model description.",
"model_family": "other",
"model_specs": [
{
"model_format": "awq",
"model_size_in_billions": 357,
"quantization": "AWQ",
"model_id": null,
"model_hub": "huggingface",
"model_uri": "/data/jmchang/workspace/modelscope/GLM-4-7-awq/",
"model_revision": null,
"activated_size_in_billions": null
}
],
"chat_template": "[gMASK]\n{%- if tools -%}\n<|system|>\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n\n{% for tool in tools %}\n{{ tool | tojson(ensure_ascii=False) }}\n{% endfor %}\n\n\nFor each function call, output the function name and arguments within the following XML format:\n<tool_call>{function-name}<arg_key>{arg-key-1}</arg_key><arg_value>{arg-value-1}</arg_value><arg_key>{arg-key-2}</arg_key><arg_value>{arg-value-2}</arg_value>...</tool_call>{%- endif -%}\n{%- macro visible_text(content) -%}\n {%- if content is string -%}\n {{- content }}\n {%- elif content is iterable and content is not mapping -%}\n {%- for item in content -%}\n {%- if item is mapping and item.type == 'text' -%}\n {{- item.text }}\n {%- elif item is string -%}\n {{- item }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{- content }}\n {%- endif -%}\n{%- endmacro -%}\n{%- set ns = namespace(last_user_index=-1) %}\n{%- for m in messages %}\n {%- if m.role == 'user' %}\n {% set ns.last_user_index = loop.index0 -%}\n {%- endif %}\n{%- endfor %}\n{% for m in messages %}\n{%- if m.role == 'user' -%}<|user|>{{ visible_text(m.content) }}\n{%- elif m.role == 'assistant' -%}\n<|assistant|>\n{%- set reasoning_content = '' %}\n{%- set content = visible_text(m.content) %}\n{%- if m.reasoning_content is string %}\n {%- set reasoning_content = m.reasoning_content %}\n{%- else %}\n {%- if '' in content %}\n {%- set reasoning_content = content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}\n {%- set content = content.split('')[-1].lstrip('\n') %}\n {%- endif %}\n{%- endif %}\n{%- if ((clear_thinking is defined and not clear_thinking) or loop.index0 > ns.last_user_index) and reasoning_content -%}\n{{ '' + reasoning_content.strip() + ''}}\n{%- else -%}\n{{ '' }}\n{%- endif -%}\n{%- if content.strip() -%}\n{{ content.strip() }}\n{%- endif -%}\n{% if m.tool_calls %}\n{% for tc in m.tool_calls %}\n{%- if tc.function %}\n {%- set tc = tc.function %}\n{%- endif %}\n{{- '<tool_call>' + tc.name -}}\n{% set _args = tc.arguments %}{% for k, v in _args.items() %}<arg_key>{{ k }}</arg_key><arg_value>{{ v | tojson(ensure_ascii=False) if v is not string else v }}</arg_value>{% endfor %}</tool_call>{% endfor %}\n{% endif %}\n{%- elif m.role == 'tool' -%}\n{%- if m.content is string -%}\n{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}\n {{- '<|observation|>' }}\n{%- endif %}\n{{- '<tool_response>' }}\n{{- m.content }}\n{{- '</tool_response>' }}\n{%- else -%}\n<|observation|>{% for tr in m.content %}\n<tool_response>{{ tr.output if tr.output is defined else tr }}</tool_response>{% endfor -%}\n{% endif -%}\n{%- elif m.role == 'system' -%}\n<|system|>{{ visible_text(m.content) }}\n{%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n <|assistant|>{{- '' if (enable_thinking is defined and not enable_thinking) else '' -}}\n{%- endif -%}",
"stop_token_ids": [],
"stop": [],
"architectures": null,
"reasoning_start_tag": null,
"reasoning_end_tag": null,
"cache_config": null,
"virtualenv": {
"packages": [],
"inherit_pip_config": true,
"index_url": null,
"extra_index_url": null,
"find_links": null,
"trusted_host": null,
"no_build_isolation": null
},
"tool_parser": null,
"is_builtin": false
}
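For reference, a minimal sketch of how the same JSON could be registered programmatically with the Xinference Python client (assuming the default local endpoint; the JSON file name is illustrative):

from xinference.client import Client

client = Client("http://localhost:9997")  # default xinference-local endpoint
with open("glm-4.7-awq.json") as f:       # the custom model JSON shown above
    model_json = f.read()
client.register_model(model_type="LLM", model=model_json, persist=True)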
2. When launching the model, there is no option to choose the vLLM engine.
3. When launching the model, the following error occurred:
Traceback (most recent call last):
File "/home/jmchang/miniconda3/lib/python3.12/site-packages/xinference/api/restful_api.py", line 1284, in launch_model
model_uid = await (await self._get_supervisor_ref()).launch_builtin_model(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/jmchang/miniconda3/lib/python3.12/site-packages/xoscar/backends/context.py", line 262, in send
return self._process_result_message(result)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/jmchang/miniconda3/lib/python3.12/site-packages/xoscar/backends/context.py", line 111, in _process_result_message
raise message.as_instanceof_cause()
File "/home/jmchang/miniconda3/lib/python3.12/site-packages/xoscar/backends/pool.py", line 689, in send
result = await self._run_coro(message.message_id, coro)
^^^^^^^^^^^^^^^^^
File "/home/jmchang/miniconda3/lib/python3.12/site-packages/xoscar/backends/pool.py", line 389, in _run_coro
return await coro
File "/home/jmchang/miniconda3/lib/python3.12/site-packages/xoscar/api.py", line 418, in on_receive
return await super().on_receive(message) # type: ignore
^^^^^^^^^^^^^^^^^
File "xoscar/core.pyx", line 564, in on_receive
raise ex
File "xoscar/core.pyx", line 526, in xoscar.core._BaseActor.on_receive
async with self._lock:
^^^^^^^^^^^^^^^^^
File "xoscar/core.pyx", line 527, in xoscar.core._BaseActor.on_receive
with debug_async_timeout('actor_lock_timeout',
^^^^^^^^^^^^^^^^^
File "xoscar/core.pyx", line 532, in xoscar.core._BaseActor.on_receive
result = await result
^^^^^^^^^^^^^^^^^
File "/home/jmchang/miniconda3/lib/python3.12/site-packages/xinference/core/utils.py", line 94, in wrapped
ret = await func(*args, **kwargs)
^^^^^^^^^^^^^^^^^
File "/home/jmchang/miniconda3/lib/python3.12/site-packages/xinference/core/supervisor.py", line 1357, in launch_builtin_model
await _launch_model()
^^^^^^^^^^^^^^^^^
File "/home/jmchang/miniconda3/lib/python3.12/site-packages/xinference/core/supervisor.py", line 1293, in _launch_model
raise result
File "/home/jmchang/miniconda3/lib/python3.12/site-packages/xinference/core/supervisor.py", line 1107, in _launch_one_model
subpool_address = await worker_ref.launch_builtin_model(
^^^^^^^^^^^^^^^^^
File "/home/jmchang/miniconda3/lib/python3.12/site-packages/xoscar/backends/context.py", line 262, in send
return self._process_result_message(result)
^^^^^^^^^^^^^^^^^
File "/home/jmchang/miniconda3/lib/python3.12/site-packages/xoscar/backends/context.py", line 111, in _process_result_message
raise message.as_instanceof_cause()
^^^^^^^^^^^^^^^^^
File "/home/jmchang/miniconda3/lib/python3.12/site-packages/xoscar/backends/pool.py", line 689, in send
result = await self._run_coro(message.message_id, coro)
^^^^^^^^^^^^^^^^^
File "/home/jmchang/miniconda3/lib/python3.12/site-packages/xoscar/backends/pool.py", line 389, in _run_coro
return await coro
File "/home/jmchang/miniconda3/lib/python3.12/site-packages/xoscar/api.py", line 418, in on_receive
return await super().on_receive(message) # type: ignore
^^^^^^^^^^^^^^^^^
File "xoscar/core.pyx", line 564, in on_receive
raise ex
File "xoscar/core.pyx", line 526, in xoscar.core._BaseActor.on_receive
async with self._lock:
^^^^^^^^^^^^^^^^^
File "xoscar/core.pyx", line 527, in xoscar.core._BaseActor.on_receive
with debug_async_timeout('actor_lock_timeout',
^^^^^^^^^^^^^^^^^
File "xoscar/core.pyx", line 532, in xoscar.core._BaseActor.on_receive
result = await result
^^^^^^^^^^^^^^^^^
File "/home/jmchang/miniconda3/lib/python3.12/site-packages/xinference/core/utils.py", line 94, in wrapped
ret = await func(*args, **kwargs)
^^^^^^^^^^^^^^^^^
File "/home/jmchang/miniconda3/lib/python3.12/site-packages/xinference/core/worker.py", line 1596, in launch_builtin_model
await model_ref.load()
^^^^^^^^^^^^^^^^^
File "/home/jmchang/miniconda3/lib/python3.12/site-packages/xoscar/backends/context.py", line 262, in send
return self._process_result_message(result)
^^^^^^^^^^^^^^^^^
File "/home/jmchang/miniconda3/lib/python3.12/site-packages/xoscar/backends/context.py", line 111, in _process_result_message
raise message.as_instanceof_cause()
^^^^^^^^^^^^^^^^^
File "/home/jmchang/miniconda3/lib/python3.12/site-packages/xoscar/backends/pool.py", line 689, in send
result = await self._run_coro(message.message_id, coro)
^^^^^^^^^^^^^^^^^
File "/home/jmchang/miniconda3/lib/python3.12/site-packages/xoscar/backends/pool.py", line 389, in _run_coro
return await coro
File "/home/jmchang/miniconda3/lib/python3.12/site-packages/xoscar/api.py", line 418, in on_receive
return await super().on_receive(message) # type: ignore
^^^^^^^^^^^^^^^^^
File "xoscar/core.pyx", line 564, in on_receive
raise ex
File "xoscar/core.pyx", line 526, in xoscar.core._BaseActor.on_receive
async with self._lock:
^^^^^^^^^^^^^^^^^
File "xoscar/core.pyx", line 527, in xoscar.core._BaseActor.on_receive
with debug_async_timeout('actor_lock_timeout',
^^^^^^^^^^^^^^^^^
File "xoscar/core.pyx", line 532, in xoscar.core._BaseActor.on_receive
result = await result
^^^^^^^^^^^^^^^^^
File "/home/jmchang/miniconda3/lib/python3.12/site-packages/xinference/core/model.py", line 378, in load
await asyncio.to_thread(self._model.load)
^^^^^^^^^^^^^^^^^
File "/home/jmchang/miniconda3/lib/python3.12/asyncio/threads.py", line 25, in to_thread
return await loop.run_in_executor(None, func_call)
^^^^^^^^^^^^^^^^^
File "/home/jmchang/miniconda3/lib/python3.12/concurrent/futures/thread.py", line 58, in run
result = self.fn(*self.args, **self.kwargs)
^^^^^^^^^^^^^^^^^
File "/home/jmchang/miniconda3/lib/python3.12/site-packages/xinference/model/llm/transformers/core.py", line 1028, in load
super().load()
File "/home/jmchang/miniconda3/lib/python3.12/site-packages/xinference/model/llm/transformers/core.py", line 344, in load
self._model, self._tokenizer = self._load_model(**kwargs)
^^^^^^^^^^^^^^^^^
File "/home/jmchang/miniconda3/lib/python3.12/site-packages/xinference/model/llm/transformers/core.py", line 219, in _load_model
model = AutoModelForCausalLM.from_pretrained(
^^^^^^^^^^^^^^^^^
File "/home/jmchang/miniconda3/lib/python3.12/site-packages/transformers/models/auto/auto_factory.py", line 604, in from_pretrained
return model_class.from_pretrained(
^^^^^^^^^^^^^^^^^
File "/home/jmchang/miniconda3/lib/python3.12/site-packages/transformers/modeling_utils.py", line 277, in _wrapper
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^
File "/home/jmchang/miniconda3/lib/python3.12/site-packages/transformers/modeling_utils.py", line 4881, in from_pretrained
hf_quantizer, config, dtype, device_map = get_hf_quantizer(
^^^^^^^^^^^^^^^^^
File "/home/jmchang/miniconda3/lib/python3.12/site-packages/transformers/quantizers/auto.py", line 305, in get_hf_quantizer
config.quantization_config = AutoHfQuantizer.merge_quantization_configs(
^^^^^^^^^^^^^^^^^
File "/home/jmchang/miniconda3/lib/python3.12/site-packages/transformers/quantizers/auto.py", line 214, in merge_quantization_configs
quantization_config = AutoQuantizationConfig.from_dict(quantization_config)
^^^^^^^^^^^^^^^^^
File "/home/jmchang/miniconda3/lib/python3.12/site-packages/transformers/quantizers/auto.py", line 140, in from_dict
return target_cls.from_dict(quantization_config_dict)
^^^^^^^^^^^^^^^^^
File "/home/jmchang/miniconda3/lib/python3.12/site-packages/transformers/utils/quantization_config.py", line 122, in from_dict
config = cls(**config_dict)
^^^^^^^^^^^^^^^^^
File "/home/jmchang/miniconda3/lib/python3.12/site-packages/transformers/utils/quantization_config.py", line 944, in init
self.post_init()
File "/home/jmchang/miniconda3/lib/python3.12/site-packages/transformers/utils/quantization_config.py", line 1003, in post_init
raise ValueError(
ValueError: [address=0.0.0.0:38119, pid=3788449] You current version of autoawq does not support module quantization skipping, please upgrade autoawq package to at least 0.1.8.
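The error above appears to come from transformers' AWQ quantization-config check, which is raised when the checkpoint's quantization_config uses modules_to_not_convert but the installed autoawq package is missing or older than 0.1.8; that interpretation is an assumption. A minimal diagnostic sketch (the config path mirrors the model_uri registered above):

import json
import importlib.metadata

# Inspect the checkpoint's quantization_config and the installed autoawq version
# to see which condition triggers the ValueError above.
cfg_path = "/data/jmchang/workspace/modelscope/GLM-4-7-awq/config.json"
with open(cfg_path) as f:
    quant_cfg = json.load(f).get("quantization_config", {})
print("modules_to_not_convert:", quant_cfg.get("modules_to_not_convert"))

try:
    print("autoawq version:", importlib.metadata.version("autoawq"))
except importlib.metadata.PackageNotFoundError:
    print("autoawq is not installed")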
Expected behavior
The model should be able to be registered and launched successfully.