Support downloading models from www.modelscope.cn (#1588)

liuyhwangyh · web-flow · commit edb305584bda · 2023-11-17T20:38:31.000-08:00
diff --git a/docs/source/getting_started/quickstart.rst b/docs/source/getting_started/quickstart.rst
@@ -40,6 +40,16 @@ Initialize vLLM's engine for offline inference with the ``LLM`` class and the `O
 
     llm = LLM(model="facebook/opt-125m")
 
+To use a model from www.modelscope.cn, set the ``VLLM_USE_MODELSCOPE`` environment variable before creating the ``LLM``:
+
+.. code-block:: shell
+
+    export VLLM_USE_MODELSCOPE=True
+
+.. code-block:: python
+
+    llm = LLM(model="qwen/Qwen-7B-Chat", revision="v1.1.8", trust_remote_code=True)
+
 Call ``llm.generate`` to generate the outputs. It adds the input prompts to vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of ``RequestOutput`` objects, which include all the output tokens.
 
 .. code-block:: python
@@ -67,6 +77,16 @@ Start the server:
 
     $ python -m vllm.entrypoints.api_server
 
+To use a model from www.modelscope.cn:
+
+.. code-block:: console
+
+    $ VLLM_USE_MODELSCOPE=True python -m vllm.entrypoints.api_server \
+    $    --model="qwen/Qwen-7B-Chat" \
+    $    --revision="v1.1.8" \
+    $    --trust-remote-code
+
+
 By default, this command starts the server at ``http://localhost:8000`` with the OPT-125M model.
 
 Query the model in shell:
@@ -95,6 +115,13 @@ Start the server:
     $ python -m vllm.entrypoints.openai.api_server \
     $     --model facebook/opt-125m
 
+To use a model from www.modelscope.cn:
+
+.. code-block:: console
+
+    $ VLLM_USE_MODELSCOPE=True python -m vllm.entrypoints.openai.api_server \
+    $     --model="qwen/Qwen-7B-Chat" --revision="v1.1.8" --trust-remote-code
+
 By default, it starts the server at ``http://localhost:8000``. You can specify the address with ``--host`` and ``--port`` arguments. The server currently hosts one model at a time (OPT-125M in the above command) and implements `list models <https://platform.openai.com/docs/api-reference/models/list>`_ and `create completion <https://platform.openai.com/docs/api-reference/completions/create>`_ endpoints. We are actively adding support for more endpoints.
 
 This server can be queried in the same format as OpenAI API. For example, list the models:
diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
@@ -81,4 +81,18 @@ Alternatively, you can raise an issue on our `GitHub <https://github.com/vllm-pr
         output = llm.generate("Hello, my name is")
         print(output)
 
+    To use a model from www.modelscope.cn:
+
+    .. code-block:: shell
+
+       $ export VLLM_USE_MODELSCOPE=True
+
+    .. code-block:: python
+
+        from vllm import LLM
+
+        llm = LLM(model=..., revision=..., trust_remote_code=True)  # Name or path of your model
+        output = llm.generate("Hello, my name is")
+        print(output)
+
     If vLLM successfully generates text, it indicates that your model is supported.
diff --git a/vllm/config.py b/vllm/config.py
@@ -1,4 +1,5 @@
 from typing import Optional, Union
+import os
 
 import torch
 from transformers import PretrainedConfig
@@ -76,7 +77,18 @@ def __init__(
         self.tokenizer_revision = tokenizer_revision
         self.quantization = quantization
 
-        self.hf_config = get_config(model, trust_remote_code, revision)
+        if os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true":
+            # download model from ModelScope hub,
+            # lazy import so that modelscope is not required for normal use.
+            from modelscope.hub.snapshot_download import snapshot_download  # pylint: disable=C
+            model_path = snapshot_download(model_id=model,
+                                           cache_dir=download_dir,
+                                           revision=revision)
+            self.model = model_path
+            self.download_dir = model_path
+            self.tokenizer = model_path
+
+        self.hf_config = get_config(self.model, trust_remote_code, revision)
         self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
         self.max_model_len = _get_and_verify_max_len(self.hf_config,
                                                      max_model_len)
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
@@ -648,9 +648,10 @@ async def fake_stream_generator() -> AsyncGenerator[str, None]:
     max_model_len = engine_model_config.max_model_len
 
     # A separate tokenizer to map token IDs to strings.
-    tokenizer = get_tokenizer(engine_args.tokenizer,
-                              tokenizer_mode=engine_args.tokenizer_mode,
-                              trust_remote_code=engine_args.trust_remote_code)
+    tokenizer = get_tokenizer(
+        engine_model_config.tokenizer,
+        tokenizer_mode=engine_model_config.tokenizer_mode,
+        trust_remote_code=engine_model_config.trust_remote_code)
 
     uvicorn.run(app,
                 host=args.host,