support qwen_image

shell-nlp · shell-nlp · commit 0ba8d28829b9 · 2025-10-25T13:07:15.000+08:00
diff --git a/README.md b/README.md
@@ -61,6 +61,7 @@
 <summary><b>2025</b></summary>
  
 ```plaintext
+2025-10-25 支持了 qwen_image 文生图模型
 2025-9-7   支持了 图片编辑模型 (代码样例见gpt_server/tests/test_image_edit.py)
 2025-8-8   初步支持了 embedding 的 vllm 加速
 2025-6-17  支持了 jina-reranker-m0 全球首个支持多模态多语言的重排模型
@@ -367,6 +368,7 @@ Chat UI界面:
 | Models / BackEnd | model_type |
 | :--------------: | :--------: |
 |    flux     | flux  |
+|    qwen_image     | qwen_image  |
 
 <br>
 
diff --git a/gpt_server/model_worker/qwen_image.py b/gpt_server/model_worker/qwen_image.py
@@ -0,0 +1,120 @@
+import os
+from typing import List
+import uuid
+from loguru import logger
+import shortuuid
+from gpt_server.model_worker.base.model_worker_base import ModelWorkerBase
+from gpt_server.model_worker.utils import pil_to_base64
+import torch
+from diffusers import DiffusionPipeline
+from gpt_server.utils import STATIC_DIR
+
+# Directory three levels above this file, i.e. the parent of the
+# ``gpt_server`` package. Not referenced elsewhere in this module as shown.
+root_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
+
+# Suffix appended to the user prompt in ``get_image_output``; the key is
+# chosen by whether the prompt contains Chinese characters.
+positive_magic = {
+    "en": ", Ultra HD, 4K, cinematic composition.",  # for english prompt
+    "zh": ", 超清，4K，电影级构图.",  # for chinese prompt
+}
+
+# Candidate output resolutions as (width, height), keyed by aspect ratio.
+aspect_ratios = {
+    "1:1": (1328, 1328),
+    "16:9": (1664, 928),
+    "9:16": (928, 1664),
+    "4:3": (1472, 1140),
+    "3:4": (1140, 1472),
+    "3:2": (1584, 1056),
+    "2:3": (1056, 1584),
+}
+
+# Resolution passed to the pipeline for every request; fixed to the 16:9 entry.
+width, height = aspect_ratios["16:9"]
+import re
+
+
+def contains_chinese(text):
+    pattern = re.compile(r"[\u4e00-\u9fff]")
+    return bool(pattern.search(text))
+
+
+class QwenImageWorker(ModelWorkerBase):
+    def __init__(
+        self,
+        controller_addr: str,
+        worker_addr: str,
+        worker_id: str,
+        model_path: str,
+        model_names: List[str],
+        limit_worker_concurrency: int,
+        conv_template: str = None,  # type: ignore
+    ):
+        super().__init__(
+            controller_addr,
+            worker_addr,
+            worker_id,
+            model_path,
+            model_names,
+            limit_worker_concurrency,
+            conv_template,
+            model_type="image",
+        )
+        backend = os.environ["backend"]
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.pipe = DiffusionPipeline.from_pretrained(
+            model_path, torch_dtype=torch.bfloat16
+        ).to(self.device)
+
+        logger.warning(f"模型：{model_names[0]}")
+
+    async def get_image_output(self, params):
+        prompt = params["prompt"]
+        if contains_chinese(prompt):
+            prompt += positive_magic["zh"]
+        else:
+            prompt += positive_magic["en"]
+        response_format = params.get("response_format", "b64_json")
+        image = self.pipe(
+            prompt,
+            negative_prompt=" ",
+            height=height,
+            width=width,
+            num_inference_steps=50,
+            true_cfg_scale=4.0,
+            generator=torch.Generator(self.device).manual_seed(0),
+        ).images[0]
+        result = {}
+        if response_format == "b64_json":
+            # Convert PIL image to base64
+            base64 = pil_to_base64(pil_img=image)
+            result = {
+                "created": shortuuid.random(),
+                "data": [{"b64_json": base64}],
+                "usage": {
+                    "total_tokens": 0,
+                    "input_tokens": 0,
+                    "output_tokens": 0,
+                    "input_tokens_details": {"text_tokens": 0, "image_tokens": 0},
+                },
+            }
+            return result
+        elif response_format == "url":
+            # 生成唯一文件名（避免冲突）
+            file_name = str(uuid.uuid4()) + ".png"
+            save_path = STATIC_DIR / file_name
+            image.save(save_path, format="PNG")
+            WORKER_PORT = os.environ["WORKER_PORT"]
+            WORKER_HOST = os.environ["WORKER_HOST"]
+            url = f"http://{WORKER_HOST}:{WORKER_PORT}/static/{file_name}"
+            result = {
+                "created": shortuuid.random(),
+                "data": [{"url": url}],
+                "usage": {
+                    "total_tokens": 0,
+                    "input_tokens": 0,
+                    "output_tokens": 0,
+                    "input_tokens_details": {"text_tokens": 0, "image_tokens": 0},
+                },
+            }
+        return result
+
+
+if __name__ == "__main__":
+    # Script entry point: start the worker through ``run``, which is not
+    # defined in this class (presumably inherited from ModelWorkerBase).
+    QwenImageWorker.run()
diff --git a/gpt_server/script/config_example.yaml b/gpt_server/script/config_example.yaml
@@ -25,6 +25,7 @@ model_worker_args:
   limit_worker_concurrency: 1024 # worker的最大并发数,默认为 1024
 
 models:
+# --------------- 支持的大语言模型样例 ---------------
 - qwen:
     # 大语言模型
     #自定义的模型名称
@@ -63,7 +64,7 @@ models:
       #   - 1
 
 
-
+# --------------- 支持的多模态模型样例 ---------------
 - internvl2:
     # 多模态模型
     #自定义的模型名称
@@ -80,7 +81,7 @@ models:
     - gpus:
       # - 1
       - 0
-
+# --------------- 支持的rerank模型样例 ---------------
 - bge-reranker-base:
     # rerank模型
     alias: null # 别名   
@@ -93,7 +94,7 @@ models:
     workers:
     - gpus:
       - 2
-# 部署 qwen3-reranker 样例
+
 - qwen3-reranker:
     alias: null
     enable: true
@@ -108,7 +109,7 @@ models:
     workers:
     - gpus:
       - 6
-
+# --------------- 支持的多模态多语言的重排模型样例 ---------------
 - jina-reranker:
     # 多模态多语言的重排模型，这个模型task_type 只能是 auto
     alias: null
@@ -122,9 +123,8 @@ models:
     workers:
     - gpus:
       - 5
-
+# --------------- 支持的文本embedding模型样例 ---------------
 - acge_text_embedding:
-    # 文本embedding模型
     alias: text-embedding-ada-002 # 别名   
     enable: true # false true
     model_config:
@@ -136,9 +136,8 @@ models:
     workers:
     - gpus:
       - 2
-
+# --------------- 支持的vl-embedding 模型样例 --------------- 
 - bge-vl:
-    # vl-embedding 模型
     alias: null
     enable: true
     model_config:
@@ -149,9 +148,8 @@ models:
     workers:
     - gpus:
       - 2
-
+# --------------- 支持的文本审核模型样例 --------------- 
 - text-moderation:
-    # 文本审核模型
     alias: omni-moderation-latest
     enable: true
     model_config:
@@ -162,8 +160,8 @@ models:
     workers:
     - gpus:
       - 2
+# --------------- 支持的ASR模型样例 ---------------
 - SenseVoiceSmall:
-    ## 最新支持ASR模型
     alias: null
     enable: true
     model_config:
@@ -175,8 +173,8 @@ models:
     workers:
     - gpus:
       - 2
+# --------------- 支持的TTS模型样例 ---------------
 - tts:
-    # TTS 模型的配置方式
     alias: null
     enable: true
     model_config:
@@ -187,9 +185,9 @@ models:
     workers:
     - gpus:
       - 6
-
+# --------------- 支持的文生图模型样例 --------------- 
 - flux:
-    # 文生图模型
+
     alias: null
     enable: true
     model_config:
@@ -201,8 +199,19 @@ models:
     - gpus:
       - 7
 
+- qwen-image:
+    alias: null
+    enable: true
+    model_config:
+      model_name_or_path: /home/dev/model/Qwen/Qwen-Image/
+    model_type: qwen_image
+    work_mode: hf
+    device: gpu
+    workers:
+    - gpus:
+      - 7
+# --------------- 支持的图片编辑模型样例 --------------- 
 - image-edit:
-    # 图片编辑模型
     alias: null
     enable: true
     model_config:
diff --git a/pyproject.toml b/pyproject.toml
@@ -28,7 +28,7 @@ dependencies = [
     "sglang[all]>=0.5.3.post1",
     "flashinfer-python",
     "flashtts>=0.1.7",
-    "diffusers>=0.35.1",
+    "diffusers>=0.35.2",
     #"sqlmodel>=0.0.24",
     "autoawq>=0.2.9",
 ]
@@ -38,7 +38,7 @@ default-groups = [] # 默认只安装dependencies中的库
 prerelease = "allow"
 override-dependencies = [
     "setuptools==77.0.3",
-    "transformers==4.57.0",  #  infinity-emb
+    "transformers==4.57.1",  #  infinity-emb
     "soundfile==0.13.1",     # infinity
     "xgrammar==0.1.24",      #  sglang[all]==0.4.5 depends on xgrammar==0.1.17
     "outlines-core==0.2.11", # sglang 和 vllm 的冲突
diff --git a/uv.lock b/uv.lock