
Commit 2d32266

[Cherry-Pick]add 'max_image_tokens' param for docbee serving (#3888)
1 parent: 9e547f0

File tree

4 files changed: +56 -2 lines changed

docs/pipeline_usage/tutorials/vlm_pipelines/doc_understanding.en.md

Lines changed: 7 additions & 0 deletions
@@ -399,6 +399,13 @@ Below is a basic service deployment API reference and multilingual service call
 <td>Optional</td>
 <td>false</td>
 </tr>
+<tr>
+<td><code>max_image_tokens</code></td>
+<td><code>int</code></td>
+<td>Maximum number of tokens for the input image</td>
+<td>Optional</td>
+<td>None</td>
+</tr>
 </tbody>
 </table>

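
Usage note: the doc-understanding pipeline is served through an OpenAI-compatible chat completions endpoint, so the new max_image_tokens field can be passed through the OpenAI Python client's extra_body. A minimal sketch follows; the base URL, model name, image URL, and the 2200-token budget are illustrative assumptions, not values taken from this commit.

# Hypothetical client-side sketch; base_url, model name, and the
# token budget below are assumptions for illustration.
from openai import OpenAI

client = OpenAI(
    base_url="http://127.0.0.1:8080/v1",  # assumed serving address
    api_key="EMPTY",  # assumed: a local server that ignores the key
)

response = client.chat.completions.create(
    model="pp-docbee",  # assumed model name
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is the invoice total?"},
                {"type": "image_url",
                 "image_url": {"url": "https://example.com/invoice.jpg"}},
            ],
        }
    ],
    # Non-OpenAI fields are forwarded in the request body, where the
    # server reads them via InferRequest.
    extra_body={"max_image_tokens": 2200},
)
print(response.choices[0].message.content)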

docs/pipeline_usage/tutorials/vlm_pipelines/doc_understanding.md

Lines changed: 7 additions & 0 deletions
@@ -399,6 +399,13 @@ for res in output:
 <td>No</td>
 <td>false</td>
 </tr>
+<tr>
+<td><code>max_image_tokens</code></td>
+<td><code>int</code></td>
+<td>Maximum number of input tokens for the image</td>
+<td>No</td>
+<td>None</td>
+</tr>
 </tbody>
 </table>


paddlex/inference/serving/basic_serving/_pipeline_apps/doc_understanding.py

Lines changed: 41 additions & 2 deletions
@@ -12,9 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import math
 import time
 from typing import Any, List

+from .....utils import logging
 from .....utils.deps import function_requires_deps, is_dep_available
 from ...infra import utils as serving_utils
 from ...infra.config import AppConfig
@@ -35,9 +37,11 @@
     from openai.types.chat import ChatCompletion
     from openai.types.chat.chat_completion import Choice as ChatCompletionChoice
     from openai.types.chat.chat_completion_message import ChatCompletionMessage
+if is_dep_available("pillow"):
+    from PIL import Image


-@function_requires_deps("fastapi", "openai")
+@function_requires_deps("fastapi", "openai", "pillow")
 def create_pipeline_app(pipeline: Any, app_config: AppConfig) -> "FastAPI":
     app, ctx = create_app(
         pipeline=pipeline, app_config=app_config, app_aiohttp_session=True
@@ -55,6 +59,30 @@ def create_pipeline_app(pipeline: Any, app_config: AppConfig) -> "FastAPI":
     )
     async def _infer(request: InferRequest) -> "ChatCompletion":
         pipeline = ctx.pipeline
+        aiohttp_session = ctx.aiohttp_session
+
+        def _resize_image_with_token_limit(image, max_token_num=2200, tile_size=28):
+            image = Image.fromarray(image)
+            w0, h0 = image.width, image.height
+            tokens = math.ceil(w0 / tile_size) * math.ceil(h0 / tile_size)
+            if tokens <= max_token_num:
+                return image
+
+            k = math.sqrt(
+                max_token_num / (math.ceil(w0 / tile_size) * math.ceil(h0 / tile_size))
+            )
+            k = min(1.0, k)
+            w_new = max(int(w0 * k), tile_size)
+            h_new = max(int(h0 * k), tile_size)
+            new_size = (w_new, h_new)
+            resized_image = image.resize(new_size)
+            tokens_new = math.ceil(w_new / tile_size) * math.ceil(h_new / tile_size)
+            logging.info(
+                f"Resizing image from {w0}x{h0} to {w_new}x{h_new}, "
+                f"which will reduce the image tokens from {tokens} to {tokens_new}."
+            )
+
+            return resized_image

         def _process_messages(messages: List[Message]):
             system_message = ""
@@ -88,9 +116,20 @@ def _process_messages(messages: List[Message]):
             return system_message, user_message, image_url

         system_message, user_message, image_url = _process_messages(request.messages)
+        if request.max_image_tokens is not None:
+            if image_url.startswith("data:image"):
+                _, image_url = image_url.split(",", 1)
+            img_bytes = await serving_utils.get_raw_bytes_async(
+                image_url, aiohttp_session
+            )
+            image = serving_utils.image_bytes_to_array(img_bytes)
+            image = _resize_image_with_token_limit(image, request.max_image_tokens)
+        else:
+            image = image_url
+
         result = (
             await pipeline.infer(
-                {"image": image_url, "query": user_message},
+                {"image": image, "query": user_message},
             )
         )[0]

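
To make the resizing arithmetic concrete: the server estimates one visual token per 28x28 tile, i.e. tokens = ceil(w/28) * ceil(h/28), and when that exceeds the budget it scales both sides by k = sqrt(budget/tokens), which shrinks the tile count by roughly k^2. Below is a minimal standalone sketch of the same math, an illustration mirroring the patched helper rather than the committed code; note that the per-axis ceilings mean the result can land slightly above the budget.

import math

def estimate_tokens(width: int, height: int, tile_size: int = 28) -> int:
    # One visual token per tile_size x tile_size tile, rounded up per axis.
    return math.ceil(width / tile_size) * math.ceil(height / tile_size)

def target_size(width: int, height: int, max_token_num: int = 2200,
                tile_size: int = 28):
    # Keep the original size if already within budget.
    tokens = estimate_tokens(width, height, tile_size)
    if tokens <= max_token_num:
        return width, height
    # Scaling both sides by k scales the tile count by ~k**2,
    # so k = sqrt(budget / tokens) lands near the budget.
    k = min(1.0, math.sqrt(max_token_num / tokens))
    return max(int(width * k), tile_size), max(int(height * k), tile_size)

# Example: a 2480x3508 page (A4 at 300 DPI) is ~11214 tokens,
# well over a 2200-token budget.
w, h = target_size(2480, 3508)
print(w, h, estimate_tokens(w, h))  # 1098 1553 2240 (ceilings overshoot slightly)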

paddlex/inference/serving/schemas/doc_understanding.py

Lines changed: 1 addition & 0 deletions
@@ -70,6 +70,7 @@ class InferRequest(BaseModel):
     temperature: Optional[float] = 0.1
     top_p: Optional[float] = 0.95
     stream: Optional[bool] = False
+    max_image_tokens: Optional[int] = None


 PRIMARY_OPERATIONS: Final[PrimaryOperations] = {
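
Schema note: because the new field defaults to None, existing clients are unaffected and the server-side resize branch only runs when a request supplies an integer. A reduced pydantic sketch, with the field list trimmed to what is relevant here (the real InferRequest carries more fields):

from typing import Optional
from pydantic import BaseModel

class InferRequest(BaseModel):  # reduced illustration of the real schema
    stream: Optional[bool] = False
    max_image_tokens: Optional[int] = None

print(InferRequest().max_image_tokens)                       # None -> no resizing
print(InferRequest(max_image_tokens=2200).max_image_tokens)  # 2200 -> resize branch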
