@@ -32,7 +32,6 @@
 
 from huggingface_hub import model_info
 from huggingface_hub.constants import HF_HUB_OFFLINE
-from PIL import Image
 
 import transformers
 from transformers.models.auto.modeling_auto import (
@@ -45,6 +44,7 @@
     is_openai_available,
     is_pydantic_available,
     is_uvicorn_available,
+    is_vision_available,
 )
 
 from .. import (
@@ -54,7 +54,6 @@
     ProcessorMixin,
     TextIteratorStreamer,
 )
-from ..generation.continuous_batching import ContinuousBatchingManager, RequestStatus
 from ..utils import is_torch_available, logging
 from . import BaseTransformersCLICommand
 
@@ -69,9 +68,14 @@
         PreTrainedModel,
     )
 
+    from ..generation.continuous_batching import ContinuousBatchingManager, RequestStatus
+
 if is_librosa_available():
     import librosa
 
+if is_vision_available():
+    from PIL import Image
+
 serve_dependencies_available = (
     is_pydantic_available() and is_fastapi_available() and is_uvicorn_available() and is_openai_available()
 )
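
The import hunks above all apply the same optional-dependency pattern: anything that needs torch, Pillow, or librosa is only imported behind the matching `is_*_available()` check, so the module itself stays importable when an extra is missing. A minimal, self-contained sketch of that pattern; the `is_vision_available` stand-in and the `describe_image` helper are illustrative, not part of this PR:

```python
import importlib.util


def is_vision_available() -> bool:
    # Mirrors the spirit of transformers' availability checks:
    # report whether the optional dependency can be imported at all.
    return importlib.util.find_spec("PIL") is not None


if is_vision_available():
    from PIL import Image  # only bound when Pillow is installed


def describe_image(path: str) -> str:
    # Guard the feature, not just the import: callers get a clear
    # error instead of a NameError on the unbound `Image` symbol.
    if not is_vision_available():
        raise ImportError("Pillow is required for image inputs; `pip install Pillow`")
    with Image.open(path) as im:
        return f"{im.format} {im.size[0]}x{im.size[1]}"
```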
@@ -811,7 +815,7 @@ def stream_chat_completion(_inputs):
         return stream_chat_completion(inputs[0])
 
     @staticmethod
-    def get_model_modality(model: PreTrainedModel) -> Modality:
+    def get_model_modality(model: "PreTrainedModel") -> Modality:
         model_classname = model.__class__.__name__
         if model_classname in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values():
             modality = Modality.VLM
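
Quoting the annotation (`"PreTrainedModel"`) turns it into a forward reference: Python stores it as a string rather than evaluating the name when the function is defined, so the module still imports when torch is absent and `PreTrainedModel` was never bound, while static type checkers resolve the string as usual. A minimal sketch of the same idea using the standard `typing.TYPE_CHECKING` guard; `heavy_framework` and `HeavyModel` are hypothetical stand-ins:

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only evaluated by static type checkers (mypy, pyright),
    # never at runtime, so this import cannot fail in production.
    from heavy_framework import HeavyModel  # hypothetical optional dependency


def run(model: "HeavyModel") -> str:
    # The quoted annotation is kept as a plain string at runtime;
    # nothing is imported unless a type checker asks about it.
    return model.__class__.__name__
```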
@@ -1545,7 +1549,9 @@ def _load_model_and_data_processor(self, model_id_and_revision: str):
         logger.info(f"Loaded model {model_id_and_revision}")
         return model, data_processor
 
-    def load_model_and_processor(self, model_id_and_revision: str) -> tuple[PreTrainedModel, PreTrainedTokenizerFast]:
+    def load_model_and_processor(
+        self, model_id_and_revision: str
+    ) -> tuple["PreTrainedModel", PreTrainedTokenizerFast]:
         """
         Loads the text model and processor from the given model ID and revision into the ServeCommand instance.
 
@@ -1570,7 +1576,7 @@ def load_model_and_processor(self, model_id_and_revision: str) -> tuple[PreTrainedModel, PreTrainedTokenizerFast]:
 
         return model, processor
 
-    def load_audio_model_and_processor(self, model_id_and_revision: str) -> tuple[PreTrainedModel, ProcessorMixin]:
+    def load_audio_model_and_processor(self, model_id_and_revision: str) -> tuple["PreTrainedModel", ProcessorMixin]:
         """
         Loads the audio model and processor from the given model ID and revision into the ServeCommand instance.
 
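
A quick sanity check for a change like this is to import the CLI module in an environment without the optional extras installed (assuming the module path below matches the repository layout):

```python
# Run in a virtualenv without torch or Pillow. With the guarded imports
# above, the module should import cleanly instead of raising ImportError
# at import time.
import transformers.commands.serving  # noqa: F401
```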