
Commit 3720a50

[Inference] Support image to video task (#3289)

hanouticelina authored and Wauplin committed
1 parent bb5e4c7 · commit 3720a50

6 files changed: +269 −0 lines changed


docs/source/en/guides/inference.md
Lines changed: 1 addition & 0 deletions

@@ -219,6 +219,7 @@ For more details, refer to the [Inference Providers pricing documentation](https
 | [`~InferenceClient.image_classification`] | … |
 | [`~InferenceClient.image_segmentation`] | … |
 | [`~InferenceClient.image_to_image`] | … |
+| [`~InferenceClient.image_to_video`] | … |
 | [`~InferenceClient.image_to_text`] | … |
 | [`~InferenceClient.object_detection`] | … |
 | [`~InferenceClient.question_answering`] | … |
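
For context, a minimal, hedged usage sketch of the task the new table row documents (provider and model choice are illustrative; fal-ai is the provider wired up for this task later in the commit):

```py
from huggingface_hub import InferenceClient

# Illustrative only: provider, model ID, and file names are placeholders.
client = InferenceClient(provider="fal-ai")
video = client.image_to_video(
    "cat.jpg",
    model="Wan-AI/Wan2.2-I2V-A14B",
    prompt="turn the cat into a tiger",
)
with open("generated.mp4", "wb") as f:
    f.write(video)  # the method returns raw video bytes
```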

src/huggingface_hub/inference/_client.py
Lines changed: 80 additions & 0 deletions

@@ -81,6 +81,7 @@
     ImageSegmentationSubtask,
     ImageToImageTargetSize,
     ImageToTextOutput,
+    ImageToVideoTargetSize,
     ObjectDetectionOutputElement,
     Padding,
     QuestionAnsweringOutputElement,
@@ -1339,6 +1340,85 @@ def image_to_image(
         response = provider_helper.get_response(response, request_parameters)
         return _bytes_to_image(response)
 
+    def image_to_video(
+        self,
+        image: ContentT,
+        *,
+        model: Optional[str] = None,
+        prompt: Optional[str] = None,
+        negative_prompt: Optional[str] = None,
+        num_frames: Optional[float] = None,
+        num_inference_steps: Optional[int] = None,
+        guidance_scale: Optional[float] = None,
+        seed: Optional[int] = None,
+        target_size: Optional[ImageToVideoTargetSize] = None,
+        **kwargs,
+    ) -> bytes:
+        """
+        Generate a video from an input image.
+
+        Args:
+            image (`Union[str, Path, bytes, BinaryIO, PIL.Image.Image]`):
+                The input image to generate a video from. It can be raw bytes, an image file, a URL to an online image, or a PIL Image.
+            model (`str`, *optional*):
+                The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
+                Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
+            prompt (`str`, *optional*):
+                The text prompt to guide the video generation.
+            negative_prompt (`str`, *optional*):
+                One prompt to guide what NOT to include in the video generation.
+            num_frames (`float`, *optional*):
+                The number of video frames to generate.
+            num_inference_steps (`int`, *optional*):
+                For diffusion models. The number of denoising steps. More denoising steps usually lead to a higher
+                quality video at the expense of slower inference.
+            guidance_scale (`float`, *optional*):
+                For diffusion models. A higher guidance scale value encourages the model to generate videos closely
+                linked to the text prompt, at the expense of lower video quality.
+            seed (`int`, *optional*):
+                The seed to use for the video generation.
+            target_size (`ImageToVideoTargetSize`, *optional*):
+                The size in pixels of the output video frames.
+
+        Returns:
+            `bytes`: The generated video.
+
+        Examples:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+        >>> video = client.image_to_video("cat.jpg", model="Wan-AI/Wan2.2-I2V-A14B", prompt="turn the cat into a tiger")
+        >>> with open("tiger.mp4", "wb") as f:
+        ...     f.write(video)
+        ```
+        """
+        model_id = model or self.model
+        provider_helper = get_provider_helper(self.provider, task="image-to-video", model=model_id)
+        request_parameters = provider_helper.prepare_request(
+            inputs=image,
+            parameters={
+                "prompt": prompt,
+                "negative_prompt": negative_prompt,
+                "num_frames": num_frames,
+                "num_inference_steps": num_inference_steps,
+                "guidance_scale": guidance_scale,
+                "seed": seed,
+                "target_size": target_size,
+                **kwargs,
+            },
+            headers=self.headers,
+            model=model_id,
+            api_key=self.token,
+        )
+        response = self._inner_post(request_parameters)
+        response = provider_helper.get_response(response, request_parameters)
+        return response
+
     def image_to_text(self, image: ContentT, *, model: Optional[str] = None) -> ImageToTextOutput:
         """
         Takes an input image and return text.
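
Since the `image` argument is a `ContentT`, the new method accepts several input forms, as the docstring above describes; a small illustrative sketch (file paths, URL, and model ID are placeholders):

```py
from pathlib import Path
from huggingface_hub import InferenceClient

client = InferenceClient()
model = "Wan-AI/Wan2.2-I2V-A14B"  # illustrative model ID, taken from the docstring example

# Local file path
video = client.image_to_video(Path("cat.jpg"), model=model, prompt="make the cat run")

# Raw bytes
video = client.image_to_video(Path("cat.jpg").read_bytes(), model=model, prompt="make the cat run")

# URL to an online image (PIL images are accepted as well, per the docstring)
video = client.image_to_video("https://example.com/cat.jpg", model=model, prompt="make the cat run")
```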

src/huggingface_hub/inference/_generated/_async_client.py
Lines changed: 81 additions & 0 deletions

@@ -66,6 +66,7 @@
     ImageSegmentationSubtask,
     ImageToImageTargetSize,
     ImageToTextOutput,
+    ImageToVideoTargetSize,
     ObjectDetectionOutputElement,
     Padding,
     QuestionAnsweringOutputElement,
@@ -1385,6 +1386,86 @@ async def image_to_image(
         response = provider_helper.get_response(response, request_parameters)
         return _bytes_to_image(response)
 
+    async def image_to_video(
+        self,
+        image: ContentT,
+        *,
+        model: Optional[str] = None,
+        prompt: Optional[str] = None,
+        negative_prompt: Optional[str] = None,
+        num_frames: Optional[float] = None,
+        num_inference_steps: Optional[int] = None,
+        guidance_scale: Optional[float] = None,
+        seed: Optional[int] = None,
+        target_size: Optional[ImageToVideoTargetSize] = None,
+        **kwargs,
+    ) -> bytes:
+        """
+        Generate a video from an input image.
+
+        Args:
+            image (`Union[str, Path, bytes, BinaryIO, PIL.Image.Image]`):
+                The input image to generate a video from. It can be raw bytes, an image file, a URL to an online image, or a PIL Image.
+            model (`str`, *optional*):
+                The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
+                Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
+            prompt (`str`, *optional*):
+                The text prompt to guide the video generation.
+            negative_prompt (`str`, *optional*):
+                One prompt to guide what NOT to include in the video generation.
+            num_frames (`float`, *optional*):
+                The number of video frames to generate.
+            num_inference_steps (`int`, *optional*):
+                For diffusion models. The number of denoising steps. More denoising steps usually lead to a higher
+                quality video at the expense of slower inference.
+            guidance_scale (`float`, *optional*):
+                For diffusion models. A higher guidance scale value encourages the model to generate videos closely
+                linked to the text prompt, at the expense of lower video quality.
+            seed (`int`, *optional*):
+                The seed to use for the video generation.
+            target_size (`ImageToVideoTargetSize`, *optional*):
+                The size in pixels of the output video frames.
+
+        Returns:
+            `bytes`: The generated video.
+
+        Examples:
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> client = AsyncInferenceClient()
+        >>> video = await client.image_to_video("cat.jpg", model="Wan-AI/Wan2.2-I2V-A14B", prompt="turn the cat into a tiger")
+        >>> with open("tiger.mp4", "wb") as f:
+        ...     f.write(video)
+        ```
+        """
+        model_id = model or self.model
+        provider_helper = get_provider_helper(self.provider, task="image-to-video", model=model_id)
+        request_parameters = provider_helper.prepare_request(
+            inputs=image,
+            parameters={
+                "prompt": prompt,
+                "negative_prompt": negative_prompt,
+                "num_frames": num_frames,
+                "num_inference_steps": num_inference_steps,
+                "guidance_scale": guidance_scale,
+                "seed": seed,
+                "target_size": target_size,
+                **kwargs,
+            },
+            headers=self.headers,
+            model=model_id,
+            api_key=self.token,
+        )
+        response = await self._inner_post(request_parameters)
+        response = provider_helper.get_response(response, request_parameters)
+        return response
+
     async def image_to_text(self, image: ContentT, *, model: Optional[str] = None) -> ImageToTextOutput:
         """
         Takes an input image and return text.
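
As the docstring notes, the async variant must be awaited; a minimal sketch of driving it with `asyncio.run` (file names and model ID are illustrative):

```py
import asyncio

from huggingface_hub import AsyncInferenceClient


async def main() -> None:
    client = AsyncInferenceClient()
    video = await client.image_to_video(
        "cat.jpg",
        model="Wan-AI/Wan2.2-I2V-A14B",
        prompt="turn the cat into a tiger",
    )
    with open("tiger.mp4", "wb") as f:
        f.write(video)


asyncio.run(main())
```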

src/huggingface_hub/inference/_providers/__init__.py
Lines changed: 2 additions & 0 deletions

@@ -13,6 +13,7 @@
 from .fal_ai import (
     FalAIAutomaticSpeechRecognitionTask,
     FalAIImageToImageTask,
+    FalAIImageToVideoTask,
     FalAITextToImageTask,
     FalAITextToSpeechTask,
     FalAITextToVideoTask,
@@ -79,6 +80,7 @@
         "text-to-image": FalAITextToImageTask(),
         "text-to-speech": FalAITextToSpeechTask(),
         "text-to-video": FalAITextToVideoTask(),
+        "image-to-video": FalAIImageToVideoTask(),
         "image-to-image": FalAIImageToImageTask(),
     },
     "featherless-ai": {

src/huggingface_hub/inference/_providers/fal_ai.py
Lines changed: 31 additions & 0 deletions

@@ -213,3 +213,34 @@ def get_response(
         output = super().get_response(response, request_params)
         url = _as_dict(output)["images"][0]["url"]
         return get_session().get(url).content
+
+
+class FalAIImageToVideoTask(FalAIQueueTask):
+    def __init__(self):
+        super().__init__("image-to-video")
+
+    def _prepare_payload_as_dict(
+        self, inputs: Any, parameters: Dict, provider_mapping_info: InferenceProviderMapping
+    ) -> Optional[Dict]:
+        image_url = _as_url(inputs, default_mime_type="image/jpeg")
+        payload: Dict[str, Any] = {
+            "image_url": image_url,
+            **filter_none(parameters),
+        }
+        if provider_mapping_info.adapter_weights_path is not None:
+            lora_path = constants.HUGGINGFACE_CO_URL_TEMPLATE.format(
+                repo_id=provider_mapping_info.hf_model_id,
+                revision="main",
+                filename=provider_mapping_info.adapter_weights_path,
+            )
+            payload["loras"] = [{"path": lora_path, "scale": 1}]
+        return payload
+
+    def get_response(
+        self,
+        response: Union[bytes, Dict],
+        request_params: Optional[RequestParameters] = None,
+    ) -> Any:
+        output = super().get_response(response, request_params)
+        url = _as_dict(output)["video"]["url"]
+        return get_session().get(url).content
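
For illustration, the rough shape of the JSON payload `_prepare_payload_as_dict` builds when the provider mapping carries LoRA adapter weights. All values below are hypothetical, and the resolve-URL form is an assumption about `constants.HUGGINGFACE_CO_URL_TEMPLATE`:

```py
# Hypothetical payload sketch (not taken verbatim from the diff):
payload = {
    # _as_url() passes an http(s) URL through unchanged, or base64-encodes raw bytes
    # into a data: URI with the default image/jpeg MIME type.
    "image_url": "data:image/jpeg;base64,...",
    "prompt": "turn the cat into a tiger",
    # Added only when provider_mapping_info.adapter_weights_path is set; assumed to
    # resolve to something like
    # https://huggingface.co/<hf_model_id>/resolve/main/<adapter_weights_path>
    "loras": [{"path": "https://huggingface.co/user/my-lora/resolve/main/adapter.safetensors", "scale": 1}],
}
```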

tests/test_inference_providers.py
Lines changed: 74 additions & 0 deletions

@@ -22,6 +22,7 @@
     _POLLING_INTERVAL,
     FalAIAutomaticSpeechRecognitionTask,
     FalAIImageToImageTask,
+    FalAIImageToVideoTask,
     FalAITextToImageTask,
     FalAITextToSpeechTask,
     FalAITextToVideoTask,
@@ -476,6 +477,79 @@ def test_image_to_image_response(self, mocker):
         mock_sleep.assert_called_once_with(_POLLING_INTERVAL)
         assert response == b"image_content"
 
+    def test_image_to_video_payload(self):
+        helper = FalAIImageToVideoTask()
+        mapping_info = InferenceProviderMapping(
+            provider="fal-ai",
+            hf_model_id="Wan-AI/Wan2.2-I2V-A14B",
+            providerId="Wan-AI/Wan2.2-I2V-A14B",
+            task="image-to-video",
+            status="live",
+        )
+        payload = helper._prepare_payload_as_dict(
+            "https://example.com/image.png",
+            {"prompt": "a cat"},
+            mapping_info,
+        )
+        assert payload == {"image_url": "https://example.com/image.png", "prompt": "a cat"}
+
+        payload = helper._prepare_payload_as_dict(
+            b"dummy_image_data",
+            {"prompt": "a dog"},
+            mapping_info,
+        )
+        assert payload == {
+            "image_url": f"data:image/jpeg;base64,{base64.b64encode(b'dummy_image_data').decode()}",
+            "prompt": "a dog",
+        }
+
+    def test_image_to_video_response(self, mocker):
+        helper = FalAIImageToVideoTask()
+        mock_session = mocker.patch("huggingface_hub.inference._providers.fal_ai.get_session")
+        mock_sleep = mocker.patch("huggingface_hub.inference._providers.fal_ai.time.sleep")
+        mock_session.return_value.get.side_effect = [
+            # First call: status
+            mocker.Mock(json=lambda: {"status": "COMPLETED"}, headers={"Content-Type": "application/json"}),
+            # Second call: get result
+            mocker.Mock(json=lambda: {"video": {"url": "video_url"}}, headers={"Content-Type": "application/json"}),
+            # Third call: get video content
+            mocker.Mock(content=b"video_content"),
+        ]
+        api_key = helper._prepare_api_key("hf_token")
+        headers = helper._prepare_headers({}, api_key)
+        url = helper._prepare_url(api_key, "username/repo_name")
+
+        request_params = RequestParameters(
+            url=url,
+            headers=headers,
+            task="image-to-video",
+            model="username/repo_name",
+            data=None,
+            json=None,
+        )
+        response = helper.get_response(
+            b'{"request_id": "test_request_id", "status": "PROCESSING", "response_url": "https://queue.fal.run/username_provider/repo_name_provider/requests/test_request_id", "status_url": "https://queue.fal.run/username_provider/repo_name_provider/requests/test_request_id/status"}',
+            request_params,
+        )
+
+        # Verify the correct URLs were called
+        assert mock_session.return_value.get.call_count == 3
+        mock_session.return_value.get.assert_has_calls(
+            [
+                mocker.call(
+                    "https://router.huggingface.co/fal-ai/username_provider/repo_name_provider/requests/test_request_id/status?_subdomain=queue",
+                    headers=request_params.headers,
+                ),
+                mocker.call(
+                    "https://router.huggingface.co/fal-ai/username_provider/repo_name_provider/requests/test_request_id?_subdomain=queue",
+                    headers=request_params.headers,
+                ),
+                mocker.call("video_url"),
+            ]
+        )
+        mock_sleep.assert_called_once_with(_POLLING_INTERVAL)
+        assert response == b"video_content"
 
 
 class TestFeatherlessAIProvider:
     def test_prepare_route_chat_completionurl(self):
