
Commit c5f5046

[Inference] Support image to video task (#3289)
1 parent 00f12e5 commit c5f5046

File tree

6 files changed: +269 -0 lines changed

docs/source/en/guides/inference.md

Lines changed: 1 addition & 0 deletions

@@ -219,6 +219,7 @@ For more details, refer to the [Inference Providers pricing documentation](https
 | [`~InferenceClient.image_classification`] |||||||||||||||
 | [`~InferenceClient.image_segmentation`] |||||||||||||||
 | [`~InferenceClient.image_to_image`] |||||||||||||||
+| [`~InferenceClient.image_to_video`] |||||||||||||||
 | [`~InferenceClient.image_to_text`] |||||||||||||||
 | [`~InferenceClient.object_detection`] ||||||||||||||| ||
 | [`~InferenceClient.question_answering`] |||||||||||||||
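As a quick orientation for guide readers, here is a minimal usage sketch of the newly supported task. It mirrors the docstring example added in this commit; choosing `provider="fal-ai"` explicitly is an assumption based on the provider mapping added further down, and `cat.jpg` and the Wan model ID are placeholder values taken from that example.

```py
from huggingface_hub import InferenceClient

# fal.ai is the provider wired up for "image-to-video" in this commit;
# leaving provider unset lets the client pick one automatically.
client = InferenceClient(provider="fal-ai")

video = client.image_to_video(
    "cat.jpg",                           # local file, raw bytes, URL, or PIL image
    model="Wan-AI/Wan2.2-I2V-A14B",
    prompt="turn the cat into a tiger",
)

with open("tiger.mp4", "wb") as f:
    f.write(video)
```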

src/huggingface_hub/inference/_client.py

Lines changed: 80 additions & 0 deletions

@@ -80,6 +80,7 @@
     ImageSegmentationSubtask,
     ImageToImageTargetSize,
     ImageToTextOutput,
+    ImageToVideoTargetSize,
     ObjectDetectionOutputElement,
     Padding,
     QuestionAnsweringOutputElement,
@@ -1337,6 +1338,85 @@ def image_to_image(
         response = provider_helper.get_response(response, request_parameters)
         return _bytes_to_image(response)
 
+    def image_to_video(
+        self,
+        image: ContentT,
+        *,
+        model: Optional[str] = None,
+        prompt: Optional[str] = None,
+        negative_prompt: Optional[str] = None,
+        num_frames: Optional[float] = None,
+        num_inference_steps: Optional[int] = None,
+        guidance_scale: Optional[float] = None,
+        seed: Optional[int] = None,
+        target_size: Optional[ImageToVideoTargetSize] = None,
+        **kwargs,
+    ) -> bytes:
+        """
+        Generate a video from an input image.
+
+        Args:
+            image (`Union[str, Path, bytes, BinaryIO, PIL.Image.Image]`):
+                The input image to generate a video from. It can be raw bytes, an image file, a URL to an online image, or a PIL Image.
+            model (`str`, *optional*):
+                The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
+                Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
+            prompt (`str`, *optional*):
+                The text prompt to guide the video generation.
+            negative_prompt (`str`, *optional*):
+                One prompt to guide what NOT to include in the video generation.
+            num_frames (`float`, *optional*):
+                The number of video frames to generate.
+            num_inference_steps (`int`, *optional*):
+                For diffusion models. The number of denoising steps. More denoising steps usually lead to a higher
+                quality video at the expense of slower inference.
+            guidance_scale (`float`, *optional*):
+                For diffusion models. A higher guidance scale value encourages the model to generate videos closely
+                linked to the text prompt at the expense of lower video quality.
+            seed (`int`, *optional*):
+                The seed to use for the video generation.
+            target_size (`ImageToVideoTargetSize`, *optional*):
+                The size in pixels of the output video frames.
+
+        Returns:
+            `bytes`: The generated video.
+
+        Examples:
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> client = InferenceClient()
+        >>> video = client.image_to_video("cat.jpg", model="Wan-AI/Wan2.2-I2V-A14B", prompt="turn the cat into a tiger")
+        >>> with open("tiger.mp4", "wb") as f:
+        ...     f.write(video)
+        ```
+        """
+        model_id = model or self.model
+        provider_helper = get_provider_helper(self.provider, task="image-to-video", model=model_id)
+        request_parameters = provider_helper.prepare_request(
+            inputs=image,
+            parameters={
+                "prompt": prompt,
+                "negative_prompt": negative_prompt,
+                "num_frames": num_frames,
+                "num_inference_steps": num_inference_steps,
+                "guidance_scale": guidance_scale,
+                "seed": seed,
+                "target_size": target_size,
+                **kwargs,
+            },
+            headers=self.headers,
+            model=model_id,
+            api_key=self.token,
+        )
+        response = self._inner_post(request_parameters)
+        response = provider_helper.get_response(response, request_parameters)
+        return response
+
     def image_to_text(self, image: ContentT, *, model: Optional[str] = None) -> ImageToTextOutput:
         """
         Takes an input image and returns text.
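A sketch of how the optional generation parameters can be forwarded. They are passed through to the provider as-is, so which ones a given model actually honors is not guaranteed by this commit; the values below are purely illustrative.

```py
from huggingface_hub import InferenceClient

client = InferenceClient(provider="fal-ai")

video = client.image_to_video(
    "cat.jpg",
    model="Wan-AI/Wan2.2-I2V-A14B",
    prompt="turn the cat into a tiger",
    negative_prompt="blurry, low quality",
    num_frames=81,            # forwarded unchanged in the request parameters
    num_inference_steps=30,
    guidance_scale=5.0,
    seed=42,
)
```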

src/huggingface_hub/inference/_generated/_async_client.py

Lines changed: 81 additions & 0 deletions

@@ -65,6 +65,7 @@
     ImageSegmentationSubtask,
     ImageToImageTargetSize,
     ImageToTextOutput,
+    ImageToVideoTargetSize,
     ObjectDetectionOutputElement,
     Padding,
     QuestionAnsweringOutputElement,
@@ -1383,6 +1384,86 @@ async def image_to_image(
         response = provider_helper.get_response(response, request_parameters)
         return _bytes_to_image(response)
 
+    async def image_to_video(
+        self,
+        image: ContentT,
+        *,
+        model: Optional[str] = None,
+        prompt: Optional[str] = None,
+        negative_prompt: Optional[str] = None,
+        num_frames: Optional[float] = None,
+        num_inference_steps: Optional[int] = None,
+        guidance_scale: Optional[float] = None,
+        seed: Optional[int] = None,
+        target_size: Optional[ImageToVideoTargetSize] = None,
+        **kwargs,
+    ) -> bytes:
+        """
+        Generate a video from an input image.
+
+        Args:
+            image (`Union[str, Path, bytes, BinaryIO, PIL.Image.Image]`):
+                The input image to generate a video from. It can be raw bytes, an image file, a URL to an online image, or a PIL Image.
+            model (`str`, *optional*):
+                The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
+                Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
+            prompt (`str`, *optional*):
+                The text prompt to guide the video generation.
+            negative_prompt (`str`, *optional*):
+                One prompt to guide what NOT to include in the video generation.
+            num_frames (`float`, *optional*):
+                The number of video frames to generate.
+            num_inference_steps (`int`, *optional*):
+                For diffusion models. The number of denoising steps. More denoising steps usually lead to a higher
+                quality video at the expense of slower inference.
+            guidance_scale (`float`, *optional*):
+                For diffusion models. A higher guidance scale value encourages the model to generate videos closely
+                linked to the text prompt at the expense of lower video quality.
+            seed (`int`, *optional*):
+                The seed to use for the video generation.
+            target_size (`ImageToVideoTargetSize`, *optional*):
+                The size in pixels of the output video frames.
+
+        Returns:
+            `bytes`: The generated video.
+
+        Examples:
+        ```py
+        # Must be run in an async context
+        >>> from huggingface_hub import AsyncInferenceClient
+        >>> client = AsyncInferenceClient()
+        >>> video = await client.image_to_video("cat.jpg", model="Wan-AI/Wan2.2-I2V-A14B", prompt="turn the cat into a tiger")
+        >>> with open("tiger.mp4", "wb") as f:
+        ...     f.write(video)
+        ```
+        """
+        model_id = model or self.model
+        provider_helper = get_provider_helper(self.provider, task="image-to-video", model=model_id)
+        request_parameters = provider_helper.prepare_request(
+            inputs=image,
+            parameters={
+                "prompt": prompt,
+                "negative_prompt": negative_prompt,
+                "num_frames": num_frames,
+                "num_inference_steps": num_inference_steps,
+                "guidance_scale": guidance_scale,
+                "seed": seed,
+                "target_size": target_size,
+                **kwargs,
+            },
+            headers=self.headers,
+            model=model_id,
+            api_key=self.token,
+        )
+        response = await self._inner_post(request_parameters)
+        response = provider_helper.get_response(response, request_parameters)
+        return response
+
     async def image_to_text(self, image: ContentT, *, model: Optional[str] = None) -> ImageToTextOutput:
         """
         Takes an input image and returns text.
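Outside of a notebook or REPL the async variant needs an event loop; a minimal sketch wrapping the docstring example with `asyncio.run`, using the same placeholder file and model as above:

```py
import asyncio

from huggingface_hub import AsyncInferenceClient


async def main() -> None:
    client = AsyncInferenceClient(provider="fal-ai")
    video = await client.image_to_video(
        "cat.jpg",
        model="Wan-AI/Wan2.2-I2V-A14B",
        prompt="turn the cat into a tiger",
    )
    with open("tiger.mp4", "wb") as f:
        f.write(video)


asyncio.run(main())
```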

src/huggingface_hub/inference/_providers/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -13,6 +13,7 @@
 from .fal_ai import (
     FalAIAutomaticSpeechRecognitionTask,
     FalAIImageToImageTask,
+    FalAIImageToVideoTask,
     FalAITextToImageTask,
     FalAITextToSpeechTask,
     FalAITextToVideoTask,
@@ -79,6 +80,7 @@
         "text-to-image": FalAITextToImageTask(),
         "text-to-speech": FalAITextToSpeechTask(),
         "text-to-video": FalAITextToVideoTask(),
+        "image-to-video": FalAIImageToVideoTask(),
         "image-to-image": FalAIImageToImageTask(),
     },
     "featherless-ai": {

src/huggingface_hub/inference/_providers/fal_ai.py

Lines changed: 31 additions & 0 deletions

@@ -213,3 +213,34 @@ def get_response(
         output = super().get_response(response, request_params)
         url = _as_dict(output)["images"][0]["url"]
         return get_session().get(url).content
+
+
+class FalAIImageToVideoTask(FalAIQueueTask):
+    def __init__(self):
+        super().__init__("image-to-video")
+
+    def _prepare_payload_as_dict(
+        self, inputs: Any, parameters: Dict, provider_mapping_info: InferenceProviderMapping
+    ) -> Optional[Dict]:
+        image_url = _as_url(inputs, default_mime_type="image/jpeg")
+        payload: Dict[str, Any] = {
+            "image_url": image_url,
+            **filter_none(parameters),
+        }
+        if provider_mapping_info.adapter_weights_path is not None:
+            lora_path = constants.HUGGINGFACE_CO_URL_TEMPLATE.format(
+                repo_id=provider_mapping_info.hf_model_id,
+                revision="main",
+                filename=provider_mapping_info.adapter_weights_path,
+            )
+            payload["loras"] = [{"path": lora_path, "scale": 1}]
+        return payload
+
+    def get_response(
+        self,
+        response: Union[bytes, Dict],
+        request_params: Optional[RequestParameters] = None,
+    ) -> Any:
+        output = super().get_response(response, request_params)
+        url = _as_dict(output)["video"]["url"]
+        return get_session().get(url).content
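For reference, the request body that `_prepare_payload_as_dict` builds for fal.ai ends up shaped roughly as below. This is a sketch, not provider documentation: the data-URL form comes from `_as_url` when raw bytes or a local file are passed, and the `loras` entry only appears when the provider mapping carries an `adapter_weights_path` (the resolve URL assumes the usual `HUGGINGFACE_CO_URL_TEMPLATE` layout).

```py
# Illustrative payload shape for an image-to-video request sent to fal.ai
payload = {
    # plain https:// URLs are passed through; bytes/files become a base64 data URL
    "image_url": "data:image/jpeg;base64,<...>",
    # non-None user parameters are merged in via filter_none()
    "prompt": "turn the cat into a tiger",
    # only present for LoRA adapters hosted on the Hub
    "loras": [
        {"path": "https://huggingface.co/<hf_model_id>/resolve/main/<adapter_weights_path>", "scale": 1},
    ],
}
```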

tests/test_inference_providers.py

Lines changed: 74 additions & 0 deletions

@@ -22,6 +22,7 @@
     _POLLING_INTERVAL,
     FalAIAutomaticSpeechRecognitionTask,
     FalAIImageToImageTask,
+    FalAIImageToVideoTask,
     FalAITextToImageTask,
     FalAITextToSpeechTask,
     FalAITextToVideoTask,
@@ -476,6 +477,79 @@ def test_image_to_image_response(self, mocker):
         mock_sleep.assert_called_once_with(_POLLING_INTERVAL)
         assert response == b"image_content"
 
+    def test_image_to_video_payload(self):
+        helper = FalAIImageToVideoTask()
+        mapping_info = InferenceProviderMapping(
+            provider="fal-ai",
+            hf_model_id="Wan-AI/Wan2.2-I2V-A14B",
+            providerId="Wan-AI/Wan2.2-I2V-A14B",
+            task="image-to-video",
+            status="live",
+        )
+        payload = helper._prepare_payload_as_dict(
+            "https://example.com/image.png",
+            {"prompt": "a cat"},
+            mapping_info,
+        )
+        assert payload == {"image_url": "https://example.com/image.png", "prompt": "a cat"}
+
+        payload = helper._prepare_payload_as_dict(
+            b"dummy_image_data",
+            {"prompt": "a dog"},
+            mapping_info,
+        )
+        assert payload == {
+            "image_url": f"data:image/jpeg;base64,{base64.b64encode(b'dummy_image_data').decode()}",
+            "prompt": "a dog",
+        }
+
+    def test_image_to_video_response(self, mocker):
+        helper = FalAIImageToVideoTask()
+        mock_session = mocker.patch("huggingface_hub.inference._providers.fal_ai.get_session")
+        mock_sleep = mocker.patch("huggingface_hub.inference._providers.fal_ai.time.sleep")
+        mock_session.return_value.get.side_effect = [
+            # First call: status
+            mocker.Mock(json=lambda: {"status": "COMPLETED"}, headers={"Content-Type": "application/json"}),
+            # Second call: get result
+            mocker.Mock(json=lambda: {"video": {"url": "video_url"}}, headers={"Content-Type": "application/json"}),
+            # Third call: get video content
+            mocker.Mock(content=b"video_content"),
+        ]
+        api_key = helper._prepare_api_key("hf_token")
+        headers = helper._prepare_headers({}, api_key)
+        url = helper._prepare_url(api_key, "username/repo_name")
+
+        request_params = RequestParameters(
+            url=url,
+            headers=headers,
+            task="image-to-video",
+            model="username/repo_name",
+            data=None,
+            json=None,
+        )
+        response = helper.get_response(
+            b'{"request_id": "test_request_id", "status": "PROCESSING", "response_url": "https://queue.fal.run/username_provider/repo_name_provider/requests/test_request_id", "status_url": "https://queue.fal.run/username_provider/repo_name_provider/requests/test_request_id/status"}',
+            request_params,
+        )
+
+        # Verify the correct URLs were called
+        assert mock_session.return_value.get.call_count == 3
+        mock_session.return_value.get.assert_has_calls(
+            [
+                mocker.call(
+                    "https://router.huggingface.co/fal-ai/username_provider/repo_name_provider/requests/test_request_id/status?_subdomain=queue",
+                    headers=request_params.headers,
+                ),
+                mocker.call(
+                    "https://router.huggingface.co/fal-ai/username_provider/repo_name_provider/requests/test_request_id?_subdomain=queue",
+                    headers=request_params.headers,
+                ),
+                mocker.call("video_url"),
+            ]
+        )
+        mock_sleep.assert_called_once_with(_POLLING_INTERVAL)
+        assert response == b"video_content"
+
 
 class TestFeatherlessAIProvider:
     def test_prepare_route_chat_completionurl(self):
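To run just the new provider tests locally, one option is the snippet below; it assumes `pytest` and `pytest-mock` are installed, since the tests rely on the `mocker` fixture.

```py
import pytest

# Select only the new fal.ai image-to-video tests by keyword.
pytest.main(["tests/test_inference_providers.py", "-k", "image_to_video", "-v"])
```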
