|
66 | 66 | ImageSegmentationSubtask, |
67 | 67 | ImageToImageTargetSize, |
68 | 68 | ImageToTextOutput, |
| 69 | + ImageToVideoTargetSize, |
69 | 70 | ObjectDetectionOutputElement, |
70 | 71 | Padding, |
71 | 72 | QuestionAnsweringOutputElement, |
@@ -1385,6 +1386,86 @@ async def image_to_image( |
1385 | 1386 | response = provider_helper.get_response(response, request_parameters) |
1386 | 1387 | return _bytes_to_image(response) |
1387 | 1388 |
|
| 1389 | + async def image_to_video( |
| 1390 | + self, |
| 1391 | + image: ContentT, |
| 1392 | + *, |
| 1393 | + model: Optional[str] = None, |
| 1394 | + prompt: Optional[str] = None, |
| 1395 | + negative_prompt: Optional[str] = None, |
| 1396 | + num_frames: Optional[float] = None, |
| 1397 | + num_inference_steps: Optional[int] = None, |
| 1398 | + guidance_scale: Optional[float] = None, |
| 1399 | + seed: Optional[int] = None, |
| 1400 | + target_size: Optional[ImageToVideoTargetSize] = None, |
| 1401 | + **kwargs, |
| 1402 | + ) -> bytes: |
| 1403 | + """ |
| 1404 | + Generate a video from an input image. |
| 1405 | +
|
| 1406 | + Args: |
| 1407 | + image (`Union[str, Path, bytes, BinaryIO, PIL.Image.Image]`): |
| 1408 | + The input image to generate a video from. It can be raw bytes, an image file, a URL to an online image, or a PIL Image. |
| 1409 | + model (`str`, *optional*): |
| 1410 | + The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed |
| 1411 | + Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None. |
| 1412 | + prompt (`str`, *optional*): |
| 1413 | + The text prompt to guide the video generation. |
| 1414 | + negative_prompt (`str`, *optional*): |
| 1415 | + One prompt to guide what NOT to include in video generation. |
| 1416 | + num_frames (`float`, *optional*): |
| 1417 |  +                The number of video frames to generate. |
| 1418 | + num_inference_steps (`int`, *optional*): |
| 1419 | + For diffusion models. The number of denoising steps. More denoising steps usually lead to a higher |
| 1420 |  +                quality video at the expense of slower inference. |
| 1421 | + guidance_scale (`float`, *optional*): |
| 1422 | + For diffusion models. A higher guidance scale value encourages the model to generate videos closely |
| 1423 |  +                linked to the text prompt at the expense of lower video quality. |
| 1424 | + seed (`int`, *optional*): |
| 1425 | + The seed to use for the video generation. |
| 1426 | + target_size (`ImageToVideoTargetSize`, *optional*): |
| 1427 |  +                The size in pixels of the output video frames. |
| 1433 | +
|
| 1434 | + Returns: |
| 1435 | + `bytes`: The generated video. |
| 1436 | +
|
| 1437 | + Examples: |
| 1438 | + ```py |
| 1439 | + # Must be run in an async context |
| 1440 | + >>> from huggingface_hub import AsyncInferenceClient |
| 1441 | + >>> client = AsyncInferenceClient() |
| 1442 | + >>> video = await client.image_to_video("cat.jpg", model="Wan-AI/Wan2.2-I2V-A14B", prompt="turn the cat into a tiger") |
| 1443 | + >>> with open("tiger.mp4", "wb") as f: |
| 1444 | + ... f.write(video) |
| 1445 | + ``` |
| 1446 | + """ |
| 1447 | + model_id = model or self.model |
| 1448 | + provider_helper = get_provider_helper(self.provider, task="image-to-video", model=model_id) |
| 1449 | + request_parameters = provider_helper.prepare_request( |
| 1450 | + inputs=image, |
| 1451 | + parameters={ |
| 1452 | + "prompt": prompt, |
| 1453 | + "negative_prompt": negative_prompt, |
| 1454 | + "num_frames": num_frames, |
| 1455 | + "num_inference_steps": num_inference_steps, |
| 1456 | + "guidance_scale": guidance_scale, |
| 1457 | + "seed": seed, |
| 1458 | + "target_size": target_size, |
| 1459 | + **kwargs, |
| 1460 | + }, |
| 1461 | + headers=self.headers, |
| 1462 | + model=model_id, |
| 1463 | + api_key=self.token, |
| 1464 | + ) |
| 1465 | + response = await self._inner_post(request_parameters) |
| 1466 | + response = provider_helper.get_response(response, request_parameters) |
| 1467 | + return response |
| 1468 | + |
1388 | 1469 | async def image_to_text(self, image: ContentT, *, model: Optional[str] = None) -> ImageToTextOutput: |
1389 | 1470 | """ |
1390 | 1471 |          Takes an input image and returns text. |
|
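Beyond the docstring snippet, a fuller usage sketch of the new `image_to_video` method is below, exercising the optional generation parameters added in this diff. It assumes `ImageToVideoTargetSize` is importable from `huggingface_hub` and accepts `width`/`height` fields (by analogy with `ImageToImageTargetSize`); the frame count, step count, and size values are illustrative rather than recommended settings for `Wan-AI/Wan2.2-I2V-A14B`.

```py
import asyncio

from huggingface_hub import AsyncInferenceClient

# Assumption: ImageToVideoTargetSize is exported at the top level and takes
# width/height, mirroring ImageToImageTargetSize; the diff only shows the import.
from huggingface_hub import ImageToVideoTargetSize


async def main() -> None:
    client = AsyncInferenceClient()
    video = await client.image_to_video(
        "cat.jpg",
        model="Wan-AI/Wan2.2-I2V-A14B",  # model ID taken from the docstring example
        prompt="turn the cat into a tiger",
        negative_prompt="blurry, low quality",
        num_frames=81,  # illustrative value; valid frame counts depend on the model
        num_inference_steps=30,
        guidance_scale=5.0,
        seed=42,  # fixed seed for reproducible generations
        target_size=ImageToVideoTargetSize(width=832, height=480),  # assumed constructor
    )
    # The method returns raw video bytes, so write them straight to disk.
    with open("tiger.mp4", "wb") as f:
        f.write(video)


asyncio.run(main())
```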