Skip to content

Commit 47d5221

Browse files
committed
add image-text-to-image and image-text-to-video tasks
1 parent c02dd89 commit 47d5221

File tree

19 files changed

+648
-0
lines changed

19 files changed

+648
-0
lines changed

packages/inference/src/lib/getProviderHelper.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ import type {
2828
ImageToImageTaskHelper,
2929
ImageToTextTaskHelper,
3030
ImageToVideoTaskHelper,
31+
ImageTextToImageTaskHelper,
32+
ImageTextToVideoTaskHelper,
3133
ObjectDetectionTaskHelper,
3234
QuestionAnsweringTaskHelper,
3335
SentenceSimilarityTaskHelper,
@@ -276,6 +278,14 @@ export function getProviderHelper(
276278
provider: InferenceProviderOrPolicy,
277279
task: "image-to-video"
278280
): ImageToVideoTaskHelper & TaskProviderHelper;
281+
export function getProviderHelper(
282+
provider: InferenceProviderOrPolicy,
283+
task: "image-text-to-image"
284+
): ImageTextToImageTaskHelper & TaskProviderHelper;
285+
export function getProviderHelper(
286+
provider: InferenceProviderOrPolicy,
287+
task: "image-text-to-video"
288+
): ImageTextToVideoTaskHelper & TaskProviderHelper;
279289
export function getProviderHelper(
280290
provider: InferenceProviderOrPolicy,
281291
task: "sentence-similarity"

packages/inference/src/providers/providerHelper.ts

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ import type {
1919
ImageToTextInput,
2020
ImageToTextOutput,
2121
ImageToVideoInput,
22+
ImageTextToImageInput,
23+
ImageTextToVideoInput,
2224
ObjectDetectionInput,
2325
ObjectDetectionOutput,
2426
QuestionAnsweringInput,
@@ -54,6 +56,8 @@ import { toArray } from "../utils/toArray.js";
5456
import type { ImageToImageArgs } from "../tasks/cv/imageToImage.js";
5557
import type { AutomaticSpeechRecognitionArgs } from "../tasks/audio/automaticSpeechRecognition.js";
5658
import type { ImageToVideoArgs } from "../tasks/cv/imageToVideo.js";
59+
import type { ImageTextToImageArgs } from "../tasks/cv/imageTextToImage.js";
60+
import type { ImageTextToVideoArgs } from "../tasks/cv/imageTextToVideo.js";
5761
import type { ImageSegmentationArgs } from "../tasks/cv/imageSegmentation.js";
5862

5963
/**
@@ -159,6 +163,18 @@ export interface ImageToVideoTaskHelper {
159163
preparePayloadAsync(args: ImageToVideoArgs): Promise<RequestArgs>;
160164
}
161165

166+
export interface ImageTextToImageTaskHelper {
167+
getResponse(response: unknown, url?: string, headers?: HeadersInit): Promise<Blob>;
168+
preparePayload(params: BodyParams<ImageTextToImageInput & BaseArgs>): Record<string, unknown>;
169+
preparePayloadAsync(args: ImageTextToImageArgs): Promise<RequestArgs>;
170+
}
171+
172+
export interface ImageTextToVideoTaskHelper {
173+
getResponse(response: unknown, url?: string, headers?: HeadersInit): Promise<Blob>;
174+
preparePayload(params: BodyParams<ImageTextToVideoInput & BaseArgs>): Record<string, unknown>;
175+
preparePayloadAsync(args: ImageTextToVideoArgs): Promise<RequestArgs>;
176+
}
177+
162178
export interface ImageSegmentationTaskHelper {
163179
getResponse(response: unknown, url?: string, headers?: HeadersInit): Promise<ImageSegmentationOutput>;
164180
preparePayload(params: BodyParams<ImageSegmentationInput & BaseArgs>): Record<string, unknown> | BodyInit;

packages/inference/src/snippets/getInferenceSnippets.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,10 @@ const HF_PYTHON_METHODS: Partial<Record<WidgetType, string>> = {
9696
"image-classification": "image_classification",
9797
"image-segmentation": "image_segmentation",
9898
"image-to-image": "image_to_image",
99+
"image-to-video": "image_to_video",
99100
"image-to-text": "image_to_text",
101+
"image-text-to-image": "image_text_to_image",
102+
"image-text-to-video": "image_text_to_video",
100103
"object-detection": "object_detection",
101104
"question-answering": "question_answering",
102105
"sentence-similarity": "sentence_similarity",
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import type { ImageTextToImageInput } from "@huggingface/tasks";
2+
import { resolveProvider } from "../../lib/getInferenceProviderMapping.js";
3+
import { getProviderHelper } from "../../lib/getProviderHelper.js";
4+
import type { BaseArgs, Options } from "../../types.js";
5+
import { innerRequest } from "../../utils/request.js";
6+
7+
export type ImageTextToImageArgs = BaseArgs & ImageTextToImageInput;
8+
9+
/**
10+
* This task takes an image and text input and outputs a new generated image.
11+
* Recommended model: black-forest-labs/FLUX.2-dev
12+
*/
13+
export async function imageTextToImage(args: ImageTextToImageArgs, options?: Options): Promise<Blob> {
14+
const provider = await resolveProvider(args.provider, args.model, args.endpointUrl);
15+
const providerHelper = getProviderHelper(provider, "image-text-to-image");
16+
const payload = await providerHelper.preparePayloadAsync(args);
17+
const { data: res, requestContext } = await innerRequest<Blob>(payload, providerHelper, {
18+
...options,
19+
task: "image-text-to-image",
20+
});
21+
return providerHelper.getResponse(res, requestContext.url, requestContext.info.headers as Record<string, string>);
22+
}
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import type { ImageTextToVideoInput } from "@huggingface/tasks";
2+
import { resolveProvider } from "../../lib/getInferenceProviderMapping.js";
3+
import { getProviderHelper } from "../../lib/getProviderHelper.js";
4+
import type { BaseArgs, Options } from "../../types.js";
5+
import { innerRequest } from "../../utils/request.js";
6+
7+
export type ImageTextToVideoArgs = BaseArgs & ImageTextToVideoInput;
8+
9+
/**
10+
* This task takes an image and text input and outputs a generated video.
11+
* Recommended model: Lightricks/LTX-Video
12+
*/
13+
export async function imageTextToVideo(args: ImageTextToVideoArgs, options?: Options): Promise<Blob> {
14+
const provider = await resolveProvider(args.provider, args.model, args.endpointUrl);
15+
const providerHelper = getProviderHelper(provider, "image-text-to-video");
16+
const payload = await providerHelper.preparePayloadAsync(args);
17+
const { data: res, requestContext } = await innerRequest<Blob>(payload, providerHelper, {
18+
...options,
19+
task: "image-text-to-video",
20+
});
21+
return providerHelper.getResponse(res, requestContext.url, requestContext.info.headers as Record<string, string>);
22+
}

packages/inference/src/tasks/index.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ export * from "./cv/imageSegmentation.js";
1414
export * from "./cv/imageToImage.js";
1515
export * from "./cv/imageToText.js";
1616
export * from "./cv/imageToVideo.js";
17+
export * from "./cv/imageTextToImage.js";
18+
export * from "./cv/imageTextToVideo.js";
1719
export * from "./cv/objectDetection.js";
1820
export * from "./cv/textToImage.js";
1921
export * from "./cv/textToVideo.js";

packages/tasks/src/pipelines.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -557,6 +557,14 @@ export const PIPELINE_DATA = {
557557
name: "Image-Text-to-Text",
558558
modality: "multimodal",
559559
},
560+
"image-text-to-image": {
561+
name: "Image-Text-to-Image",
562+
modality: "multimodal",
563+
},
564+
"image-text-to-video": {
565+
name: "Image-Text-to-Video",
566+
modality: "multimodal",
567+
},
560568
"visual-question-answering": {
561569
name: "Visual Question Answering",
562570
subtasks: [

packages/tasks/src/snippets/inputs.ts

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,16 @@ const inputsImageToVideo = () => `{
9494
"prompt": "The cat starts to dance"
9595
}`;
9696

97+
const inputsImageTextToImage = () => `{
98+
"image": "cat.png",
99+
"prompt": "Turn the cat into a tiger."
100+
}`;
101+
102+
const inputsImageTextToVideo = () => `{
103+
"image": "cat.png",
104+
"prompt": "The cat starts to dance"
105+
}`;
106+
97107
const inputsImageSegmentation = () => `"cats.jpg"`;
98108

99109
const inputsObjectDetection = () => `"cats.jpg"`;
@@ -130,6 +140,8 @@ const modelInputSnippets: {
130140
"image-to-text": inputsImageToText,
131141
"image-to-image": inputsImageToImage,
132142
"image-to-video": inputsImageToVideo,
143+
"image-text-to-image": inputsImageTextToImage,
144+
"image-text-to-video": inputsImageTextToVideo,
133145
"image-segmentation": inputsImageSegmentation,
134146
"object-detection": inputsObjectDetection,
135147
"question-answering": inputsQuestionAnswering,
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
## Use Cases

### Instruction-based Image Editing

Image-text-to-image models can be used to edit images based on natural language instructions. For example, you can provide an image of a summer landscape and the instruction "Make it winter, add snow" to generate a winter version of the same scene.

### Style Transfer

These models can apply artistic styles or transformations to images based on text descriptions. For instance, you can transform a photo into a painting style by providing prompts like "Make it look like a Van Gogh painting" or "Convert to watercolor style."

### Image Variations

Generate variations of an existing image by providing different text prompts. This is useful for creative workflows where you want to explore different versions of the same image with specific modifications.

### Guided Image Generation

Use a reference image along with text prompts to guide the generation process. This allows for more controlled image generation compared to text-to-image models alone, as the reference image provides structural guidance.

### Image Inpainting and Outpainting

Fill in missing or masked parts of an image based on text descriptions, or extend an image beyond its original boundaries with text-guided generation.

## Task Variants

### Instruction-based Editing

Models that follow natural language instructions to edit images, which can perform complex edits like object removal, color changes, and compositional modifications.

### Reference-guided Generation

Models that use a reference image to guide the generation process while incorporating text prompts to control specific attributes or modifications.

### Conditional Image-to-Image

Models that perform specific transformations based on text conditions, such as changing weather conditions, time of day, or seasonal variations.

## Inference

You can use the Diffusers library to interact with image-text-to-image models.

```python
from diffusers import FluxControlPipeline
from PIL import Image
import torch

# Load the model
pipe = FluxControlPipeline.from_pretrained(
    "black-forest-labs/FLUX.2-dev",
    torch_dtype=torch.bfloat16
).to("cuda")

# Load input image
image = Image.open("input.jpg").convert("RGB")

# Edit the image with a text prompt
prompt = "Make it a snowy winter scene"
edited_image = pipe(prompt=prompt, image=image).images[0]
edited_image.save("edited_image.png")
```

## Useful Resources

- [FLUX.2 Model Card](https://huggingface.co/black-forest-labs/FLUX.2-dev)
- [Diffusers documentation on Image-to-Image](https://huggingface.co/docs/diffusers/using-diffusers/img2img)
- [ControlNet for Conditional Image Generation](https://huggingface.co/docs/diffusers/using-diffusers/controlnet)
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import type { TaskDataCustom } from "../index.js";
2+
3+
const taskData: TaskDataCustom = {
4+
datasets: [],
5+
demo: {
6+
inputs: [
7+
{
8+
filename: "image-text-to-image-input.png",
9+
type: "img",
10+
},
11+
{
12+
label: "Text Prompt",
13+
content: "Make it winter, add snow",
14+
type: "text",
15+
},
16+
],
17+
outputs: [
18+
{
19+
filename: "image-text-to-image-output.png",
20+
type: "img",
21+
},
22+
],
23+
},
24+
metrics: [
25+
{
26+
description:
27+
"The Fréchet Inception Distance (FID) calculates the distance between distributions between synthetic and real samples. A lower FID score indicates better similarity between the distributions of real and generated images.",
28+
id: "FID",
29+
},
30+
{
31+
description:
32+
"CLIP Score measures the similarity between the generated image and the text prompt using CLIP embeddings. A higher score indicates better alignment with the text prompt.",
33+
id: "CLIP",
34+
},
35+
],
36+
models: [
37+
{
38+
description: "A powerful model for image-text-to-image generation.",
39+
id: "black-forest-labs/FLUX.2-dev",
40+
},
41+
],
42+
spaces: [
43+
{
44+
description: "An application for image-text-to-image generation.",
45+
id: "black-forest-labs/FLUX.2-dev",
46+
},
47+
],
48+
summary:
49+
"Image-text-to-image models take an image and a text prompt as input and generate a new image based on the reference image and text instructions. These models are useful for image editing, style transfer, image variations, and guided image generation tasks.",
50+
widgetModels: ["black-forest-labs/FLUX.2-dev"],
51+
youtubeId: undefined,
52+
};
53+
54+
export default taskData;

0 commit comments

Comments
 (0)