Skip to content

Commit bad4eb6

Browse files
committed
improve data.ts
1 parent 4ef5171 commit bad4eb6

File tree

1 file changed

+124
-12
lines changed
  • packages/tasks/src/tasks/image-to-video

1 file changed

+124
-12
lines changed

packages/tasks/src/tasks/image-to-video/data.ts

Lines changed: 124 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,53 +6,165 @@ const taskData: TaskDataCustom = {
66
description: "A dataset of images and short video clips for image-to-video generation research.",
77
id: "some/image-to-video-dataset",
88
},
9+
{
10+
description: "A benchmark dataset for reference-based video generation.",
11+
id: "ali-vilab/VACE-Benchmark",
12+
},
13+
{
14+
description: "A dataset for scoring video generation styles.",
15+
id: "Rapidata/sora-video-generation-style-likert-scoring",
16+
},
17+
{
18+
description: "A dataset of captioned videos.",
19+
id: "BestWishYsh/ChronoMagic",
20+
},
921
],
1022
demo: {
1123
inputs: [
1224
{
13-
filename: "image-to-video-input.jpeg",
25+
filename: "image-to-video-input.jpg",
1426
type: "img",
1527
},
1628
{
1729
label: "Optional prompt",
18-
content: "A dog running in a field of flowers, cinematic lighting",
30+
content: "This penguin is dancing",
1931
type: "text",
2032
},
2133
],
2234
outputs: [
2335
{
2436
filename: "image-to-video-output.gif",
25-
type: "img", // Representing video as gif for demo
37+
type: "img",
2638
},
2739
],
2840
},
2941
metrics: [
3042
{
3143
description:
32-
"Frechet Video Distance (FVD) is a common metric for evaluating the quality of generated videos, comparing them to real videos.",
44+
"Fréchet Video Distance (FVD) measures the perceptual similarity between the distributions of generated videos and a set of real videos, assessing overall visual quality and temporal coherence of the video generated from an input image.",
3345
id: "fvd",
3446
},
3547
{
3648
description:
37-
"Inception Score (IS) can be adapted for videos to measure the diversity and quality of generated frames.",
38-
id: "is_video",
49+
"CLIP Score measures the semantic similarity between a textual prompt (if provided alongside the input image) and the generated video frames. It evaluates how well the video's generated content and motion align with the textual description, conditioned on the initial image.",
50+
id: "clip_score",
51+
},
52+
{
53+
description:
54+
"First Frame Fidelity, often measured using LPIPS (Learned Perceptual Image Patch Similarity), PSNR, or SSIM, quantifies how closely the first frame of the generated video matches the input conditioning image.",
55+
id: "lpips",
56+
},
57+
{
58+
description:
59+
"Identity Preservation Score measures the consistency of identity (e.g., a person's face or a specific object's characteristics) between the input image and throughout the generated video frames, often calculated using features from specialized models like face recognition (e.g., ArcFace) or re-identification models.",
60+
id: "identity_preservation",
61+
},
62+
{
63+
description:
64+
"Motion Score evaluates the quality, realism, and temporal consistency of motion in the video generated from a static image. This can be based on optical flow analysis (e.g., smoothness, magnitude), consistency of object trajectories, or specific motion plausibility assessments.",
65+
id: "motion_score",
3966
},
4067
],
4168
models: [
4269
{
43-
description: "A generic model for image-to-video generation.",
44-
id: "generic/image-to-video-model",
70+
description: "LTX-Video, a 13B parameter model for high-quality video generation.",
71+
id: "Lightricks/LTX-Video-0.9.7-dev",
72+
},
73+
{
74+
description: "A 1.3B parameter model for reference-based video generation",
75+
id: "Wan-AI/Wan2.1-VACE-1.3B",
76+
},
77+
{
78+
description: "An image-to-video generation model using FramePack methodology with Hunyuan-DiT architecture.",
79+
id: "lllyasviel/FramePackI2V_HY",
80+
},
81+
{
82+
description: "An image-to-video generation model using FramePack F1 methodology with Hunyuan-DiT architecture",
83+
id: "lllyasviel/FramePack_F1_I2V_HY_20250503",
84+
},
85+
{
86+
description: "A distilled version of the LTX-Video-0.9.7-dev model for faster inference",
87+
id: "Lightricks/LTX-Video-0.9.7-distilled",
88+
},
89+
{
90+
description: "An image-to-video generation model by Skywork AI, 1.3B parameters, producing 540p videos.",
91+
id: "Skywork/SkyReels-V2-I2V-1.3B-540P",
92+
},
93+
{
94+
description: "An image-to-video generation model by Skywork AI, 14B parameters, producing 720p videos.",
95+
id: "Skywork/SkyReels-V2-I2V-14B-720P",
96+
},
97+
{
98+
description: "An image-to-video generation model by Skywork AI, 14B parameters, producing 540p videos.",
99+
id: "Skywork/SkyReels-V2-I2V-14B-540P",
100+
},
101+
{
102+
description: "Diffusers version of Hunyuan-DiT for image-to-video generation.",
103+
id: "hunyuanvideo-community/HunyuanVideo-I2V",
104+
},
105+
{
106+
description: "Tencent's Hunyuan-DiT model for image-to-video generation.",
107+
id: "tencent/HunyuanVideo-I2V",
108+
},
109+
{
110+
description: "A 14B parameter model for 480p image-to-video generation by Wan-AI.",
111+
id: "Wan-AI/Wan2.1-I2V-14B-480P",
112+
},
113+
{
114+
description: "A 14B parameter model for 720p image-to-video generation by Wan-AI.",
115+
id: "Wan-AI/Wan2.1-I2V-14B-720P",
116+
},
117+
{
118+
description: "A Diffusers version of the Wan2.1-I2V-14B-720P model for 720p image-to-video generation.",
119+
id: "Wan-AI/Wan2.1-I2V-14B-720P-Diffusers",
120+
},
121+
{
122+
description:
123+
"A 14B parameter model for First-Last-Frame-to-Video (FLF2V) generation by Wan-AI, producing 720p videos (Diffusers version).",
124+
id: "Wan-AI/Wan2.1-FLF2V-14B-720P-diffusers",
125+
},
126+
{
127+
description: "A Diffusers version of the Wan2.1-I2V-14B-480P model for 480p image-to-video generation.",
128+
id: "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers",
129+
},
130+
{
131+
description: "A video generation model based on LTX-Video-0.9, evaluated on the VACE benchmark.",
132+
id: "ali-vilab/VACE-LTX-Video-0.9",
133+
},
134+
{
135+
description: "An image-to-video model by Stability AI for generating short videos from images.",
136+
id: "stabilityai/stable-video-diffusion-img2vid",
137+
},
138+
{
139+
description: "A 5 billion parameter model for image-to-video generation by THUDM.",
140+
id: "THUDM/CogVideoX-5b-I2V",
45141
},
46142
],
47143
spaces: [
48144
{
49-
description: "An application that generates video from an image.",
50-
id: "user/image-to-video-space",
145+
description: "Generate videos fast with the LTX-Video distilled model.",
146+
id: "Lightricks/ltx-video-distilled",
147+
},
148+
{
149+
description: "Generate videos with FramePack-F1.",
150+
id: "linoyts/FramePack-F1",
151+
},
152+
{
153+
description: "Generate videos with FramePack.",
154+
id: "lisonallen/framepack-i2v",
155+
},
156+
{
157+
description: "Wan2.1 with CausVid LoRA",
158+
id: "multimodalart/wan2-1-fast",
159+
},
160+
{
161+
description: "A demo for Stable Video Diffusion",
162+
id: "multimodalart/stable-video-diffusion",
51163
},
52164
],
53165
summary:
54-
"Image-to-video models take a still image as input and generate a video sequence. These models can be guided by text prompts to influence the content and style of the output video.",
55-
widgetModels: ["generic/image-to-video-model-widget"],
166+
"Image-to-video models take a still image as input and generate a video. These models can be guided by text prompts to influence the content and style of the output video.",
167+
widgetModels: [""],
56168
youtubeId: undefined,
57169
};
58170

0 commit comments

Comments
 (0)