You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
content: "A dog running in a field of flowers, cinematic lighting",
30
+
content: "This penguin is dancing",
19
31
type: "text",
20
32
},
21
33
],
22
34
outputs: [
23
35
{
24
36
filename: "image-to-video-output.gif",
25
-
type: "img",// Representing video as gif for demo
37
+
type: "img",
26
38
},
27
39
],
28
40
},
29
41
metrics: [
30
42
{
31
43
description:
32
-
"Frechet Video Distance (FVD) is a common metric for evaluating the quality of generated videos, comparing them to real videos.",
44
+
"Fréchet Video Distance (FVD) measures the perceptual similarity between the distributions of generated videos and a set of real videos, assessing overall visual quality and temporal coherence of the video generated from an input image.",
33
45
id: "fvd",
34
46
},
35
47
{
36
48
description:
37
-
"Inception Score (IS) can be adapted for videos to measure the diversity and quality of generated frames.",
38
-
id: "is_video",
49
+
"CLIP Score measures the semantic similarity between a textual prompt (if provided alongside the input image) and the generated video frames. It evaluates how well the video's generated content and motion align with the textual description, conditioned on the initial image.",
50
+
id: "clip_score",
51
+
},
52
+
{
53
+
description:
54
+
"First Frame Fidelity, often measured using LPIPS (Learned Perceptual Image Patch Similarity), PSNR, or SSIM, quantifies how closely the first frame of the generated video matches the input conditioning image.",
55
+
id: "lpips",
56
+
},
57
+
{
58
+
description:
59
+
"Identity Preservation Score measures the consistency of identity (e.g., a person's face or a specific object's characteristics) between the input image and throughout the generated video frames, often calculated using features from specialized models like face recognition (e.g., ArcFace) or re-identification models.",
60
+
id: "identity_preservation",
61
+
},
62
+
{
63
+
description:
64
+
"Motion Score evaluates the quality, realism, and temporal consistency of motion in the video generated from a static image. This can be based on optical flow analysis (e.g., smoothness, magnitude), consistency of object trajectories, or specific motion plausibility assessments.",
65
+
id: "motion_score",
39
66
},
40
67
],
41
68
models: [
42
69
{
43
-
description: "A generic model for image-to-video generation.",
44
-
id: "generic/image-to-video-model",
70
+
description: "LTX-Video, a 13B parameter model for high quality video generation",
71
+
id: "Lightricks/LTX-Video-0.9.7-dev",
72
+
},
73
+
{
74
+
description: "A 1.3B parameter model for reference-based video generation",
75
+
id: "Wan-AI/Wan2.1-VACE-1.3B",
76
+
},
77
+
{
78
+
description: "An image-to-video generation model using FramePack methodology with HunyuanVideo architecture.",
79
+
id: "lllyasviel/FramePackI2V_HY",
80
+
},
81
+
{
82
+
description: "An image-to-video generation model using FramePack F1 methodology with HunyuanVideo architecture",
83
+
id: "lllyasviel/FramePack_F1_I2V_HY_20250503",
84
+
},
85
+
{
86
+
description: "A distilled version of the LTX-Video-0.9.7-dev model for faster inference",
87
+
id: "Lightricks/LTX-Video-0.9.7-distilled",
88
+
},
89
+
{
90
+
description: "An image-to-video generation model by Skywork AI, 1.3B parameters, producing 540p videos.",
91
+
id: "Skywork/SkyReels-V2-I2V-1.3B-540P",
92
+
},
93
+
{
94
+
description: "An image-to-video generation model by Skywork AI, 14B parameters, producing 720p videos.",
95
+
id: "Skywork/SkyReels-V2-I2V-14B-720P",
96
+
},
97
+
{
98
+
description: "An image-to-video generation model by Skywork AI, 14B parameters, producing 540p videos.",
99
+
id: "Skywork/SkyReels-V2-I2V-14B-540P",
100
+
},
101
+
{
102
+
description: "Diffusers version of HunyuanVideo-I2V for image-to-video generation.",
103
+
id: "hunyuanvideo-community/HunyuanVideo-I2V",
104
+
},
105
+
{
106
+
description: "Tencent's HunyuanVideo-I2V model for image-to-video generation.",
107
+
id: "tencent/HunyuanVideo-I2V",
108
+
},
109
+
{
110
+
description: "A 14B parameter model for 480p image-to-video generation by Wan-AI.",
111
+
id: "Wan-AI/Wan2.1-I2V-14B-480P",
112
+
},
113
+
{
114
+
description: "A 14B parameter model for 720p image-to-video generation by Wan-AI.",
115
+
id: "Wan-AI/Wan2.1-I2V-14B-720P",
116
+
},
117
+
{
118
+
description: "A Diffusers version of the Wan2.1-I2V-14B-720P model for 720p image-to-video generation.",
119
+
id: "Wan-AI/Wan2.1-I2V-14B-720P-Diffusers",
120
+
},
121
+
{
122
+
description:
123
+
"A 14B parameter model for first-last-frame to video (FLF2V) generation by Wan-AI, producing 720p videos (Diffusers version).",
124
+
id: "Wan-AI/Wan2.1-FLF2V-14B-720P-diffusers",
125
+
},
126
+
{
127
+
description: "A Diffusers version of the Wan2.1-I2V-14B-480P model for 480p image-to-video generation.",
128
+
id: "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers",
129
+
},
130
+
{
131
+
description: "A video generation model based on LTX-Video-0.9, evaluated on the VACE benchmark.",
132
+
id: "ali-vilab/VACE-LTX-Video-0.9",
133
+
},
134
+
{
135
+
description: "An image-to-video model by Stability AI for generating short videos from images.",
136
+
id: "stabilityai/stable-video-diffusion-img2vid",
137
+
},
138
+
{
139
+
description: "A 5 billion parameter model for image-to-video generation by THUDM.",
140
+
id: "THUDM/CogVideoX-5b-I2V",
45
141
},
46
142
],
47
143
spaces: [
48
144
{
49
-
description: "An application that generates video from an image.",
50
-
id: "user/image-to-video-space",
145
+
description: "Generate videos fast with the LTX-Video distilled model.",
146
+
id: "Lightricks/ltx-video-distilled",
147
+
},
148
+
{
149
+
description: "Generate videos with FramePack-F1",
150
+
id: "linoyts/FramePack-F1",
151
+
},
152
+
{
153
+
description: "Generate videos with FramePack",
154
+
id: "lisonallen/framepack-i2v",
155
+
},
156
+
{
157
+
description: "Wan2.1 with CausVid LoRA",
158
+
id: "multimodalart/wan2-1-fast",
159
+
},
160
+
{
161
+
description: "A demo for Stable Video Diffusion",
162
+
id: "multimodalart/stable-video-diffusion",
51
163
},
52
164
],
53
165
summary:
54
-
"Image-to-video models take a still image as input and generate a video sequence. These models can be guided by text prompts to influence the content and style of the output video.",
"Image-to-video models take a still image as input and generate a video. These models can be guided by text prompts to influence the content and style of the output video.",
0 commit comments