import type { TaskDataCustom } from "../index.js";

const taskData: TaskDataCustom = {
	datasets: [
		{
			description: "A benchmark dataset for reference-image-controlled video generation.",
			id: "ali-vilab/VACE-Benchmark",
		},
		{
			description: "A dataset of style preferences for video generation.",
			id: "Rapidata/sora-video-generation-style-likert-scoring",
		},
		{
			description: "A dataset of videos with captions throughout each video.",
			id: "BestWishYsh/ChronoMagic",
		},
	],
	demo: {
		inputs: [
			{
				filename: "image-to-video-input.jpg",
				type: "img",
			},
			{
				label: "Optional Text Prompt",
				content: "This penguin is dancing",
				type: "text",
			},
		],
		outputs: [
			{
				filename: "image-to-video-output.gif",
				type: "img",
			},
		],
	},
	metrics: [
		{
			description:
				"Fréchet Video Distance (FVD) measures the perceptual similarity between the distributions of generated videos and a set of real videos, assessing overall visual quality and temporal coherence of the video generated from an input image.",
			id: "fvd",
		},
		{
			description:
				"CLIP Score measures the semantic similarity between a textual prompt (if provided alongside the input image) and the generated video frames. It evaluates how well the video's generated content and motion align with the textual description, conditioned on the initial image.",
			id: "clip_score",
		},
		{
			description:
				"First Frame Fidelity, often measured using LPIPS (Learned Perceptual Image Patch Similarity), PSNR, or SSIM, quantifies how closely the first frame of the generated video matches the input conditioning image.",
			id: "lpips",
		},
		{
			description:
				"Identity Preservation Score measures how consistently an identity (e.g., a person's face or a specific object's characteristics) is maintained from the input image throughout the generated video frames, often calculated using features from specialized models such as face recognition (e.g., ArcFace) or re-identification models.",
			id: "identity_preservation",
		},
		{
			description:
				"Motion Score evaluates the quality, realism, and temporal consistency of motion in the video generated from a static image. This can be based on optical flow analysis (e.g., smoothness, magnitude), consistency of object trajectories, or specific motion plausibility assessments.",
			id: "motion_score",
		},
	],
	models: [
		{
			description: "LTX-Video, a 13B-parameter model for high-quality video generation.",
			id: "Lightricks/LTX-Video-0.9.7-dev",
		},
		{
			description: "A 14B-parameter model for reference-image-controlled video generation.",
			id: "Wan-AI/Wan2.1-VACE-14B",
		},
		{
			description: "An image-to-video generation model using the FramePack F1 methodology on top of HunyuanVideo.",
			id: "lllyasviel/FramePack_F1_I2V_HY_20250503",
		},
		{
			description: "A distilled version of the LTX-Video-0.9.7-dev model for faster inference.",
			id: "Lightricks/LTX-Video-0.9.7-distilled",
		},
		{
			description: "An image-to-video generation model by Skywork AI, 14B parameters, producing 720p videos.",
			id: "Skywork/SkyReels-V2-I2V-14B-720P",
		},
		{
			description: "Image-to-video variant of Tencent's HunyuanVideo.",
			id: "tencent/HunyuanVideo-I2V",
		},
		{
			description: "A 14B-parameter model for 720p image-to-video generation by Wan-AI.",
			id: "Wan-AI/Wan2.1-I2V-14B-720P",
		},
		{
			description: "A Diffusers version of the Wan2.1-I2V-14B-720P model for 720p image-to-video generation.",
			id: "Wan-AI/Wan2.1-I2V-14B-720P-Diffusers",
		},
	],
	spaces: [
		{
			description: "An application to generate videos quickly.",
			id: "Lightricks/ltx-video-distilled",
		},
		{
			description: "Generate videos with FramePack-F1.",
			id: "linoyts/FramePack-F1",
		},
		{
			description: "Generate videos with FramePack.",
			id: "lisonallen/framepack-i2v",
		},
		{
			description: "Wan2.1 with the CausVid LoRA for fast generation.",
			id: "multimodalart/wan2-1-fast",
		},
		{
			description: "A demo for Stable Video Diffusion.",
			id: "multimodalart/stable-video-diffusion",
		},
	],
	summary:
		"Image-to-video models take a still image as input and generate a video. These models can be guided by text prompts to influence the content and style of the output video.",
	widgetModels: [],
	youtubeId: undefined,
};
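
// Minimal illustrative sketch (not consumed by the task page): the core of the
// CLIP Score metric described above is a cosine similarity between a prompt
// embedding and per-frame image embeddings, averaged over frames. The
// embeddings themselves are assumed to come from a CLIP-style model; only the
// similarity step is shown here.
function cosineSimilarity(a: number[], b: number[]): number {
	let dot = 0;
	let normA = 0;
	let normB = 0;
	for (let i = 0; i < a.length; i++) {
		dot += a[i] * b[i];
		normA += a[i] * a[i];
		normB += b[i] * b[i];
	}
	return dot / (Math.sqrt(normA) * Math.sqrt(normB));
}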

export default taskData;
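
// Minimal illustrative sketch (not consumed by the task page): First Frame
// Fidelity from the metrics above can also be computed with PSNR between the
// conditioning image and the first generated frame. Inputs are assumed to be
// flat arrays of pixel values in [0, 255] of equal length.
function psnr(reference: number[], firstFrame: number[], maxValue = 255): number {
	let mse = 0;
	for (let i = 0; i < reference.length; i++) {
		const diff = reference[i] - firstFrame[i];
		mse += diff * diff;
	}
	mse /= reference.length;
	// Identical images yield zero MSE, i.e., infinite PSNR.
	return mse === 0 ? Infinity : 10 * Math.log10((maxValue * maxValue) / mse);
}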