[wwb] Add video generation to wwb (#3134)

sbalandi · sgonorov · web-flow · commit af7838e41ab5 · 2026-01-22T12:53:00.000Z
## Description

How to run:
```
cd ./openvino.genai/tools/who_what_benchmark
pip install .
python ./whowhatbench/wwb.py --base-model Lightricks/LTX-Video --gt-data ltx_gt.csv --model-type text-to-video --num-inference-steps 40 --video-frames-num 25
optimum-cli export openvino -m Lightricks/LTX-Video ./models/ltx_video
python ./whowhatbench/wwb.py --target-model ./models/ltx_video --gt-data ltx_gt.csv --model-type text-to-video -v --output ./output/ --num-inference-steps 40 --video-frames-num 25
```

CVS-176896

Fixes #(issue)

## Checklist:
- [ ] Tests have been updated or added to cover the new code.
- [ ] This patch fully addresses the ticket.
- [ ] I have made corresponding changes to the documentation.

---------
Co-authored-by: Stanislav Gonorovskii <stanislav.gonorovskii@intel.com>
diff --git a/tools/who_what_benchmark/requirements.txt b/tools/who_what_benchmark/requirements.txt
@@ -13,4 +13,6 @@ autoawq<0.2.8; sys_platform == "linux"
 sentencepiece
 jinja2>=3.1.0
 scipy
-opencv-python
+opencv-python
+imageio[pyav]
+imageio-ffmpeg
diff --git a/tools/who_what_benchmark/setup.py b/tools/who_what_benchmark/setup.py
@@ -42,5 +42,5 @@ def set_version(base_version: str):
     packages=find_packages(),
     install_requires=required,
     entry_points={"console_scripts": ["wwb=whowhatbench.wwb:main"]},
-    package_data={"whowhatbench": ["prompts/*.yaml"]}
+    package_data={"whowhatbench": ["prompts/*.yaml", "prompts/*.json"]},
 )
diff --git a/tools/who_what_benchmark/tests/test_cli_videos.py b/tools/who_what_benchmark/tests/test_cli_videos.py
@@ -0,0 +1,106 @@
+import subprocess  # nosec B404
+import os
+import shutil
+import sys
+import pytest
+import logging
+import tempfile
+from test_cli_image import run_wwb, get_similarity
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+MODEL_CACHE = tempfile.mkdtemp()
+OV_VIDEO_MODELS = ["optimum-intel-internal-testing/tiny-random-ltx-video"]
+
+
+def setup_module():
+    for model_id in OV_VIDEO_MODELS:
+        MODEL_PATH = os.path.join(MODEL_CACHE, model_id.replace("/", "_"))
+        subprocess.run(
+            ["optimum-cli", "export", "openvino", "--model", model_id, MODEL_PATH], capture_output=True, text=True
+        )
+
+
+def teardown_module():
+    logger.info("Remove models")
+    shutil.rmtree(MODEL_CACHE)
+
+
+@pytest.mark.xfail(sys.platform == "darwin", reason="Not enough memory on macOS CI runners. Ticket CVS-179749")
+@pytest.mark.xfail(sys.platform == "win32", reason="Access violation in OVLTXPipeline on Windows. Ticket CVS-179750")
+@pytest.mark.parametrize(
+    ("model_id", "model_type"),
+    [("optimum-intel-internal-testing/tiny-random-ltx-video", "text-to-video")],
+)
+def test_video_model_genai(model_id, model_type, tmp_path):
+    GT_FILE = tmp_path / "gt.csv"
+    MODEL_PATH = os.path.join(MODEL_CACHE, model_id.replace("/", "_"))
+
+    run_wwb(
+        [
+            "--base-model",
+            model_id,
+            "--num-samples",
+            "1",
+            "--gt-data",
+            GT_FILE,
+            "--device",
+            "CPU",
+            "--model-type",
+            model_type,
+            "--num-inference-steps",
+            "2",
+            "--video-frames-num",
+            "9",
+        ]
+    )
+    assert GT_FILE.exists()
+    assert (tmp_path / "reference").exists()
+
+    output = run_wwb(
+        [
+            "--target-model",
+            MODEL_PATH,
+            "--num-samples",
+            "1",
+            "--gt-data",
+            GT_FILE,
+            "--device",
+            "CPU",
+            "--model-type",
+            model_type,
+            "--genai",
+            "--num-inference-steps",
+            "2",
+            "--video-frames-num",
+            "9",
+            "--output",
+            tmp_path,
+        ]
+    )
+
+    assert "Metrics for model" in output
+    similarity = get_similarity(output)
+    assert similarity >= 0.89
+    assert (tmp_path / "target").exists()
+
+    # test w/o models
+    run_wwb(
+        [
+            "--target-data",
+            tmp_path / "target.csv",
+            "--num-samples",
+            "1",
+            "--gt-data",
+            GT_FILE,
+            "--device",
+            "CPU",
+            "--model-type",
+            model_type,
+            "--num-inference-steps",
+            "2",
+            "--video-frames-num",
+            "9",
+        ]
+    )
diff --git a/tools/who_what_benchmark/whowhatbench/__init__.py b/tools/who_what_benchmark/whowhatbench/__init__.py
@@ -7,6 +7,7 @@
 from .inpaint_evaluator import InpaintingEvaluator
 from .embeddings_evaluator import EmbeddingsEvaluator
 from .reranking_evaluator import RerankingEvaluator
+from .text2video_evaluator import Text2VideoEvaluator
 
 
 __all__ = [
@@ -19,5 +20,6 @@
     "InpaintingEvaluator",
     "EmbeddingsEvaluator",
     "RerankingEvaluator",
+    "Text2VideoEvaluator",
     "EVALUATOR_REGISTRY",
 ]
diff --git a/tools/who_what_benchmark/whowhatbench/model_loaders.py b/tools/who_what_benchmark/whowhatbench/model_loaders.py
@@ -30,12 +30,18 @@ def __init__(self, model, model_dir, model_type):
         self.model = model
         self.model_type = model_type
 
-        if model_type in ["text", "visual-text", "visual-video-text", "text-embedding", "text-reranking"]:
+        if model_type in (
+            "text",
+            "visual-text",
+            "visual-video-text",
+            "text-embedding",
+            "text-reranking",
+        ):
             try:
                 self.config = AutoConfig.from_pretrained(model_dir)
             except Exception:
                 self.config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
-        elif model_type == "text-to-image":
+        elif model_type in ("text-to-image", "text-to-video"):
             from diffusers import DiffusionPipeline
             try:
                 self.config = DiffusionPipeline.load_config(model_dir)
@@ -643,6 +649,43 @@ def load_reranking_model(model_id, device="CPU", ov_config=None, use_hf=False, u
     return model
 
 
+def load_text2video_genai_pipeline(model_dir, device="CPU", ov_config=None, **kwargs):
+    import openvino_genai
+
+    return GenAIModelWrapper(
+        openvino_genai.Text2VideoPipeline(model_dir, device=device, **ov_config), model_dir, "text-to-video"
+    )
+
+
+def load_text2video_model(model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False, **kwargs):
+    if use_genai:
+        logger.info("Using OpenVINO GenAI API")
+        model = load_text2video_genai_pipeline(model_id, device, ov_config, **kwargs)
+    elif use_hf:
+        from diffusers import LTXPipeline
+
+        logger.info("Using HF Transformers API")
+        try:
+            model = LTXPipeline.from_pretrained(model_id)
+        except ValueError:
+            model = LTXPipeline.from_pretrained(model_id, trust_remote_code=True)
+    else:
+        logger.info("Using Optimum API")
+        from optimum.intel import OVLTXPipeline
+
+        model_kwargs = {"ov_config": ov_config, "safety_checker": None}
+        if kwargs.get("from_onnx"):
+            model_kwargs["from_onnx"] = kwargs["from_onnx"]
+        try:
+            model = OVLTXPipeline.from_pretrained(model_id, device=device, **model_kwargs)
+        except ValueError:
+            model = OVLTXPipeline.from_pretrained(
+                model_id, trust_remote_code=True, use_cache=True, device=device, **model_kwargs
+            )
+
+    return model
+
+
 def load_model(
     model_type, model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False, use_llamacpp=False, **kwargs
 ):
@@ -672,5 +715,7 @@ def load_model(
         return load_embedding_model(model_id, device, ov_options, use_hf, use_genai, **kwargs)
     elif model_type == "text-reranking":
         return load_reranking_model(model_id, device, ov_options, use_hf, use_genai)
+    elif model_type == "text-to-video":
+        return load_text2video_model(model_id, device, ov_options, use_hf, use_genai, **kwargs)
     else:
         raise ValueError(f"Unsupported model type: {model_type}")
diff --git a/tools/who_what_benchmark/whowhatbench/prompts/text_to_video_prompts.json b/tools/who_what_benchmark/whowhatbench/prompts/text_to_video_prompts.json
@@ -0,0 +1,82 @@
+[
+    {
+        "prompt": "A woman with light skin, wearing a blue jacket and a black hat with a veil, looks down and to her right, then back up as she speaks; she has brown hair styled in an updo, light brown eyebrows, and is wearing a white collared shirt under her jacket; the camera remains stationary on her face as she speaks; the background is out of focus, but shows trees and people in period clothing; the scene is captured in real-life footage.",
+        "negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
+        "width": 256,
+        "height": 128,
+        "guidance_scale": 3,
+        "guidance_rescale": 0.3
+    },
+    {
+        "prompt": "A woman with blonde hair styled up, wearing a black dress with sequins and pearl earrings, looks down with a sad expression on her face. The camera remains stationary, focused on the woman's face. The lighting is dim, casting soft shadows on her face. The scene appears to be from a movie or TV show.",
+        "negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
+        "width": 256,
+        "height": 256,
+        "guidance_scale": 3,
+        "guidance_rescale": 0.3
+    },
+    {
+        "prompt": "A man with graying hair, a beard, and a gray shirt looks down and to his right, then turns his head to the left. The camera angle is a close-up, focused on the man's face. The lighting is dim, with a greenish tint. The scene appears to be real-life footage.",
+        "negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
+        "width": 704,
+        "height": 480,
+        "guidance_scale": 3,
+        "guidance_rescale": 0.3
+    },
+    {
+        "prompt": "The camera pans across a cityscape of tall buildings with a circular building in the center. The camera moves from left to right, showing the tops of the buildings and the circular building in the center. The buildings are various shades of gray and white, and the circular building has a green roof. The camera angle is high, looking down at the city. The lighting is bright, with the sun shining from the upper left, casting shadows from the buildings. The scene is computer-generated imagery.",
+        "negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
+        "width": 1216,
+        "height": 704,
+        "guidance_scale": 3,
+        "guidance_rescale": 0.3
+    },
+    {
+        "prompt": "A man in a dimly lit room talks on a vintage telephone, hangs up, and looks down with a sad expression. He holds the black rotary phone to his right ear with his right hand, his left hand holding a rocks glass with amber liquid. He wears a brown suit jacket over a white shirt, and a gold ring on his left ring finger. His short hair is neatly combed, and he has light skin with visible wrinkles around his eyes. The camera remains stationary, focused on his face and upper body. The room is dark, lit only by a warm light source off-screen to the left, casting shadows on the wall behind him. The scene appears to be from a movie.",
+        "negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
+        "width": 704,
+        "height": 480,
+        "guidance_scale": 3,
+        "guidance_rescale": 0
+    },
+    {
+        "prompt": "A woman with short brown hair, wearing a maroon sleeveless top and a silver necklace, walks through a room while talking, then a woman with pink hair and a white shirt appears in the doorway and yells. The first woman walks from left to right, her expression serious; she has light skin and her eyebrows are slightly furrowed. The second woman stands in the doorway, her mouth open in a yell; she has light skin and her eyes are wide. The room is dimly lit, with a bookshelf visible in the background. The camera follows the first woman as she walks, then cuts to a close-up of the second woman's face. The scene is captured in real-life footage.",
+        "negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
+        "width": 704,
+        "height": 480,
+        "guidance_scale": 5,
+        "guidance_rescale": 0.3
+    },
+    {
+        "prompt": "The waves crash against the jagged rocks of the shoreline, sending spray high into the air.The rocks are a dark gray color, with sharp edges and deep crevices. The water is a clear blue-green, with white foam where the waves break against the rocks. The sky is a light gray, with a few white clouds dotting the horizon.",
+        "negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
+        "width": 704,
+        "height": 480,
+        "guidance_scale": 5,
+        "guidance_rescale": 0
+    },
+    {
+        "prompt": "A man walks towards a window, looks out, and then turns around. He has short, dark hair, dark skin, and is wearing a brown coat over a red and gray scarf. He walks from left to right towards a window, his gaze fixed on something outside. The camera follows him from behind at a medium distance. The room is brightly lit, with white walls and a large window covered by a white curtain. As he approaches the window, he turns his head slightly to the left, then back to the right. He then turns his entire body to the right, facing the window. The camera remains stationary as he stands in front of the window. The scene is captured in real-life footage.",
+        "negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
+        "width": 256,
+        "height": 128,
+        "guidance_scale": 3,
+        "guidance_rescale": 0.3
+    },
+    {
+        "prompt": "The camera pans over a snow-covered mountain range, revealing a vast expanse of snow-capped peaks and valleys.The mountains are covered in a thick layer of snow, with some areas appearing almost white while others have a slightly darker, almost grayish hue. The peaks are jagged and irregular, with some rising sharply into the sky while others are more rounded. The valleys are deep and narrow, with steep slopes that are also covered in snow. The trees in the foreground are mostly bare, with only a few leaves remaining on their branches. The sky is overcast, with thick clouds obscuring the sun. The overall impression is one of peace and tranquility, with the snow-covered mountains standing as a testament to the power and beauty of nature.",
+        "negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
+        "width": 704,
+        "height": 480,
+        "guidance_scale": 3,
+        "guidance_rescale": 0.3
+    },
+    {
+        "prompt": "A man in a suit enters a room and speaks to two women sitting on a couch. The man, wearing a dark suit with a gold tie, enters the room from the left and walks towards the center of the frame. He has short gray hair, light skin, and a serious expression. He places his right hand on the back of a chair as he approaches the couch. Two women are seated on a light-colored couch in the background. The woman on the left wears a light blue sweater and has short blonde hair. The woman on the right wears a white sweater and has short blonde hair. The camera remains stationary, focusing on the man as he enters the room. The room is brightly lit, with warm tones reflecting off the walls and furniture. The scene appears to be from a film or television show.",
+        "negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
+        "width": 256,
+        "height": 128,
+        "guidance_scale": 3,
+        "guidance_rescale": 0.3
+    }
+]
diff --git a/tools/who_what_benchmark/whowhatbench/text2image_evaluator.py b/tools/who_what_benchmark/whowhatbench/text2image_evaluator.py
@@ -29,6 +29,8 @@
 
 @register_evaluator("text-to-image")
 class Text2ImageEvaluator(BaseEvaluator):
+    DEF_NUM_INFERENCE_STEP = 4
+
     def __init__(
         self,
         base_model: Any = None,
@@ -37,7 +39,7 @@ def __init__(
         metrics="similarity",
         similarity_model_id: str = "openai/clip-vit-large-patch14",
         resolution=(512, 512),
-        num_inference_steps=4,
+        num_inference_steps=None,
         crop_prompts=True,
         num_samples=None,
         gen_image_fn=None,
@@ -54,7 +56,7 @@ def __init__(
         self.resolution = resolution
         self.crop_prompt = crop_prompts
         self.num_samples = num_samples
-        self.num_inference_steps = num_inference_steps
+        self.num_inference_steps = num_inference_steps or self.DEF_NUM_INFERENCE_STEP
         self.seed = seed
         self.similarity = None
         self.similarity = ImageSimilarity(similarity_model_id)
diff --git a/tools/who_what_benchmark/whowhatbench/text2video_evaluator.py b/tools/who_what_benchmark/whowhatbench/text2video_evaluator.py
diff --git a/tools/who_what_benchmark/whowhatbench/whowhat_metrics.py b/tools/who_what_benchmark/whowhatbench/whowhat_metrics.py
diff --git a/tools/who_what_benchmark/whowhatbench/wwb.py b/tools/who_what_benchmark/whowhatbench/wwb.py

Original file line number	Diff line number	Diff line change
`@@ -42,5 +42,5 @@ def set_version(base_version: str):`
`42`	`42`	`packages=find_packages(),`
`43`	`43`	`install_requires=required,`
`44`	`44`	`entry_points={"console_scripts": ["wwb=whowhatbench.wwb:main"]},`
`45`		`- package_data={"whowhatbench": ["prompts/*.yaml"]}`
	`45`	`+ package_data={"whowhatbench": ["prompts/*.yaml", "prompts/*.json"]},`
`46`	`46`	`)`