Skip to content

Commit af7838e

Browse files
sbalandi and sgonorov authored
[wwb] Add video generation to wwb (#3134)
<!-- Keep your pull requests (PRs) as atomic as possible. That increases the likelihood that an individual PR won't be stuck because of adjacent problems, merge conflicts, or code review. Your merged PR is going to appear in the automatically generated release notes on GitHub. So the clearer the title the better. --> ## Description <!-- Please include a summary of the change. Also include relevant motivation and context. --> How to run: ``` cd ./openvino.genai/tools/who_what_benchmark pip install . python ./whowhatbench/wwb.py --base-model Lightricks/LTX-Video --gt-data ltx_gt.csv --model-type text-to-video --num-inference-steps 40 --video-frames-num 25 optimum-cli export openvino -m Lightricks/LTX-Video ./models/ltx_video python ./whowhatbench/wwb.py --target-model ./models/ltx_video --gt-data ltx_gt.csv --model-type text-to-video -v --output ./output/ --num-inference-steps 40 --video-frames-num 25 ``` <!-- Jira ticket number (e.g., 123). Delete if there's no ticket. --> CVS-176896 <!-- Remove if not applicable --> Fixes #(issue) ## Checklist: - [ ] Tests have been updated or added to cover the new code. <!-- If the change isn't maintenance related, update the tests at https://github.com/openvinotoolkit/openvino.genai/tree/master/tests or explain in the description why the tests don't need an update. --> - [ ] This patch fully addresses the ticket. <!--- If follow-up pull requests are needed, specify in description. --> - [ ] I have made corresponding changes to the documentation. <!-- Run github.com/\<username>/openvino.genai/actions/workflows/deploy_gh_pages.yml on your fork with your branch as a parameter to deploy a test version with the updated content. Replace this comment with the link to the built docs. --> --------- Co-authored-by: Stanislav Gonorovskii <stanislav.gonorovskii@intel.com>
1 parent bd86e96 commit af7838e

File tree

10 files changed

+563
-22
lines changed

10 files changed

+563
-22
lines changed

tools/who_what_benchmark/requirements.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,6 @@ autoawq<0.2.8; sys_platform == "linux"
1313
sentencepiece
1414
jinja2>=3.1.0
1515
scipy
16-
opencv-python
16+
opencv-python
17+
imageio[pyav]
18+
imageio-ffmpeg

tools/who_what_benchmark/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,5 +42,5 @@ def set_version(base_version: str):
4242
packages=find_packages(),
4343
install_requires=required,
4444
entry_points={"console_scripts": ["wwb=whowhatbench.wwb:main"]},
45-
package_data={"whowhatbench": ["prompts/*.yaml"]}
45+
package_data={"whowhatbench": ["prompts/*.yaml", "prompts/*.json"]},
4646
)
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
import subprocess # nosec B404
2+
import os
3+
import shutil
4+
import sys
5+
import pytest
6+
import logging
7+
import tempfile
8+
from test_cli_image import run_wwb, get_similarity
9+
10+
# Emit INFO-level records so setup/teardown progress shows up in the test log.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Scratch directory that holds the exported models for this test module;
# it is created at import time and removed again in teardown_module().
MODEL_CACHE = tempfile.mkdtemp()

# Hugging Face model ids that setup_module() exports to OpenVINO.
OV_VIDEO_MODELS = ["optimum-intel-internal-testing/tiny-random-ltx-video"]
15+
16+
17+
def setup_module():
    """Export every model in ``OV_VIDEO_MODELS`` into ``MODEL_CACHE``.

    Each model is converted with ``optimum-cli export openvino`` into a
    sub-directory named after the model id with ``/`` replaced by ``_``.

    A failed export is logged (exit code and captured stderr) instead of being
    silently discarded. It is deliberately not raised: the dependent test is
    marked ``xfail`` on some platforms, and an exception here would be
    reported as a setup error rather than an expected failure.
    """
    for model_id in OV_VIDEO_MODELS:
        model_path = os.path.join(MODEL_CACHE, model_id.replace("/", "_"))
        result = subprocess.run(
            ["optimum-cli", "export", "openvino", "--model", model_id, model_path],
            capture_output=True,
            text=True,
        )
        if result.returncode != 0:
            logger.error(
                "Export of %s failed with exit code %d:\n%s",
                model_id,
                result.returncode,
                result.stderr,
            )
23+
24+
25+
def teardown_module():
    """Remove the ``MODEL_CACHE`` directory once the module's tests are done."""
    logger.info("Remove models")
    shutil.rmtree(MODEL_CACHE)
28+
29+
30+
@pytest.mark.xfail(sys.platform == "darwin", reason="Not enough memory on macOS CI runners. Ticket CVS-179749")
@pytest.mark.xfail(sys.platform == "win32", reason="Access violation in OVLTXPipeline on Windows. Ticket CVS-179750")
@pytest.mark.parametrize(
    ("model_id", "model_type"),
    [("optimum-intel-internal-testing/tiny-random-ltx-video", "text-to-video")],
)
def test_video_model_genai(model_id, model_type, tmp_path):
    """Run the wwb CLI three times for a text-to-video model.

    1. With ``--base-model``: expects ``gt.csv`` and a ``reference`` directory
       to appear under ``tmp_path``.
    2. With ``--target-model`` and ``--genai`` against the exported model:
       expects a metrics report with similarity >= 0.89 and a ``target``
       directory under ``tmp_path``.
    3. With ``--target-data`` only (no model argument): must complete using
       the ``target.csv`` expected under ``tmp_path``.
    """
    gt_csv = tmp_path / "gt.csv"
    exported_model_dir = os.path.join(MODEL_CACHE, model_id.replace("/", "_"))

    # Argument groups shared by all three invocations; the relative order of
    # the flags is kept the same as in the spelled-out command lines.
    shared_args = ["--num-samples", "1", "--gt-data", gt_csv, "--device", "CPU", "--model-type", model_type]
    generation_args = ["--num-inference-steps", "2", "--video-frames-num", "9"]

    # Step 1: run against the base model.
    run_wwb(["--base-model", model_id, *shared_args, *generation_args])
    assert gt_csv.exists()
    assert (tmp_path / "reference").exists()

    # Step 2: run against the exported target model through the GenAI path.
    output = run_wwb(
        ["--target-model", exported_model_dir, *shared_args, "--genai", *generation_args, "--output", tmp_path]
    )

    assert "Metrics for model" in output
    similarity = get_similarity(output)
    assert similarity >= 0.89
    assert (tmp_path / "target").exists()

    # test w/o models
    run_wwb(["--target-data", tmp_path / "target.csv", *shared_args, *generation_args])

tools/who_what_benchmark/whowhatbench/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from .inpaint_evaluator import InpaintingEvaluator
88
from .embeddings_evaluator import EmbeddingsEvaluator
99
from .reranking_evaluator import RerankingEvaluator
10+
from .text2video_evaluator import Text2VideoEvaluator
1011

1112

1213
__all__ = [
@@ -19,5 +20,6 @@
1920
"InpaintingEvaluator",
2021
"EmbeddingsEvaluator",
2122
"RerankingEvaluator",
23+
"Text2VideoEvaluator",
2224
"EVALUATOR_REGISTRY",
2325
]

tools/who_what_benchmark/whowhatbench/model_loaders.py

Lines changed: 47 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,18 @@ def __init__(self, model, model_dir, model_type):
3030
self.model = model
3131
self.model_type = model_type
3232

33-
if model_type in ["text", "visual-text", "visual-video-text", "text-embedding", "text-reranking"]:
33+
if model_type in (
34+
"text",
35+
"visual-text",
36+
"visual-video-text",
37+
"text-embedding",
38+
"text-reranking",
39+
):
3440
try:
3541
self.config = AutoConfig.from_pretrained(model_dir)
3642
except Exception:
3743
self.config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
38-
elif model_type == "text-to-image":
44+
elif model_type in ("text-to-image", "text-to-video"):
3945
from diffusers import DiffusionPipeline
4046
try:
4147
self.config = DiffusionPipeline.load_config(model_dir)
@@ -643,6 +649,43 @@ def load_reranking_model(model_id, device="CPU", ov_config=None, use_hf=False, u
643649
return model
644650

645651

652+
def load_text2video_genai_pipeline(model_dir, device="CPU", ov_config=None, **kwargs):
    """Create an OpenVINO GenAI text-to-video pipeline wrapped for wwb.

    Args:
        model_dir: Directory of the exported model, passed to
            ``openvino_genai.Text2VideoPipeline``.
        device: Device name forwarded to the pipeline.
        ov_config: Optional mapping of properties expanded into keyword
            arguments of the pipeline constructor. ``None`` (the default) is
            treated as "no extra properties".
        **kwargs: Accepted for signature compatibility; not used here.

    Returns:
        A ``GenAIModelWrapper`` around the pipeline with model type
        ``"text-to-video"``.
    """
    import openvino_genai

    # ``**None`` raises TypeError, so fall back to an empty mapping when the
    # caller relies on the default ``ov_config=None``.
    properties = ov_config or {}

    return GenAIModelWrapper(
        openvino_genai.Text2VideoPipeline(model_dir, device=device, **properties), model_dir, "text-to-video"
    )
658+
659+
660+
def load_text2video_model(model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False, **kwargs):
    """Load a text-to-video pipeline through one of three backends.

    Backend selection, in order of precedence:
      * ``use_genai``: delegate to ``load_text2video_genai_pipeline``.
      * ``use_hf``: ``diffusers.LTXPipeline.from_pretrained``.
      * otherwise: ``optimum.intel.OVLTXPipeline.from_pretrained``.

    For the last two backends a ``ValueError`` from the first load attempt
    triggers a retry with ``trust_remote_code=True``.

    Args:
        model_id: Model id or path handed to the selected backend.
        device: Device name (used by the GenAI and Optimum backends).
        ov_config: OpenVINO configuration forwarded to the GenAI and Optimum
            backends.
        use_hf: Select the diffusers backend.
        use_genai: Select the GenAI backend.
        **kwargs: Forwarded to the GenAI loader; the Optimum path reads only
            ``from_onnx``.

    Returns:
        The loaded pipeline object.
    """
    if use_genai:
        logger.info("Using OpenVINO GenAI API")
        return load_text2video_genai_pipeline(model_id, device, ov_config, **kwargs)

    if use_hf:
        from diffusers import LTXPipeline

        logger.info("Using HF Transformers API")
        try:
            return LTXPipeline.from_pretrained(model_id)
        except ValueError:
            return LTXPipeline.from_pretrained(model_id, trust_remote_code=True)

    logger.info("Using Optimum API")
    from optimum.intel import OVLTXPipeline

    pipeline_kwargs = {"ov_config": ov_config, "safety_checker": None}
    from_onnx = kwargs.get("from_onnx")
    if from_onnx:
        # Only forwarded when truthy, so the pipeline default applies otherwise.
        pipeline_kwargs["from_onnx"] = from_onnx

    try:
        return OVLTXPipeline.from_pretrained(model_id, device=device, **pipeline_kwargs)
    except ValueError:
        return OVLTXPipeline.from_pretrained(
            model_id, trust_remote_code=True, use_cache=True, device=device, **pipeline_kwargs
        )
687+
688+
646689
def load_model(
647690
model_type, model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False, use_llamacpp=False, **kwargs
648691
):
@@ -672,5 +715,7 @@ def load_model(
672715
return load_embedding_model(model_id, device, ov_options, use_hf, use_genai, **kwargs)
673716
elif model_type == "text-reranking":
674717
return load_reranking_model(model_id, device, ov_options, use_hf, use_genai)
718+
elif model_type == "text-to-video":
719+
return load_text2video_model(model_id, device, ov_options, use_hf, use_genai, **kwargs)
675720
else:
676721
raise ValueError(f"Unsupported model type: {model_type}")
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
[
2+
{
3+
"prompt": "A woman with light skin, wearing a blue jacket and a black hat with a veil, looks down and to her right, then back up as she speaks; she has brown hair styled in an updo, light brown eyebrows, and is wearing a white collared shirt under her jacket; the camera remains stationary on her face as she speaks; the background is out of focus, but shows trees and people in period clothing; the scene is captured in real-life footage.",
4+
"negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
5+
"width": 256,
6+
"height": 128,
7+
"guidance_scale": 3,
8+
"guidance_rescale": 0.3
9+
},
10+
{
11+
"prompt": "A woman with blonde hair styled up, wearing a black dress with sequins and pearl earrings, looks down with a sad expression on her face. The camera remains stationary, focused on the woman's face. The lighting is dim, casting soft shadows on her face. The scene appears to be from a movie or TV show.",
12+
"negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
13+
"width": 256,
14+
"height": 256,
15+
"guidance_scale": 3,
16+
"guidance_rescale": 0.3
17+
},
18+
{
19+
"prompt": "A man with graying hair, a beard, and a gray shirt looks down and to his right, then turns his head to the left. The camera angle is a close-up, focused on the man's face. The lighting is dim, with a greenish tint. The scene appears to be real-life footage.",
20+
"negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
21+
"width": 704,
22+
"height": 480,
23+
"guidance_scale": 3,
24+
"guidance_rescale": 0.3
25+
},
26+
{
27+
"prompt": "The camera pans across a cityscape of tall buildings with a circular building in the center. The camera moves from left to right, showing the tops of the buildings and the circular building in the center. The buildings are various shades of gray and white, and the circular building has a green roof. The camera angle is high, looking down at the city. The lighting is bright, with the sun shining from the upper left, casting shadows from the buildings. The scene is computer-generated imagery.",
28+
"negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
29+
"width": 1216,
30+
"height": 704,
31+
"guidance_scale": 3,
32+
"guidance_rescale": 0.3
33+
},
34+
{
35+
"prompt": "A man in a dimly lit room talks on a vintage telephone, hangs up, and looks down with a sad expression. He holds the black rotary phone to his right ear with his right hand, his left hand holding a rocks glass with amber liquid. He wears a brown suit jacket over a white shirt, and a gold ring on his left ring finger. His short hair is neatly combed, and he has light skin with visible wrinkles around his eyes. The camera remains stationary, focused on his face and upper body. The room is dark, lit only by a warm light source off-screen to the left, casting shadows on the wall behind him. The scene appears to be from a movie.",
36+
"negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
37+
"width": 704,
38+
"height": 480,
39+
"guidance_scale": 3,
40+
"guidance_rescale": 0
41+
},
42+
{
43+
"prompt": "A woman with short brown hair, wearing a maroon sleeveless top and a silver necklace, walks through a room while talking, then a woman with pink hair and a white shirt appears in the doorway and yells. The first woman walks from left to right, her expression serious; she has light skin and her eyebrows are slightly furrowed. The second woman stands in the doorway, her mouth open in a yell; she has light skin and her eyes are wide. The room is dimly lit, with a bookshelf visible in the background. The camera follows the first woman as she walks, then cuts to a close-up of the second woman's face. The scene is captured in real-life footage.",
44+
"negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
45+
"width": 704,
46+
"height": 480,
47+
"guidance_scale": 5,
48+
"guidance_rescale": 0.3
49+
},
50+
{
51+
"prompt": "The waves crash against the jagged rocks of the shoreline, sending spray high into the air.The rocks are a dark gray color, with sharp edges and deep crevices. The water is a clear blue-green, with white foam where the waves break against the rocks. The sky is a light gray, with a few white clouds dotting the horizon.",
52+
"negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
53+
"width": 704,
54+
"height": 480,
55+
"guidance_scale": 5,
56+
"guidance_rescale": 0
57+
},
58+
{
59+
"prompt": "A man walks towards a window, looks out, and then turns around. He has short, dark hair, dark skin, and is wearing a brown coat over a red and gray scarf. He walks from left to right towards a window, his gaze fixed on something outside. The camera follows him from behind at a medium distance. The room is brightly lit, with white walls and a large window covered by a white curtain. As he approaches the window, he turns his head slightly to the left, then back to the right. He then turns his entire body to the right, facing the window. The camera remains stationary as he stands in front of the window. The scene is captured in real-life footage.",
60+
"negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
61+
"width": 256,
62+
"height": 128,
63+
"guidance_scale": 3,
64+
"guidance_rescale": 0.3
65+
},
66+
{
67+
"prompt": "The camera pans over a snow-covered mountain range, revealing a vast expanse of snow-capped peaks and valleys.The mountains are covered in a thick layer of snow, with some areas appearing almost white while others have a slightly darker, almost grayish hue. The peaks are jagged and irregular, with some rising sharply into the sky while others are more rounded. The valleys are deep and narrow, with steep slopes that are also covered in snow. The trees in the foreground are mostly bare, with only a few leaves remaining on their branches. The sky is overcast, with thick clouds obscuring the sun. The overall impression is one of peace and tranquility, with the snow-covered mountains standing as a testament to the power and beauty of nature.",
68+
"negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
69+
"width": 704,
70+
"height": 480,
71+
"guidance_scale": 3,
72+
"guidance_rescale": 0.3
73+
},
74+
{
75+
"prompt": "A man in a suit enters a room and speaks to two women sitting on a couch. The man, wearing a dark suit with a gold tie, enters the room from the left and walks towards the center of the frame. He has short gray hair, light skin, and a serious expression. He places his right hand on the back of a chair as he approaches the couch. Two women are seated on a light-colored couch in the background. The woman on the left wears a light blue sweater and has short blonde hair. The woman on the right wears a white sweater and has short blonde hair. The camera remains stationary, focusing on the man as he enters the room. The room is brightly lit, with warm tones reflecting off the walls and furniture. The scene appears to be from a film or television show.",
76+
"negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
77+
"width": 256,
78+
"height": 128,
79+
"guidance_scale": 3,
80+
"guidance_rescale": 0.3
81+
}
82+
]

tools/who_what_benchmark/whowhatbench/text2image_evaluator.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929

3030
@register_evaluator("text-to-image")
3131
class Text2ImageEvaluator(BaseEvaluator):
32+
DEF_NUM_INFERENCE_STEP = 4
33+
3234
def __init__(
3335
self,
3436
base_model: Any = None,
@@ -37,7 +39,7 @@ def __init__(
3739
metrics="similarity",
3840
similarity_model_id: str = "openai/clip-vit-large-patch14",
3941
resolution=(512, 512),
40-
num_inference_steps=4,
42+
num_inference_steps=None,
4143
crop_prompts=True,
4244
num_samples=None,
4345
gen_image_fn=None,
@@ -54,7 +56,7 @@ def __init__(
5456
self.resolution = resolution
5557
self.crop_prompt = crop_prompts
5658
self.num_samples = num_samples
57-
self.num_inference_steps = num_inference_steps
59+
self.num_inference_steps = num_inference_steps or self.DEF_NUM_INFERENCE_STEP
5860
self.seed = seed
5961
self.similarity = None
6062
self.similarity = ImageSimilarity(similarity_model_id)

0 commit comments

Comments
 (0)