Skip to content

Commit 0210073

Browse files
committed
fixes
1 parent 031c80c commit 0210073

File tree

5 files changed

+28
-19
lines changed

5 files changed

+28
-19
lines changed

tools/who_what_benchmark/tests/test_cli_videos.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,7 @@ def teardown_module():
2828

2929
@pytest.mark.parametrize(
3030
("model_id"),
31-
["Lightricks/LTX-Video"],
31+
[("optimum-intel-internal-testing/tiny-random-ltx-video", "text-to-video")],
3232
)
3333
def test_image_model_genai(model_id, model_type, tmp_path):
3434
GT_FILE = tmp_path / "gt.csv"

tools/who_what_benchmark/whowhatbench/model_loaders.py

Lines changed: 1 addition & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -646,11 +646,7 @@ def load_text2video_model(model_id, device="CPU", ov_config=None, use_hf=False,
646646
model = OVLTXPipeline.from_pretrained(model_id, device=device, **model_kwargs)
647647
except ValueError:
648648
model = OVLTXPipeline.from_pretrained(
649-
model_id,
650-
trust_remote_code=True,
651-
use_cache=True,
652-
device=device,
653-
**model_kwargs
649+
model_id, trust_remote_code=True, use_cache=True, device=device, **model_kwargs
654650
)
655651

656652
return model

tools/who_what_benchmark/whowhatbench/text2video_evaluator.py

Lines changed: 7 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -79,7 +79,7 @@
7979
},
8080
{
8181
"prompt": "Levitating woman uses magic and fairy dusty spews forth from her fingers. cinematic shot photos taken by ARRI, photos taken "
82-
+ "by sony, photos taken by canon, photos taken by nikon, photos taken by sony, photos taken by hasselblad ",
82+
+ "by sony, photos taken by canon, photos taken by nikon, photos taken by sony, photos taken by hasselblad ",
8383
"negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
8484
"width": 480,
8585
"height": 704,
@@ -88,8 +88,8 @@
8888
},
8989
{
9090
"prompt": "A mythical river adventure in the Yellow River basin during ancient times, where majestic dragons soar through the turbulent waters, "
91-
+ "casting a vibrant glow on the submerged landscapes, blending a sense of awe and fantasy, Sculpture, intricate clay model with luminescent "
92-
+ "elements, --ar 16:9 --v 5 ",
91+
+ "casting a vibrant glow on the submerged landscapes, blending a sense of awe and fantasy, Sculpture, intricate clay model with luminescent "
92+
+ "elements, --ar 16:9 --v 5 ",
9393
"negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
9494
"width": 480,
9595
"height": 704,
@@ -123,9 +123,9 @@ def __init__(
123123
seed=42,
124124
is_genai=False,
125125
) -> None:
126-
assert (
127-
base_model is not None or gt_data is not None
128-
), "Text generation pipeline for evaluation or ground trush data must be defined"
126+
assert base_model is not None or gt_data is not None, (
127+
"Text generation pipeline for evaluation or ground trush data must be defined"
128+
)
129129

130130
self.test_data = test_data
131131
self.metrics = metrics
@@ -158,9 +158,7 @@ def score(self, model_or_data, gen_image_fn=None, output_dir=None, **kwargs):
158158
if isinstance(model_or_data, str) and os.path.exists(model_or_data):
159159
predictions = pd.read_csv(model_or_data, keep_default_na=False)
160160
else:
161-
predictions = self._generate_data(
162-
model_or_data, gen_image_fn, video_folder
163-
)
161+
predictions = self._generate_data(model_or_data, gen_image_fn, video_folder)
164162
self.predictions = predictions
165163

166164
all_metrics_per_prompt = {}

tools/who_what_benchmark/whowhatbench/whowhat_metrics.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111

1212
import numpy as np
1313
from sentence_transformers import SentenceTransformer, util
14-
from transformers import CLIPImageProcessor, CLIPModel, LlavaNextVideoProcessor, LlavaNextVideoModel
14+
from transformers import CLIPImageProcessor, CLIPModel
1515
from tqdm import tqdm
1616
from sklearn.metrics.pairwise import cosine_similarity
1717

@@ -236,6 +236,8 @@ def evaluate(self, data_gold, data_prediction):
236236

237237
class VideoSimilarity:
238238
def __init__(self) -> None:
239+
from transformers import LlavaNextVideoProcessor, LlavaNextVideoModel
240+
239241
self.processor = LlavaNextVideoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")
240242
self.model = LlavaNextVideoModel.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf").eval()
241243

tools/who_what_benchmark/whowhatbench/wwb.py

Lines changed: 16 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -63,8 +63,17 @@ def parse_args():
6363
parser.add_argument(
6464
"--model-type",
6565
type=str,
66-
choices=["text", "text-to-image", "text-to-video", "visual-text", "visual-video-text", "image-to-image",
67-
"image-inpainting", "text-embedding", "text-reranking"],
66+
choices=[
67+
"text",
68+
"text-to-image",
69+
"text-to-video",
70+
"visual-text",
71+
"visual-video-text",
72+
"image-to-image",
73+
"image-inpainting",
74+
"text-embedding",
75+
"text-reranking",
76+
],
6877
default="text",
6978
help="Indicated the model type: text - for causal text generation, visual-text - for Visual Language Models with image inputs, "
7079
"visual-video-text - for Visual Language Models with video inputs, text-to-image - for image generation, "
@@ -862,7 +871,11 @@ def main():
862871
if args.verbose and (args.target_model or args.target_data):
863872
if args.model_type in ["text", "visual-text", "visual-video-text"]:
864873
print_text_results(evaluator)
865-
elif "text-to-image" in args.model_type or "image-to-image" in args.model_type or "text-to-video" in args.model_type:
874+
elif (
875+
"text-to-image" in args.model_type
876+
or "image-to-image" in args.model_type
877+
or "text-to-video" in args.model_type
878+
):
866879
print_image_results(evaluator)
867880
elif args.model_type in ['text-embedding']:
868881
print_embeds_results(evaluator)

0 commit comments

Comments (0)