fixes2

sbalandi · sbalandi · commit 8edff7c73015 · 2025-12-24T18:41:16.000Z
diff --git a/tools/who_what_benchmark/tests/test_cli_videos.py b/tools/who_what_benchmark/tests/test_cli_videos.py
@@ -10,7 +10,7 @@
 logger = logging.getLogger(__name__)
 
 MODEL_CACHE = tempfile.mkdtemp()
-OV_VIDEO_MODELS = ["Lightricks/LTX-Video"]
+OV_VIDEO_MODELS = ["optimum-intel-internal-testing/tiny-random-ltx-video"]
 
 
 def setup_module():
@@ -27,7 +27,7 @@ def teardown_module():
 
 
 @pytest.mark.parametrize(
-    ("model_id"),
+    ("model_id", "model_type"),
     [("optimum-intel-internal-testing/tiny-random-ltx-video", "text-to-video")],
 )
 def test_image_model_genai(model_id, model_type, tmp_path):
@@ -48,6 +48,8 @@ def test_image_model_genai(model_id, model_type, tmp_path):
             model_type,
             "--num-inference-steps",
             "2",
+            "--video-frames-num",
+            "8",
         ]
     )
     assert GT_FILE.exists()
@@ -68,6 +70,8 @@ def test_image_model_genai(model_id, model_type, tmp_path):
             "--genai",
             "--num-inference-steps",
             "2",
+            "--video-frames-num",
+            "8",
         ]
     )
 
@@ -92,6 +96,8 @@ def test_image_model_genai(model_id, model_type, tmp_path):
     #     "--genai",
     #     "--num-inference-steps",
     #     "2",
+    #     "--video-frames-num",
+    #     "8",
     # ])
     # assert (tmp_path / "target").exists()
     # assert (tmp_path / "target.csv").exists()
@@ -111,5 +117,7 @@ def test_image_model_genai(model_id, model_type, tmp_path):
             model_type,
             "--num-inference-steps",
             "2",
+            "--video-frames-num",
+            "8",
         ]
     )
diff --git a/tools/who_what_benchmark/whowhatbench/text2video_evaluator.py b/tools/who_what_benchmark/whowhatbench/text2video_evaluator.py
@@ -10,89 +10,102 @@
 from .registry import register_evaluator, BaseEvaluator
 
 from .whowhat_metrics import VideoSimilarity
+# from diffusers.utils import export_to_video
 
 
+# The agreed default parameters are:
+# width: 704, height: 480, guidance_scale: 3, guidance_rescale: 0.3
 default_data = [
+    # small resolution
     {
-        "prompt": "cowboy running in slow motion in a field  ",
+        "prompt": "octopus figure skating, cartoon  ",
         "negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
-        "width": 480,
-        "height": 704,
+        "width": 256,
+        "height": 128,
         "guidance_scale": 3,
         "guidance_rescale": 0.3,
     },
+    # small resolution
     {
-        "prompt": "House in front of a lake and the wind blowing through the trees  ",
+        "prompt": "slow motion, hydrogen bond energy, atom, 4k, cinematic -gs 24 -motion 2 -ar 16:9 -fps 24  ",
         "negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
-        "width": 1216,
-        "height": 704,
+        "width": 256,
+        "height": 256,
         "guidance_scale": 3,
         "guidance_rescale": 0.3,
     },
+    # middle/common resolution
     {
-        "prompt": "slow motion, hydrogen bond energy, atom, 4k, cinematic -gs 24 -motion 2 -ar 16:9 -fps 24  ",
+        "prompt": "cowboy running in slow motion in a field  ",
         "negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
-        "width": 256,
-        "height": 256,
+        "width": 704,
+        "height": 480,
         "guidance_scale": 3,
         "guidance_rescale": 0.3,
     },
+    # big resolution
     {
-        "prompt": "fight naruto vs saske  ",
+        "prompt": "House in front of a lake and the wind blowing through the trees  ",
         "negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
-        "width": 480,
+        "width": 1216,
         "height": 704,
         "guidance_scale": 3,
+        "guidance_rescale": 0.3,
+    },
+    # guidance_rescale 0
+    {
+        "prompt": "fight naruto vs saske  ",
+        "negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
+        "width": 704,
+        "height": 480,
+        "guidance_scale": 3,
         "guidance_rescale": 0,
     },
+    # guidance_scale 1
     {
         "prompt": "reporter in front of the TV cameras talking about the joker  ",
         "negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
-        "width": 480,
-        "height": 704,
+        "width": 704,
+        "height": 480,
         "guidance_scale": 1,
         "guidance_rescale": 0.3,
     },
+    # guidance_scale 1 guidance_rescale 0
     {
         "prompt": "Realistic night silhouette of a white Lwxux LX III 2008 with headlights on driving on in the fog in the dark  ",
         "negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
-        "width": 480,
-        "height": 704,
+        "width": 704,
+        "height": 480,
         "guidance_scale": 1,
         "guidance_rescale": 0,
     },
+    # small resolution
     {
         "prompt": "indian womens wahsing clothes at river side  ",
         "negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
-        "width": 480,
-        "height": 704,
-        "guidance_scale": 3,
-        "guidance_rescale": 0.3,
-    },
-    {
-        "prompt": "octopus figure skating, cartoon  ",
-        "negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
-        "width": 480,
-        "height": 704,
+        "width": 256,
+        "height": 128,
         "guidance_scale": 3,
         "guidance_rescale": 0.3,
     },
+    # long prompt
     {
         "prompt": "Levitating woman uses magic and fairy dusty spews forth from her fingers.  cinematic shot  photos taken by ARRI, photos taken "
         + "by sony, photos taken by canon, photos taken by nikon, photos taken by sony, photos taken by hasselblad  ",
         "negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
-        "width": 480,
-        "height": 704,
+        "width": 704,
+        "height": 480,
         "guidance_scale": 3,
         "guidance_rescale": 0.3,
     },
+    # long prompt, small resolution
     {
         "prompt": "A mythical river adventure in the Yellow River basin during ancient times, where majestic dragons soar through the turbulent waters, "
         + "casting a vibrant glow on the submerged landscapes, blending a sense of awe and fantasy, Sculpture, intricate clay model with luminescent "
         + "elements, --ar 16:9 --v 5  ",
         "negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
-        "width": 480,
-        "height": 704,
+        "width": 256,
+        "height": 128,
         "guidance_scale": 3,
         "guidance_rescale": 0.3,
     },
@@ -104,8 +117,8 @@ class Text2VideoEvaluator(BaseEvaluator):
     DEF_NUM_FRAMES = 25
     DEF_NUM_INF_STEPS = 25
     DEF_FRAME_RATE = 25
-    DEF_WIDTH = 480
-    DEF_HEIGHT = 704
+    DEF_WIDTH = 704
+    DEF_HEIGHT = 480
     DEF_GUIDANCE_SCALE = 3
     DEF_GUIDANCE_RESCALE = 0.3
 
@@ -119,7 +132,7 @@ def __init__(
         num_frames=25,
         crop_prompts=True,
         num_samples=None,
-        gen_image_fn=None,
+        gen_video_fn=None,
         seed=42,
         is_genai=False,
     ) -> None:
@@ -136,20 +149,20 @@ def __init__(
         self.similarity = VideoSimilarity()
         self.last_cmp = None
         self.gt_dir = os.path.dirname(gt_data)
-        self.generation_fn = gen_image_fn
+        self.generation_fn = gen_video_fn
         self.is_genai = is_genai
         self.num_frames = num_frames or self.DEF_NUM_FRAMES
         self.frame_rate = self.DEF_FRAME_RATE
 
         if base_model:
-            self.gt_data = self._generate_data(base_model, gen_image_fn, os.path.join(self.gt_dir, "reference"))
+            self.gt_data = self._generate_data(base_model, gen_video_fn, os.path.join(self.gt_dir, "reference"))
         else:
             self.gt_data = pd.read_csv(gt_data, keep_default_na=False)
 
     def get_generation_fn(self):
         return self.generation_fn
 
-    def score(self, model_or_data, gen_image_fn=None, output_dir=None, **kwargs):
+    def score(self, model_or_data, gen_video_fn=None, output_dir=None, **kwargs):
         if output_dir is None:
             video_folder = os.path.join(self.gt_dir, "target")
         else:
@@ -158,7 +171,7 @@ def score(self, model_or_data, gen_image_fn=None, output_dir=None, **kwargs):
         if isinstance(model_or_data, str) and os.path.exists(model_or_data):
             predictions = pd.read_csv(model_or_data, keep_default_na=False)
         else:
-            predictions = self._generate_data(model_or_data, gen_image_fn, video_folder)
+            predictions = self._generate_data(model_or_data, gen_video_fn, video_folder)
         self.predictions = predictions
 
         all_metrics_per_prompt = {}
@@ -185,8 +198,8 @@ def worst_examples(self, top_k: int = 5, metric="similarity"):
 
         return res
 
-    def _generate_data(self, model, gen_image_fn=None, videos_dir="reference"):
-        def default_gen_image_fn(
+    def _generate_data(self, model, gen_video_fn=None, videos_dir="reference"):
+        def default_gen_video_fn(
             model,
             prompt,
             negative_prompt,
@@ -214,8 +227,8 @@ def default_gen_image_fn(
                 )
             return output.frames[0]
 
-        # generation_fn = gen_image_fn or default_gen_image_fn
-        generation_fn = default_gen_image_fn
+        # generation_fn = gen_video_fn or default_gen_video_fn
+        generation_fn = default_gen_video_fn
 
         if self.test_data:
             if isinstance(self.test_data, str):
@@ -262,6 +275,9 @@ def default_gen_image_fn(
                 frame_path = os.path.join(video_path, f"{number}.png")
                 frame.save(frame_path)
             videos.append(video_path)
+            # video_path = os.path.join(videos_dir, f"video_{i}.mp4")
+            # export_to_video(frames, video_path, self.frame_rate)
+            # videos.append(video_path)
 
         res_data["videos"] = videos
         df = pd.DataFrame(res_data)
diff --git a/tools/who_what_benchmark/whowhatbench/wwb.py b/tools/who_what_benchmark/whowhatbench/wwb.py
@@ -180,7 +180,7 @@ def parse_args():
         "--num-inference-steps",
         type=int,
         default=4,
-        help="Text-to-image specific parameter that defines the number of denoising steps.",
+        help="Text-to-image/text-to-video specific parameter that defines the number of denoising steps.",
     )
     parser.add_argument(
         "--seed",
@@ -283,8 +283,9 @@ def parse_args():
         "--video-frames-num",
         type=int,
         default=None,
-        help="The number of frames that will be taken from video for input, the frames will be taken evenly across the entire length, "
-             "applicable for Visual Language Models with video inputs",
+        help="For Visual Language Models with video inputs - the number of frames that will be taken from video for input, "
+        "the frames will be taken evenly across the entire length. "
+        "For Text-to-video - the number of frames, which will be generated by model.",
     )
 
     return parser.parse_args()
@@ -514,6 +515,22 @@ def genai_gen_image2image(model, prompt, image, num_inference_steps, generator=N
     return image
 
 
+def genai_gen_text2video(
+    model,
+    prompt,
+    negative_prompt,
+    num_inference_steps,
+    width=704,
+    height=480,
+    num_frames=25,
+    frame_rate=25,
+    guidance_scale=3,
+    guidance_rescale=0.3,
+    generator=None,
+):
+    return None
+
+
 def genai_gen_inpainting(model, prompt, image, mask, num_inference_steps, generator=None):
     image_data = ov.Tensor(np.array(image)[None])
     mask_data = ov.Tensor(np.array(mask)[None])
@@ -624,7 +641,7 @@ def create_evaluator(base_model, args):
                 num_samples=args.num_samples,
                 num_inference_steps=args.num_inference_steps,
                 num_frames=args.video_frames_num,
-                gen_image_fn=genai_gen_image if args.genai else None,
+                gen_video_fn=genai_gen_text2video if args.genai else None,
                 is_genai=args.genai,
                 seed=args.seed,
             )