Skip to content

Commit 8edff7c

Browse files
committed
fixes2
1 parent 0210073 commit 8edff7c

File tree

3 files changed

+87
-46
lines changed

3 files changed

+87
-46
lines changed

tools/who_what_benchmark/tests/test_cli_videos.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
logger = logging.getLogger(__name__)
1111

1212
MODEL_CACHE = tempfile.mkdtemp()
13-
OV_VIDEO_MODELS = ["Lightricks/LTX-Video"]
13+
OV_VIDEO_MODELS = ["optimum-intel-internal-testing/tiny-random-ltx-video"]
1414

1515

1616
def setup_module():
@@ -27,7 +27,7 @@ def teardown_module():
2727

2828

2929
@pytest.mark.parametrize(
30-
("model_id"),
30+
("model_id", "model_type"),
3131
[("optimum-intel-internal-testing/tiny-random-ltx-video", "text-to-video")],
3232
)
3333
def test_image_model_genai(model_id, model_type, tmp_path):
@@ -48,6 +48,8 @@ def test_image_model_genai(model_id, model_type, tmp_path):
4848
model_type,
4949
"--num-inference-steps",
5050
"2",
51+
"--video-frames-num",
52+
"8",
5153
]
5254
)
5355
assert GT_FILE.exists()
@@ -68,6 +70,8 @@ def test_image_model_genai(model_id, model_type, tmp_path):
6870
"--genai",
6971
"--num-inference-steps",
7072
"2",
73+
"--video-frames-num",
74+
"8",
7175
]
7276
)
7377

@@ -92,6 +96,8 @@ def test_image_model_genai(model_id, model_type, tmp_path):
9296
# "--genai",
9397
# "--num-inference-steps",
9498
# "2",
99+
# "--video-frames-num",
100+
# "8",
95101
# ])
96102
# assert (tmp_path / "target").exists()
97103
# assert (tmp_path / "target.csv").exists()
@@ -111,5 +117,7 @@ def test_image_model_genai(model_id, model_type, tmp_path):
111117
model_type,
112118
"--num-inference-steps",
113119
"2",
120+
"--video-frames-num",
121+
"8",
114122
]
115123
)

tools/who_what_benchmark/whowhatbench/text2video_evaluator.py

Lines changed: 56 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -10,89 +10,102 @@
1010
from .registry import register_evaluator, BaseEvaluator
1111

1212
from .whowhat_metrics import VideoSimilarity
13+
# from diffusers.utils import export_to_video
1314

1415

16+
# Agreed default parameters (used unless an entry is testing a specific deviation):
17+
# width: 704, height: 480, guidance_scale: 3, guidance_rescale: 0.3
1518
default_data = [
19+
# small resolution
1620
{
17-
"prompt": "cowboy running in slow motion in a field ",
21+
"prompt": "octopus figure skating, cartoon ",
1822
"negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
19-
"width": 480,
20-
"height": 704,
23+
"width": 256,
24+
"height": 128,
2125
"guidance_scale": 3,
2226
"guidance_rescale": 0.3,
2327
},
28+
# small resolution
2429
{
25-
"prompt": "House in front of a lake and the wind blowing through the trees ",
30+
"prompt": "slow motion, hydrogen bond energy, atom, 4k, cinematic -gs 24 -motion 2 -ar 16:9 -fps 24 ",
2631
"negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
27-
"width": 1216,
28-
"height": 704,
32+
"width": 256,
33+
"height": 256,
2934
"guidance_scale": 3,
3035
"guidance_rescale": 0.3,
3136
},
37+
# middle/common resolution
3238
{
33-
"prompt": "slow motion, hydrogen bond energy, atom, 4k, cinematic -gs 24 -motion 2 -ar 16:9 -fps 24 ",
39+
"prompt": "cowboy running in slow motion in a field ",
3440
"negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
35-
"width": 256,
36-
"height": 256,
41+
"width": 704,
42+
"height": 480,
3743
"guidance_scale": 3,
3844
"guidance_rescale": 0.3,
3945
},
46+
# big resolution
4047
{
41-
"prompt": "fight naruto vs saske ",
48+
"prompt": "House in front of a lake and the wind blowing through the trees ",
4249
"negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
43-
"width": 480,
50+
"width": 1216,
4451
"height": 704,
4552
"guidance_scale": 3,
53+
"guidance_rescale": 0.3,
54+
},
55+
# guidance_rescale 0
56+
{
57+
"prompt": "fight naruto vs saske ",
58+
"negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
59+
"width": 704,
60+
"height": 480,
61+
"guidance_scale": 3,
4662
"guidance_rescale": 0,
4763
},
64+
# guidance_scale 1
4865
{
4966
"prompt": "reporter in front of the TV cameras talking about the joker ",
5067
"negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
51-
"width": 480,
52-
"height": 704,
68+
"width": 704,
69+
"height": 480,
5370
"guidance_scale": 1,
5471
"guidance_rescale": 0.3,
5572
},
73+
# guidance_scale 1 guidance_rescale 0
5674
{
5775
"prompt": "Realistic night silhouette of a white Lwxux LX III 2008 with headlights on driving on in the fog in the dark ",
5876
"negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
59-
"width": 480,
60-
"height": 704,
77+
"width": 704,
78+
"height": 480,
6179
"guidance_scale": 1,
6280
"guidance_rescale": 0,
6381
},
82+
# small resolution
6483
{
6584
"prompt": "indian womens wahsing clothes at river side ",
6685
"negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
67-
"width": 480,
68-
"height": 704,
69-
"guidance_scale": 3,
70-
"guidance_rescale": 0.3,
71-
},
72-
{
73-
"prompt": "octopus figure skating, cartoon ",
74-
"negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
75-
"width": 480,
76-
"height": 704,
86+
"width": 256,
87+
"height": 128,
7788
"guidance_scale": 3,
7889
"guidance_rescale": 0.3,
7990
},
91+
# big prompt
8092
{
8193
"prompt": "Levitating woman uses magic and fairy dusty spews forth from her fingers. cinematic shot photos taken by ARRI, photos taken "
8294
+ "by sony, photos taken by canon, photos taken by nikon, photos taken by sony, photos taken by hasselblad ",
8395
"negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
84-
"width": 480,
85-
"height": 704,
96+
"width": 704,
97+
"height": 480,
8698
"guidance_scale": 3,
8799
"guidance_rescale": 0.3,
88100
},
101+
# big prompt, small resolution
89102
{
90103
"prompt": "A mythical river adventure in the Yellow River basin during ancient times, where majestic dragons soar through the turbulent waters, "
91104
+ "casting a vibrant glow on the submerged landscapes, blending a sense of awe and fantasy, Sculpture, intricate clay model with luminescent "
92105
+ "elements, --ar 16:9 --v 5 ",
93106
"negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
94-
"width": 480,
95-
"height": 704,
107+
"width": 256,
108+
"height": 128,
96109
"guidance_scale": 3,
97110
"guidance_rescale": 0.3,
98111
},
@@ -104,8 +117,8 @@ class Text2VideoEvaluator(BaseEvaluator):
104117
DEF_NUM_FRAMES = 25
105118
DEF_NUM_INF_STEPS = 25
106119
DEF_FRAME_RATE = 25
107-
DEF_WIDTH = 480
108-
DEF_HEIGHT = 704
120+
DEF_WIDTH = 704
121+
DEF_HEIGHT = 480
109122
DEF_GUIDANCE_SCALE = 3
110123
DEF_GUIDANCE_RESCALE = 0.3
111124

@@ -119,7 +132,7 @@ def __init__(
119132
num_frames=25,
120133
crop_prompts=True,
121134
num_samples=None,
122-
gen_image_fn=None,
135+
gen_video_fn=None,
123136
seed=42,
124137
is_genai=False,
125138
) -> None:
@@ -136,20 +149,20 @@ def __init__(
136149
self.similarity = VideoSimilarity()
137150
self.last_cmp = None
138151
self.gt_dir = os.path.dirname(gt_data)
139-
self.generation_fn = gen_image_fn
152+
self.generation_fn = gen_video_fn
140153
self.is_genai = is_genai
141154
self.num_frames = num_frames or self.DEF_NUM_FRAMES
142155
self.frame_rate = self.DEF_FRAME_RATE
143156

144157
if base_model:
145-
self.gt_data = self._generate_data(base_model, gen_image_fn, os.path.join(self.gt_dir, "reference"))
158+
self.gt_data = self._generate_data(base_model, gen_video_fn, os.path.join(self.gt_dir, "reference"))
146159
else:
147160
self.gt_data = pd.read_csv(gt_data, keep_default_na=False)
148161

149162
def get_generation_fn(self):
150163
return self.generation_fn
151164

152-
def score(self, model_or_data, gen_image_fn=None, output_dir=None, **kwargs):
165+
def score(self, model_or_data, gen_video_fn=None, output_dir=None, **kwargs):
153166
if output_dir is None:
154167
video_folder = os.path.join(self.gt_dir, "target")
155168
else:
@@ -158,7 +171,7 @@ def score(self, model_or_data, gen_image_fn=None, output_dir=None, **kwargs):
158171
if isinstance(model_or_data, str) and os.path.exists(model_or_data):
159172
predictions = pd.read_csv(model_or_data, keep_default_na=False)
160173
else:
161-
predictions = self._generate_data(model_or_data, gen_image_fn, video_folder)
174+
predictions = self._generate_data(model_or_data, gen_video_fn, video_folder)
162175
self.predictions = predictions
163176

164177
all_metrics_per_prompt = {}
@@ -185,8 +198,8 @@ def worst_examples(self, top_k: int = 5, metric="similarity"):
185198

186199
return res
187200

188-
def _generate_data(self, model, gen_image_fn=None, videos_dir="reference"):
189-
def default_gen_image_fn(
201+
def _generate_data(self, model, gen_video_fn=None, videos_dir="reference"):
202+
def default_gen_video_fn(
190203
model,
191204
prompt,
192205
negative_prompt,
@@ -214,8 +227,8 @@ def default_gen_image_fn(
214227
)
215228
return output.frames[0]
216229

217-
# generation_fn = gen_image_fn or default_gen_image_fn
218-
generation_fn = default_gen_image_fn
230+
# generation_fn = gen_video_fn or default_gen_video_fn
231+
generation_fn = default_gen_video_fn
219232

220233
if self.test_data:
221234
if isinstance(self.test_data, str):
@@ -262,6 +275,9 @@ def default_gen_image_fn(
262275
frame_path = os.path.join(video_path, f"{number}.png")
263276
frame.save(frame_path)
264277
videos.append(video_path)
278+
# video_path = os.path.join(videos_dir, f"video_{i}.mp4")
279+
# export_to_video(frames, video_path, self.frame_rate)
280+
# videos.append(video_path)
265281

266282
res_data["videos"] = videos
267283
df = pd.DataFrame(res_data)

tools/who_what_benchmark/whowhatbench/wwb.py

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ def parse_args():
180180
"--num-inference-steps",
181181
type=int,
182182
default=4,
183-
help="Text-to-image specific parameter that defines the number of denoising steps.",
183+
help="Text-to-image/text-to-video specific parameter that defines the number of denoising steps.",
184184
)
185185
parser.add_argument(
186186
"--seed",
@@ -283,8 +283,9 @@ def parse_args():
283283
"--video-frames-num",
284284
type=int,
285285
default=None,
286-
help="The number of frames that will be taken from video for input, the frames will be taken evenly across the entire length, "
287-
"applicable for Visual Language Models with video inputs",
286+
help="For Visual Language Models with video inputs - the number of frames that will be taken from video for input, "
287+
"the frames will be taken evenly across the entire length. "
288+
"For Text-to-video - the number of frames, which will be generated by model.",
288289
)
289290

290291
return parser.parse_args()
@@ -514,6 +515,22 @@ def genai_gen_image2image(model, prompt, image, num_inference_steps, generator=N
514515
return image
515516

516517

518+
def genai_gen_text2video(
519+
model,
520+
prompt,
521+
negative_prompt,
522+
num_inference_steps,
523+
width=704,
524+
height=480,
525+
num_frames=25,
526+
frame_rate=25,
527+
guidance_scale=3,
528+
guidance_rescale=0.3,
529+
generator=None,
530+
):
531+
return None
532+
533+
517534
def genai_gen_inpainting(model, prompt, image, mask, num_inference_steps, generator=None):
518535
image_data = ov.Tensor(np.array(image)[None])
519536
mask_data = ov.Tensor(np.array(mask)[None])
@@ -624,7 +641,7 @@ def create_evaluator(base_model, args):
624641
num_samples=args.num_samples,
625642
num_inference_steps=args.num_inference_steps,
626643
num_frames=args.video_frames_num,
627-
gen_image_fn=genai_gen_image if args.genai else None,
644+
gen_video_fn=genai_gen_text2video if args.genai else None,
628645
is_genai=args.genai,
629646
seed=args.seed,
630647
)

0 commit comments

Comments
 (0)