Skip to content

Commit 0210073

Browse files
committed
fixes
1 parent 031c80c commit 0210073

File tree

5 files changed

+28
-19
lines changed

5 files changed

+28
-19
lines changed

tools/who_what_benchmark/tests/test_cli_videos.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,7 @@ def teardown_module():
2828

2929
@pytest.mark.parametrize(
3030
("model_id"),
31-
["Lightricks/LTX-Video"],
31+
[("optimum-intel-internal-testing/tiny-random-ltx-video", "text-to-video")],
3232
)
3333
def test_image_model_genai(model_id, model_type, tmp_path):
3434
GT_FILE = tmp_path / "gt.csv"

tools/who_what_benchmark/whowhatbench/model_loaders.py

Lines changed: 1 addition & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -646,11 +646,7 @@ def load_text2video_model(model_id, device="CPU", ov_config=None, use_hf=False,
646646
model = OVLTXPipeline.from_pretrained(model_id, device=device, **model_kwargs)
647647
except ValueError:
648648
model = OVLTXPipeline.from_pretrained(
649-
model_id,
650-
trust_remote_code=True,
651-
use_cache=True,
652-
device=device,
653-
**model_kwargs
649+
model_id, trust_remote_code=True, use_cache=True, device=device, **model_kwargs
654650
)
655651

656652
return model

tools/who_what_benchmark/whowhatbench/text2video_evaluator.py

Lines changed: 7 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -79,7 +79,7 @@
7979
},
8080
{
8181
"prompt": "Levitating woman uses magic and fairy dusty spews forth from her fingers. cinematic shot photos taken by ARRI, photos taken "
82-
+ "by sony, photos taken by canon, photos taken by nikon, photos taken by sony, photos taken by hasselblad ",
82+
+ "by sony, photos taken by canon, photos taken by nikon, photos taken by sony, photos taken by hasselblad ",
8383
"negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
8484
"width": 480,
8585
"height": 704,
@@ -88,8 +88,8 @@
8888
},
8989
{
9090
"prompt": "A mythical river adventure in the Yellow River basin during ancient times, where majestic dragons soar through the turbulent waters, "
91-
+ "casting a vibrant glow on the submerged landscapes, blending a sense of awe and fantasy, Sculpture, intricate clay model with luminescent "
92-
+ "elements, --ar 16:9 --v 5 ",
91+
+ "casting a vibrant glow on the submerged landscapes, blending a sense of awe and fantasy, Sculpture, intricate clay model with luminescent "
92+
+ "elements, --ar 16:9 --v 5 ",
9393
"negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
9494
"width": 480,
9595
"height": 704,
@@ -123,9 +123,9 @@ def __init__(
123123
seed=42,
124124
is_genai=False,
125125
) -> None:
126-
assert (
127-
base_model is not None or gt_data is not None
128-
), "Text generation pipeline for evaluation or ground trush data must be defined"
126+
assert base_model is not None or gt_data is not None, (
127+
"Text generation pipeline for evaluation or ground trush data must be defined"
128+
)
129129

130130
self.test_data = test_data
131131
self.metrics = metrics
@@ -158,9 +158,7 @@ def score(self, model_or_data, gen_image_fn=None, output_dir=None, **kwargs):
158158
if isinstance(model_or_data, str) and os.path.exists(model_or_data):
159159
predictions = pd.read_csv(model_or_data, keep_default_na=False)
160160
else:
161-
predictions = self._generate_data(
162-
model_or_data, gen_image_fn, video_folder
163-
)
161+
predictions = self._generate_data(model_or_data, gen_image_fn, video_folder)
164162
self.predictions = predictions
165163

166164
all_metrics_per_prompt = {}

tools/who_what_benchmark/whowhatbench/whowhat_metrics.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111

1212
import numpy as np
1313
from sentence_transformers import SentenceTransformer, util
14-
from transformers import CLIPImageProcessor, CLIPModel, LlavaNextVideoProcessor, LlavaNextVideoModel
14+
from transformers import CLIPImageProcessor, CLIPModel
1515
from tqdm import tqdm
1616
from sklearn.metrics.pairwise import cosine_similarity
1717

@@ -236,6 +236,8 @@ def evaluate(self, data_gold, data_prediction):
236236

237237
class VideoSimilarity:
238238
def __init__(self) -> None:
239+
from transformers import LlavaNextVideoProcessor, LlavaNextVideoModel
240+
239241
self.processor = LlavaNextVideoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")
240242
self.model = LlavaNextVideoModel.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf").eval()
241243

tools/who_what_benchmark/whowhatbench/wwb.py

Lines changed: 16 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -63,8 +63,17 @@ def parse_args():
6363
parser.add_argument(
6464
"--model-type",
6565
type=str,
66-
choices=["text", "text-to-image", "text-to-video", "visual-text", "visual-video-text", "image-to-image",
67-
"image-inpainting", "text-embedding", "text-reranking"],
66+
choices=[
67+
"text",
68+
"text-to-image",
69+
"text-to-video",
70+
"visual-text",
71+
"visual-video-text",
72+
"image-to-image",
73+
"image-inpainting",
74+
"text-embedding",
75+
"text-reranking",
76+
],
6877
default="text",
6978
help="Indicated the model type: text - for causal text generation, visual-text - for Visual Language Models with image inputs, "
7079
"visual-video-text - for Visual Language Models with video inputs, text-to-image - for image generation, "
@@ -862,7 +871,11 @@ def main():
862871
if args.verbose and (args.target_model or args.target_data):
863872
if args.model_type in ["text", "visual-text", "visual-video-text"]:
864873
print_text_results(evaluator)
865-
elif "text-to-image" in args.model_type or "image-to-image" in args.model_type or "text-to-video" in args.model_type:
874+
elif (
875+
"text-to-image" in args.model_type
876+
or "image-to-image" in args.model_type
877+
or "text-to-video" in args.model_type
878+
):
866879
print_image_results(evaluator)
867880
elif args.model_type in ['text-embedding']:
868881
print_embeds_results(evaluator)

0 commit comments

Comments (0)