33"""
44from llava .action .ek_eval import prepare_llava
55from llava .action .generate_interval_pred import get_lookup_dict
6- from llava .action .inference import llava_inference
7-
8- val_metadata = '/data/shaokai/epic-kitchens-100-annotations/EPIC_100_validation.csv'
9- root = '/data/shaokai/EK100_512/EK100'
6+ from llava .action .llava_inference import llava_inference
7+
8+ from llava .constants import IMAGE_TOKEN_INDEX , DEFAULT_IMAGE_TOKEN
# Previous local dataset paths, kept for reference:
# val_metadata = '/data/shaokai/epic-kitchens-100-annotations/EPIC_100_validation.csv'
# root = '/data/shaokai/EK100_512/EK100'
val_metadata = '/iopsstor/scratch/cscs/hqi/VFM/EK100/epic-kitchens-100-annotations/EPIC_100_validation.csv'
root = '/iopsstor/scratch/cscs/hqi/VFM/onevision/EK100_512/EK100'

n_frames = 32  # number of frames sampled from each clip
action_representation = 'GT_random_narration'
perspective = 'first_person'
include_time_instruction = False
# NOTE(review): image_token comes from llava.constants; IMAGE_TOKEN_INDEX is
# imported above but unused here — presumably needed by callers; confirm.
image_token = DEFAULT_IMAGE_TOKEN
1322def get_frames_by_uid (uid , root ):
1423 from llava .action .utils import avion_video_loader
@@ -29,32 +38,63 @@ def get_frames_by_uid(uid, root):
2938 fast_rrc = False ,
3039 fast_rcc = False ,
3140 jitter = False )
32- return frames
41+ return frames , time_meta
42+ #
43+
44+
45+
46+
47+
3348
34- def inference_task_by_uid (checkpoint_folder , uid , task ):
# for prior actions
def get_meta_data():
    """Stub for prior-action metadata lookup; currently a no-op (returns None)."""
    pass
52+
53+
def inference_task_by_uid(question, checkpoint_folder, uid, task):
    """Run LLaVA inference for a single EK100 clip and print the prediction.

    Loads the model from ``checkpoint_folder``, fetches the frames for
    ``uid`` from the module-level ``root``, and queries the model with
    ``question``. When ``task`` contains ``'temporal_cot'``, prior-action
    metadata is looked up from the validation annotations and attached.

    Parameters
    ----------
    question : str
        Free-form question (or prompt) passed to ``llava_inference``.
    checkpoint_folder : str
        Path to the LLaVA checkpoint to load.
    uid : str
        Clip identifier, e.g. ``'P28-P28_15_50.66_51.69'``.
    task : str
        Test type forwarded to ``llava_inference``.

    Returns
    -------
    The prediction from ``llava_inference`` (also printed to stdout).
    """
    tokenizer, model, image_processor, max_length = prepare_llava(checkpoint_folder)

    frames, time_meta = get_frames_by_uid(uid, root)

    meta_data = None
    learn_neighbor_actions = ""
    if 'temporal_cot' in task:
        # Attach ground-truth neighbor actions so the model can condition
        # on prior actions ("prior" mode).
        lookup_table = get_lookup_dict(val_metadata,
                                       action_representation,
                                       test_type=task,
                                       pseudo_folder='')
        meta_data = lookup_table.get(uid, None)
        learn_neighbor_actions = "prior"

    pred = llava_inference(
        [frames],
        tokenizer,
        model,
        image_processor,
        question,
        test_type=task,
        clip_length=n_frames,
        num_frames=n_frames,
        temperature=0,  # deterministic (greedy) decoding
        time_meta=time_meta,
        learn_neighbor_actions=learn_neighbor_actions,
        meta_data=meta_data,
        perspective=perspective,
        include_time_instruction=include_time_instruction,
    )
    print(pred)
    # Returned as well as printed so callers can reuse the prediction
    # (previously the value was discarded). The unused local
    # ``video_duration = time_meta['duration']`` was removed.
    return pred
90+
if __name__ == '__main__':
    # Demo: ask one open-ended question about a single EK100 validation clip.
    pretrained_model_folder = 'experiments/dev_LLaVA-Video-7B-Qwen2_64f_top5_gpt4o_avion_tim_last_layer_one_token_detection_direct_neighbor_178K_100percent_time'
    uid = 'P28-P28_15_50.66_51.69'
    task = 'open-ended'
    question = "What is the object that is to the left of the knife?"

    inference_task_by_uid(question, pretrained_model_folder, uid, task)
0 commit comments