
Commit 8880316

Author: Ye Shaokai
Commit message: updates
1 parent dfa22ed commit 8880316

File tree: 6 files changed, +172 -43 lines changed


llava/action/make_visualizations.py

Lines changed: 6 additions & 5 deletions
@@ -163,7 +163,7 @@ def save_visualization(vis_folder, frames, uid):
         video_out.write(bgr_frame)
     video_out.release()

-def visualize_with_uid(uid, out_folder):
+def visualize_with_uid(data_root, uid, out_folder):
     from llava.action.utils import avion_video_loader

     vid_path = '_'.join(uid.split('_')[:2]).replace('-', '/')
@@ -252,7 +252,8 @@ def visualize_with_llava(pretrained_path, uid, question_type, gen_type):
 # llava_pretrained_path = 'experiments/LLaVA-Video-7B-Qwen2'
 # uid = 'P01-P01_11_182.65_192.07'
 # visualize_with_llava(llava_pretrained_path, uid, 'caption', 'tim')
-# visualize_with_uid("P28-P28_16_73.84_74.66")
-# visualize_with_uid("P28-P28_15_50.66_51.69")
-# visualize_with_uid("P26-P26_41_113.0_114.1")
-visualize_with_uid("P28-P28_26_45.97_46.97", "key_confusing_examples")
+# visualize_with_uid(root, "P28-P28_16_73.84_74.66")
+# visualize_with_uid(root, "P28-P28_15_50.66_51.69")
+# visualize_with_uid(root, "P26-P26_41_113.0_114.1")
+visualize_with_uid(root, 'P23-P23_05_217.41_218.39', 'figure1_vis')
+# visualize_with_uid(root, "P28-P28_26_45.97_46.97", "key_confusing_examples")
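For reference, a minimal sketch of how the updated visualize_with_uid signature is called after this change; the data_root path is the one set in prepare_demo.py in this same commit, and the UID is the clip enabled above:

from llava.action.make_visualizations import visualize_with_uid

# Hedged usage sketch: visualize_with_uid now expects data_root as its first argument.
data_root = '/data/shaokai/EK100_512/EK100'   # path taken from prepare_demo.py below
uid = 'P23-P23_05_217.41_218.39'              # clip UID enabled in this diff
visualize_with_uid(data_root, uid, 'figure1_vis')  # writes the rendered clip into the 'figure1_vis' folder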

llava/action/prepare_demo.py

Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
+import json
+from llava.action.make_visualizations import visualize_with_uid
+from llava.action.selective_inference import SelectiveInferencer
+import random
+import os
+# 1) iterate through llava_win json, retrieve a list of uids
+# 2) save the corresponding video clips
+# 3) add caption and free-end question answering
+# after 1) and 3), there should be one single json file that uses uid as the key
+# and it contains: caption (chatgpt, llavaction), mqa (chatgpt, llavaction, gt)
+
+
+def load_llava_wins(path):
+    with open(path, 'r') as f:
+        data = json.load(f)
+    return data
+
+def save_video_clips_with_uids(data_root, llava_win_path, vis_folder, checkpoint_folder):
+    llava_wins = load_llava_wins(llava_win_path)
+    uids = list(llava_wins.keys())
+    random.shuffle(uids)
+    sample_uids = uids[:20]
+    ret = {}
+    inferencer = SelectiveInferencer(data_root,
+                                     checkpoint_folder,
+                                     include_time_instruction = False,
+                                     n_frames = 32)
+    count = 0
+    for uid in sample_uids:
+        if count > 10:
+            break
+        data = llava_wins[uid]
+        if data['tim_chatgpt_pred'] not in data['llavaction_options']:
+            continue
+        data.pop('llava_pred')
+        data.pop('llava_options')
+        #data.pop('tim_chatgpt_pred')
+        data.pop('random_chatgpt_pred')
+        data.pop('tim_chatgpt_options')
+        data.pop('random_chatgpt_options')
+        visualize_with_uid(data_root, uid, vis_folder)
+        open_ended = get_open_ended_question(inferencer, uid, checkpoint_folder)
+        caption = get_caption(inferencer, uid, checkpoint_folder)
+        data['open_ended'] = open_ended
+        data['caption'] = caption
+        ret[uid] = data
+        count += 1
+    with open('demo_videos/demo.json', 'w') as f:
+        json.dump(ret, f, indent=4)
+
+def get_open_ended_question(inferencer,
+                            uid,
+                            checkpoint_folder):
+    mqa = inferencer.inference('what objects are visible in the video?',
+                               uid,
+                               'open-ended')
+    return mqa
+
+def get_caption(inferencer,
+                uid,
+                checkpoint_folder):
+    caption = inferencer.inference('',
+                                   uid,
+                                   'caption')
+    return caption
+
+
+if __name__ == '__main__':
+    llava_win_path = 'llavaction_win.json'
+    vis_folder = 'demo_videos'
+    os.makedirs(vis_folder, exist_ok = True)
+    checkpoint_folder = 'experiments/dev_7b_16f_top5_strong_first_layer_three_tokens_detection_and_direct_llava_video_10percent/checkpoint-15000/'
+    data_root = '/data/shaokai/EK100_512/EK100'
+    save_video_clips_with_uids(data_root,
+                               llava_win_path,
+                               vis_folder,
+                               checkpoint_folder)
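To make the intended output of prepare_demo.py concrete, here is a rough sketch of one entry in demo_videos/demo.json, assuming the input llavaction_win.json records carry the fields written by search_llavaction_win in vis_utils.py below. All values are placeholders, not real predictions:

# Hypothetical shape of a single demo.json entry (uid -> record); values are illustrative only.
demo_entry = {
    'P23-P23_05_217.41_218.39': {
        'gt': 'open drawer',                          # ground-truth action
        'tim_chatgpt_pred': 'close drawer',           # TIM-prompted GPT-4o prediction kept by the filter
        'llavaction_pred': 'open drawer',             # LLaVAction prediction (correct by construction)
        'llavaction_options': ['open drawer', 'close drawer', 'wash plate'],
        'open_ended': 'A drawer, a plate and a sponge are visible.',   # answer to the fixed open-ended question
        'caption': 'The person opens the drawer under the counter.',   # caption from SelectiveInferencer
    }
}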

llava/action/selective_inference.py

Lines changed: 47 additions & 5 deletions
@@ -4,6 +4,7 @@
 from llava.action.ek_eval import prepare_llava
 from llava.action.generate_interval_pred import get_lookup_dict
 from llava.action.llava_inference import llava_inference
+from llava.action.utils import avion_video_loader

 from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
 # val_metadata = '/data/shaokai/epic-kitchens-100-annotations/EPIC_100_validation.csv'
@@ -20,8 +21,8 @@


 def get_frames_by_uid(uid, root):
-    from llava.action.utils import avion_video_loader
     vid_path = '_'.join(uid.split('_')[:2]).replace('-', '/')
+    print ('debug', uid)
     start_timestamp, end_timestamp = uid.split('_')[2:]
     start_timestamp = float(start_timestamp)
     end_timestamp = float(end_timestamp)
@@ -51,11 +52,11 @@ def get_meta_data():
     pass


-def inference_task_by_uid(question, checkpoint_folder, uid, task):
+def inference_task_by_uid(data_root, question, checkpoint_folder, uid, task):

     tokenizer, model, image_processor, max_length = prepare_llava(checkpoint_folder)

-    frames, time_meta = get_frames_by_uid(uid, root)
+    frames, time_meta = get_frames_by_uid(uid, data_root)

     meta_data = None
     learn_neighbor_actions = ""
@@ -86,15 +87,56 @@ def inference_task_by_uid(question, checkpoint_folder, uid, task):
         perspective = perspective,
         include_time_instruction = include_time_instruction
     )
-    print (pred)
+    return pred
+
+class SelectiveInferencer:
+    def __init__(self, data_root, checkpoint_folder, include_time_instruction = False, n_frames = 32):
+        self.data_root = data_root
+        self.checkpoint_folder = checkpoint_folder
+        self.tokenizer, self.model, self.image_processor, self.max_length = prepare_llava(checkpoint_folder)
+        self.include_time_instruction = include_time_instruction
+        self.n_frames = n_frames
+    def inference(self, question, uid, task):
+        frames, time_meta = get_frames_by_uid(uid, self.data_root)
+
+        meta_data = None
+        learn_neighbor_actions = ""
+        if 'temporal_cot' in task:
+            lookup_table = get_lookup_dict(val_metadata,
+                                           action_representation,
+                                           test_type = task,
+                                           pseudo_folder = '')
+            meta_data = lookup_table.get(uid, None)
+            learn_neighbor_actions = "prior"
+
+
+        pred = llava_inference(
+            [frames],
+            self.tokenizer,
+            self.model,
+            self.image_processor,
+            question,
+            test_type = task,
+            clip_length = self.n_frames,
+            num_frames = self.n_frames,
+            temperature = 0,
+            time_meta = time_meta,
+            learn_neighbor_actions = learn_neighbor_actions,
+            meta_data = meta_data,
+            perspective = perspective,
+            include_time_instruction = self.include_time_instruction
+        )
+        return pred
+

 if __name__ == '__main__':
     pretrained_model_folder = 'experiments/dev_LLaVA-Video-7B-Qwen2_64f_top5_gpt4o_avion_tim_last_layer_one_token_detection_direct_neighbor_178K_100percent_time'
     uid = 'P28-P28_15_50.66_51.69'
     task = 'open-ended'
     question = "What is the object that is to the left of the knife?"

-    inference_task_by_uid(question,
+    inference_task_by_uid(data_root,
+                          question,
                           pretrained_model_folder,
                           uid,
                           task)
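A minimal usage sketch of the new SelectiveInferencer class, mirroring how prepare_demo.py drives it; the paths and checkpoint folder are the ones that appear elsewhere in this commit, and running it assumes the EK100 clips and trained weights are available locally:

from llava.action.selective_inference import SelectiveInferencer

# Sketch only: assumes local access to the EK100 data and the checkpoint from this commit.
data_root = '/data/shaokai/EK100_512/EK100'
checkpoint_folder = 'experiments/dev_7b_16f_top5_strong_first_layer_three_tokens_detection_and_direct_llava_video_10percent/checkpoint-15000/'

inferencer = SelectiveInferencer(data_root,
                                 checkpoint_folder,
                                 include_time_instruction=False,
                                 n_frames=32)

uid = 'P28-P28_15_50.66_51.69'
answer = inferencer.inference('what objects are visible in the video?', uid, 'open-ended')  # open-ended QA
caption = inferencer.inference('', uid, 'caption')  # captioning reuses the same frame pipeline with an empty question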

llava/action/utils.py

Lines changed: 1 addition & 1 deletion
@@ -660,7 +660,7 @@ def avion_video_loader(root, vid, ext, second, end_second,
     chunk_start = int(second) // chunk_len * chunk_len
     chunk_end = int(end_second) // chunk_len * chunk_len
     while True:
-        video_filename = osp.join(root, '{}.{}'.format(vid, ext), '{}.{}'.format(chunk_end, ext))
+        video_filename = osp.join(root, '{}.{}'.format(vid, ext), '{}.{}'.format(chunk_end, ext))
         if not osp.exists(video_filename):
            # print("{} does not exists!".format(video_filename))
            chunk_end -= chunk_len
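For context, a small standalone sketch of the chunk-file probing that this hunk touches: avion_video_loader rounds the requested timestamps down to chunk boundaries and walks chunk_end backwards until a chunk file exists on disk. The function name, chunk_len value, and extension below are illustrative assumptions, not the library's API:

import os.path as osp

def find_existing_chunk(root, vid, ext, end_second, chunk_len=15):
    # Round the requested end time down to a chunk boundary, then walk
    # backwards until a chunk file is actually present on disk.
    chunk_end = int(end_second) // chunk_len * chunk_len
    while chunk_end >= 0:  # guard added here; the loader itself uses 'while True'
        video_filename = osp.join(root, '{}.{}'.format(vid, ext), '{}.{}'.format(chunk_end, ext))
        if osp.exists(video_filename):
            return video_filename
        chunk_end -= chunk_len
    return None

# e.g. find_existing_chunk('/data/shaokai/EK100_512/EK100', 'P23/P23_05', 'MP4', 218.39)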

llava/action/vis_utils.py

Lines changed: 11 additions & 9 deletions
@@ -237,19 +237,21 @@ def search_llavaction_win(tim_chatgpt_file,
         llavaction_options = llavaction_pred[uid]['options']
         if llavaction_pred[uid]['pred'] == llavaction_pred[uid]['gt'] and \
            tim_chatgpt_pred[uid]['pred'] != tim_chatgpt_pred[uid]['gt'] and \
-           llava_pred[uid]['pred'] != llava_pred[uid]['gt']:
+           llava_pred[uid]['pred'] != llava_pred[uid]['gt'] and \
+           random_chatgpt_pred[uid]['pred'] == random_chatgpt_pred[uid]['gt']:

             results[uid] = {'gt': tim_chatgpt_pred[uid]['gt'],
                             'tim_chatgpt_pred': tim_chatgpt_pred[uid]['pred'],
+                            'random_chatgpt_pred': random_chatgpt_pred[uid]['pred'],
                             'llava_pred': llava_pred[uid]['pred'],
                             'llavaction_pred': llavaction_pred[uid]['pred'],
                             'tim_chatgpt_options': tim_chatgpt_options,
                             'llava_options': llava_options,
                             'llavaction_options': llavaction_options,
                             'random_chatgpt_options': random_chatgpt_options}
-            # write results to a file
-            with open('llavaction_win.json', 'w') as f:
-                json.dump(results, f, indent = 4)
+    # write results to a file
+    with open('llavaction_win.json', 'w') as f:
+        json.dump(results, f, indent = 4)

 def get_wrong_prediction_uids(prediction_folder, ann_file):
     """
@@ -331,14 +333,14 @@ def walk_through(ann_file):
 ann_file = '/data/shaokai/epic-kitchens-100-annotations/EPIC_100_validation.csv'
 prediction_folder = '/data/shaokai/predictions_for_vis/dev_7b_16f_top5_full_includes_tim/'
 #walk_through(ann_file)
-get_wrong_prediction_uids(prediction_folder, ann_file)
+#get_wrong_prediction_uids(prediction_folder, ann_file)
 root = '/data/shaokai/predictions_for_vis/'
 chatgpt_tim_file = os.path.join(root, 'gpt-4o-2024-08-06_tim_GT_random_narration_top5_8f_9668samples.json')
 chatgpt_random_file = os.path.join(root, 'gpt-4o-2024-08-06_random_GT_random_narration_top5_8f_9668samples.json')
 llava_zeroshot_folder = os.path.join(root, 'LLaVA_Video_7B')
 llavaction_folder = os.path.join(root, 'LLaVAction_7B')
-# search_llavaction_win(chatgpt_tim_file,
-#                       chatgpt_random_file,
-#                       llava_zeroshot_folder,
-#                       llavaction_folder)
+search_llavaction_win(chatgpt_tim_file,
+                      chatgpt_random_file,
+                      llava_zeroshot_folder,
+                      llavaction_folder)
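Restated as a small standalone predicate, the tightened selection rule from the first hunk above (a sketch; the four prediction dictionaries are assumed to be keyed by uid with 'pred' and 'gt' fields, as in the diff):

def is_llavaction_win(uid, llavaction_pred, tim_chatgpt_pred, llava_pred, random_chatgpt_pred):
    # Keep a clip only when LLaVAction is correct, zero-shot LLaVA-Video and the
    # TIM-prompted GPT-4o are both wrong, and (new in this commit) the randomly
    # prompted GPT-4o happens to be correct.
    return (llavaction_pred[uid]['pred'] == llavaction_pred[uid]['gt']
            and tim_chatgpt_pred[uid]['pred'] != tim_chatgpt_pred[uid]['gt']
            and llava_pred[uid]['pred'] != llava_pred[uid]['gt']
            and random_chatgpt_pred[uid]['pred'] == random_chatgpt_pred[uid]['gt'])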

shaokai_generate_train.sh

Lines changed: 30 additions & 23 deletions
@@ -1,4 +1,4 @@
-# pip install moviepy spacy==3.7.5 numpy==1.26.1 && python -m spacy download en_core_web_sm &&
+pip install moviepy spacy==3.7.5 numpy==1.26.1 && python -m spacy download en_core_web_sm &&
 export PYTHONPATH=/mnt/SV_storage/VFM/LLaVA-NeXT:$PYTHONPATH
 # export PYTHONPATH=/usr/local/lib/python3.10/site-packages/decord-0.6.0-py3.10-linux-x86_64.egg/:$PYTHONPATH

@@ -109,7 +109,6 @@ export PYTHONPATH=/mnt/SV_storage/VFM/LLaVA-NeXT:$PYTHONPATH



-<<<<<<< HEAD
 # python3 llava/action/generate_description.py \
 #     --train_metadata /data/shaokai/epic-kitchens-100-annotations/EPIC_100_train.csv \
 #     --out_folder /data/shaokai/EK100_inst_train/cross_validation \
@@ -118,13 +117,21 @@ export PYTHONPATH=/mnt/SV_storage/VFM/LLaVA-NeXT:$PYTHONPATH
 #     --action_representation GT_random_narration \
 #     --n_options 20

-# python3 llava/action/generate_description.py \
-#     --train_metadata /data/shaokai/epic-kitchens-100-annotations/EPIC_100_train.csv \
-#     --out_folder /data/shaokai/EK100_inst_train/cross_validation \
-#     --train_predictions /data/shaokai/TIM_PREDS/tim_pred_ids_train_cross.json \
-#     --gen_type tim_mc \
-#     --action_representation official_key \
-#     --n_options 20
+python3 llava/action/generate_description.py \
+    --train_metadata /data/shaokai/epic-kitchens-100-annotations/EPIC_100_train.csv \
+    --out_folder /data/shaokai/EK100_inst_train/cross_validation \
+    --train_predictions /data/shaokai/TIM_PREDS/tim_pred_ids_train_cross.json \
+    --gen_type tim_mc \
+    --action_representation official_key \
+    --n_options 40
+
+python3 llava/action/generate_description.py \
+    --train_metadata /data/shaokai/epic-kitchens-100-annotations/EPIC_100_train.csv \
+    --out_folder /data/shaokai/EK100_inst_train/cross_validation \
+    --train_predictions /data/shaokai/TIM_PREDS/tim_pred_ids_train_cross.json \
+    --gen_type tim_mc \
+    --action_representation official_key \
+    --n_options 80


 # python3 llava/action/generate_description.py \
@@ -148,22 +155,22 @@ export PYTHONPATH=/mnt/SV_storage/VFM/LLaVA-NeXT:$PYTHONPATH
 #     --gen_type direct_narration \
 #     --action_representation GT_random_narration \

-python3 llava/action/generate_description.py \
-    --train_metadata /data/shaokai/epic-kitchens-100-annotations/EPIC_100_train.csv \
-    --out_folder /data/shaokai/EK100_inst_train/\
-    --gen_type direct_narration \
-    --action_representation official_key \
-=======
+# python3 llava/action/generate_description.py \
+#     --train_metadata /data/shaokai/epic-kitchens-100-annotations/EPIC_100_train.csv \
+#     --out_folder /data/shaokai/EK100_inst_train/\
+#     --gen_type direct_narration \
+#     --action_representation official_key \



-python3 llava/action/generate_description.py \
-    --train_metadata /mnt/SV_storage/VFM/EK100/epic-kitchens-100-annotations/EPIC_100_train.csv \
-    --out_folder /mnt/SV_storage/VFM/EK100/EK100_in_LLAVA/Cross_3 \
-    --train_predictions /mnt/SV_storage/VFM/EK100/EK100_in_LLAVA/Cross_3/tim_pred_ids_train_3cross.json \
-    --gen_type tim_mc \
-    --action_representation official_key \
-    --n_narrations 5
+
+# python3 llava/action/generate_description.py \
+#     --train_metadata /mnt/SV_storage/VFM/EK100/epic-kitchens-100-annotations/EPIC_100_train.csv \
+#     --out_folder /mnt/SV_storage/VFM/EK100/EK100_in_LLAVA/Cross_3 \
+#     --train_predictions /mnt/SV_storage/VFM/EK100/EK100_in_LLAVA/Cross_3/tim_pred_ids_train_3cross.json \
+#     --gen_type tim_mc \
+#     --action_representation official_key \
+#     --n_narrations 5

 # python3 llava/action/generate_description.py \
 #     --train_metadata /mnt/SV_storage/VFM/EK100/epic-kitchens-100-annotations/EPIC_100_train.csv \
@@ -172,4 +179,4 @@ python3 llava/action/generate_description.py \
 #     --gen_type tim_mc \
 #     --action_representation official_key \
 #     --n_narrations 5
->>>>>>> origin/haozhedev
+