Skip to content

Commit 7968191

Browse files
author
Ye Shaokai
committed
Fixed zero-shot prompting
1 parent db2ad43 commit 7968191

File tree

2 files changed

+6
-4
lines changed

2 files changed

+6
-4
lines changed

action/ek_eval.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -465,9 +465,7 @@ def evaluate_on_EK100(eval_args,
465465
global_total_samples.add_(1)
466466

467467
logger.info(f'Process {dist.get_rank()} - local_total_samples: {local_total_samples:.4f}')
468-
469468
logger.info(f'Process {dist.get_rank()} - loca_llava_correct: {llava_correct:.4f}')
470-
471469
logger.info(f'Process {dist.get_rank()} - local_running_corrects: {local_running_corrects:.4f}')
472470

473471

action/llava_ov_inference.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,8 +83,9 @@ def llava_video_process(
8383
video_duration = time_meta['duration'].item()
8484
n_frames = time_meta['n_frames'].item()
8585
frame_time = time_meta['frame_time']
86-
frame_time = [e[0] for e in frame_time]
87-
time_instruciton = f"The video lasts for {video_duration:.2f} seconds, and {n_frames} frames are uniformly sampled from it. These frames are located at {frame_time}.Please answer the following questions related to this video."
86+
print ('frame time', frame_time)
87+
frame_time = frame_time[0]
88+
time_instruciton = f"You are seeing a video taken from egocentric view. The video lasts for {video_duration:.2f} seconds, and {n_frames} frames are uniformly sampled from it. These frames are located at {frame_time}. What is the person doing? Format your answer letter. verb noun such as A. move knife."
8889

8990
frames = image_processor.preprocess(video_frames, return_tensors="pt")["pixel_values"].cuda().to(torch.bfloat16)
9091

@@ -97,12 +98,15 @@ def llava_video_process(
9798

9899
question = DEFAULT_IMAGE_TOKEN + f"{time_instruciton}\n:{options}"
99100

101+
print ('what is the question')
102+
print (question)
100103

101104
conv = copy.deepcopy(conv_templates[conv_template])
102105
conv.append_message(conv.roles[0], question)
103106
conv.append_message(conv.roles[1], None)
104107
prompt_question = conv.get_prompt()
105108

109+
106110
input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
107111
image_sizes = [frame.size for frame in video_frames]
108112

0 commit comments

Comments (0)