Commit 4ca1a52

Fixes Metadata Reading from Released PLM Checkpoints (#665)
* Fixes metadata reading from official PLM checkpoints.
* Adds plm.sh under examples/models
1 parent 2735d6b commit 4ca1a52

2 files changed: +50 -6 lines changed

examples/models/plm.sh

Lines changed: 44 additions & 0 deletions
# # STEP 1: Install lmms-eval
# pip install lmms-eval

# # STEP 2: Install perception_models (Details at https://github.com/facebookresearch/perception_models)
# git clone https://github.com/facebookresearch/perception_models.git
# cd perception_models

# conda create --name perception_models python=3.12
# conda activate perception_models

# # Install PyTorch
# pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 xformers --index-url https://download.pytorch.org/whl/cu124

# # We use torchcodec for decoding videos into PyTorch tensors
# conda install ffmpeg -c conda-forge
# pip install torchcodec==0.1 --index-url=https://download.pytorch.org/whl/cu124

# pip install -e .

# Use facebook/Perception-LM-1B for the 1B-parameter model and facebook/Perception-LM-8B for the 8B-parameter model.
CHECKPOINTS_PATH=facebook/Perception-LM-3B

# Define the tasks you want to evaluate PLM on. All tasks present in lmms-eval are supported; the following tasks have been tested with our models.
ALL_TASKS=(
    "docvqa" "chartqa" "textvqa" "infovqa" "ai2d_no_mask" "ok_vqa" "vizwiz_vqa" "mme"
    "realworldqa" "pope" "mmmu" "ocrbench" "coco_karpathy_val" "nocaps" "vqav2_val"
    "mvbench" "videomme" "vatex_test" "egoschema" "egoschema_subset" "mlvu_dev"
    "tempcompass_multi_choice" "perceptiontest_val_mc" "perceptiontest_test_mc"
)

# We select one image task and one video task as an example.
SELECTED_TASK="textvqa,videomme"

# After specifying the task(s) to evaluate, run the following command to start the evaluation.
accelerate launch --num_processes=8 \
    -m lmms_eval \
    --model plm \
    --model_args pretrained=$CHECKPOINTS_PATH,max_tokens=11264 \
    --tasks $SELECTED_TASK \
    --batch_size 1 \
    --log_samples \
    --log_samples_suffix plm \
    --output_path plm_reproduce
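
To evaluate every benchmark listed in ALL_TASKS instead of the two tasks selected above, one option is to join the array into the comma-separated list that --tasks expects. The following is a minimal sketch building on the script above, not part of this commit; the ALL_TASKS_JOINED variable name is illustrative, and a single run over all tasks can take a long time.

# Sketch (not part of this commit): join ALL_TASKS with commas and pass the
# result to --tasks, reusing the same launch arguments as the example above.
ALL_TASKS_JOINED=$(IFS=,; echo "${ALL_TASKS[*]}")
accelerate launch --num_processes=8 \
    -m lmms_eval \
    --model plm \
    --model_args pretrained=$CHECKPOINTS_PATH,max_tokens=11264 \
    --tasks $ALL_TASKS_JOINED \
    --batch_size 1 \
    --log_samples \
    --log_samples_suffix plm \
    --output_path plm_reproduce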

lmms_eval/models/plm.py

Lines changed: 6 additions & 6 deletions
@@ -65,11 +65,11 @@ def __init__(

        # Create preprocessors (transforms)
        processor = {}
-       vision_input_type = config.get("model").get("vision_input_type", "thumb+tile")
-       max_num_tiles = config.get("model").get("max_num_tiles", 36)
+       vision_input_type = config.get("data").get("vision_input_type", "thumb+tile")
+       max_num_tiles = config.get("data").get("max_num_tiles", 36)
        processor["image"] = get_image_transform(vision_input_type=vision_input_type, image_res=model.vision_model.image_size, max_num_tiles=max_num_tiles)
        processor["video"] = get_video_transform(image_res=model.vision_model.image_size)
-       self._video_max_frames = config.get("model").get("video_max_frames", 32)
+       self._max_video_frames = config.get("data").get("max_video_frames", 32)

        # Create PLM generator
        eval_logger.info(f"Creating packed generator with gen_cfg: {gen_cfg}")
@@ -130,8 +130,8 @@ def batch_size(self):
        return self.batch_size_per_gpu

    @property
-   def video_max_frames(self):
-       return self._video_max_frames
+   def max_video_frames(self):
+       return self._max_video_frames

    @property
    def device(self):
@@ -186,7 +186,7 @@ def _collate(x):
            if len(visuals) > 0:
                visual = visuals[i] if i < len(visuals) else None
                if isinstance(visual, str) and visual.endswith((".mp4", ".avi", ".mov")):  # Video file
-                   video_info = (visual, self.video_max_frames, None, None, None)
+                   video_info = (visual, self.max_video_frames, None, None, None)
                    visual, _ = self.processor["video"](video_info)
                    message = (context, visual)
                elif isinstance(visual, Image.Image):  # Single image
