Commit 4ca1a52

Fixes Metadata Reading from Released PLM Checkpoints (#665)
* Fixes metadata reading from official PLM checkpoints.
* Adds plm.sh under examples/models
1 parent 2735d6b commit 4ca1a52

2 files changed: +50 -6 lines changed

examples/models/plm.sh

Lines changed: 44 additions & 0 deletions
# # STEP 1: Install lmms-eval
# pip install lmms-eval

# # STEP 2: Install perception_models (Details at https://github.com/facebookresearch/perception_models)
# git clone https://github.com/facebookresearch/perception_models.git
# cd perception_models

# conda create --name perception_models python=3.12
# conda activate perception_models

# # Install PyTorch
# pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 xformers --index-url https://download.pytorch.org/whl/cu124

# # We use torchcodec for decoding videos into PyTorch tensors
# conda install ffmpeg -c conda-forge
# pip install torchcodec==0.1 --index-url=https://download.pytorch.org/whl/cu124

# pip install -e .

# Use facebook/Perception-LM-1B for the 1B-parameter model and facebook/Perception-LM-8B for the 8B-parameter model.
CHECKPOINTS_PATH=facebook/Perception-LM-3B

# Define the tasks you want to evaluate PLM on. All tasks present in lmms-eval are supported; the following tasks have been tested with our models.
ALL_TASKS=(
    "docvqa" "chartqa" "textvqa" "infovqa" "ai2d_no_mask" "ok_vqa" "vizwiz_vqa" "mme"
    "realworldqa" "pope" "mmmu" "ocrbench" "coco_karpathy_val" "nocaps" "vqav2_val"
    "mvbench" "videomme" "vatex_test" "egoschema" "egoschema_subset" "mlvu_dev"
    "tempcompass_multi_choice" "perceptiontest_val_mc" "perceptiontest_test_mc"
)

# We select one image task and one video task as an example.
SELECTED_TASK="textvqa,videomme"

# After specifying the task(s) to evaluate, run the following command to start the evaluation.
accelerate launch --num_processes=8 \
    -m lmms_eval \
    --model plm \
    --model_args pretrained=$CHECKPOINTS_PATH,max_tokens=11264 \
    --tasks $SELECTED_TASK \
    --batch_size 1 \
    --log_samples \
    --log_samples_suffix plm \
    --output_path plm_reproduce
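
To evaluate every benchmark listed in ALL_TASKS instead of the two tasks selected above, one option is to join the array into the comma-separated list that --tasks expects. The following is a minimal sketch building on the script above, not part of this commit; the ALL_TASKS_JOINED variable name is illustrative, and a single run over all tasks can take a long time.

# Sketch (not part of this commit): join ALL_TASKS with commas and pass the
# result to --tasks, reusing the same launch arguments as the example above.
ALL_TASKS_JOINED=$(IFS=,; echo "${ALL_TASKS[*]}")
accelerate launch --num_processes=8 \
    -m lmms_eval \
    --model plm \
    --model_args pretrained=$CHECKPOINTS_PATH,max_tokens=11264 \
    --tasks $ALL_TASKS_JOINED \
    --batch_size 1 \
    --log_samples \
    --log_samples_suffix plm \
    --output_path plm_reproduce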

lmms_eval/models/plm.py

Lines changed: 6 additions & 6 deletions
@@ -65,11 +65,11 @@ def __init__(

        # Create preprocessors (transforms)
        processor = {}
-       vision_input_type = config.get("model").get("vision_input_type", "thumb+tile")
-       max_num_tiles = config.get("model").get("max_num_tiles", 36)
+       vision_input_type = config.get("data").get("vision_input_type", "thumb+tile")
+       max_num_tiles = config.get("data").get("max_num_tiles", 36)
        processor["image"] = get_image_transform(vision_input_type=vision_input_type, image_res=model.vision_model.image_size, max_num_tiles=max_num_tiles)
        processor["video"] = get_video_transform(image_res=model.vision_model.image_size)
-       self._video_max_frames = config.get("model").get("video_max_frames", 32)
+       self._max_video_frames = config.get("data").get("max_video_frames", 32)

        # Create PLM generator
        eval_logger.info(f"Creating packed generator with gen_cfg: {gen_cfg}")
@@ -130,8 +130,8 @@ def batch_size(self):
        return self.batch_size_per_gpu

    @property
-   def video_max_frames(self):
-       return self._video_max_frames
+   def max_video_frames(self):
+       return self._max_video_frames

    @property
    def device(self):
@@ -186,7 +186,7 @@ def _collate(x):
            if len(visuals) > 0:
                visual = visuals[i] if i < len(visuals) else None
                if isinstance(visual, str) and visual.endswith((".mp4", ".avi", ".mov")):  # Video file
-                   video_info = (visual, self.video_max_frames, None, None, None)
+                   video_info = (visual, self.max_video_frames, None, None, None)
                    visual, _ = self.processor["video"](video_info)
                    message = (context, visual)
                elif isinstance(visual, Image.Image):  # Single image
