Commit c121c20

update video code
1 parent 87f4b6c

6 files changed: +403 -7 lines

.gitignore

Lines changed: 1 addition & 1 deletion

```diff
@@ -70,4 +70,4 @@ build/
 playground/*.json
 mlx_configs/
 data_processing/
-demo/
+# demo/
```

docs/LLaVA-NeXT-Video_0716.md

Lines changed: 1 addition & 1 deletion

````diff
@@ -14,7 +14,7 @@ The new model achieves the best open-source performance in several video benchmarks
 - **Model Card**: [LLaVA-NeXT-Video-32B-Qwen on Hugging Face](https://huggingface.co/lmms-lab/LLaVA-NeXT-Video-32B-Qwen)
 - **Inference Script**:
 ```bash
-bash scripts/video/demo/video_demo.sh lmms-lab/LLaVA-NeXT-Video-32B-Qwen qwen_1_5 32 2 average after grid True playground/demo/xU25MMA2N4aVtYay.mp4
+bash scripts/video/demo/video_demo.sh lmms-lab/LLaVA-NeXT-Video-32B-Qwen qwen_1_5 32 2 average grid True playground/demo/xU25MMA2N4aVtYay.mp4
 ```

 ### Evaluation Results
````
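
The dropped `after` argument leaves `grid` in the position that presumably feeds the new `mm_newline_position` setting introduced in `llava/model/llava_arch.py` below. A tiny, hypothetical guard for such a value, using only the set of positions the updated code accepts (this helper is not part of the repo's script):

```python
# Hypothetical validation of a newline-placement argument before it reaches the model config.
VALID_NEWLINE_POSITIONS = {"grid", "frame", "one_token", "no_token"}

def check_newline_position(value: str) -> str:
    if value not in VALID_NEWLINE_POSITIONS:
        raise ValueError(f"Unexpected mm_newline_position: {value}")
    return value

check_newline_position("grid")  # the value used by the updated demo command
```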

llava/model/llava_arch.py

Lines changed: 40 additions & 5 deletions

```diff
@@ -199,6 +199,22 @@ def encode_multimodals(self, videos_or_images, video_idx_in_batch, split_sizes=None):
             all_videos_or_images_features.append(feat)
         return all_videos_or_images_features
 
+    def add_token_per_grid(self, image_feature):
+        resize_h = int(math.sqrt(image_feature.shape[1]))
+        num_frames = image_feature.shape[0]
+        image_feature = image_feature.view(num_frames, 1, resize_h, resize_h, -1)
+        image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
+        image_feature = image_feature.flatten(1, 2).flatten(2, 3)
+        image_feature = torch.cat((image_feature, self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device)), dim=-1)
+        image_feature = image_feature.flatten(1, 2).transpose(0, 1)
+        return image_feature
+
+    def add_token_per_frame(self, image_feature):
+        image_feature = image_feature.permute(2, 0, 1).contiguous()
+        image_feature = torch.cat((image_feature, self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device)), dim=-1)
+        image_feature = image_feature.permute(1, 2, 0).contiguous()
+        return image_feature
+
     def prepare_inputs_labels_for_multimodal(self, input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities=["image"], image_sizes=None):
         vision_tower = self.get_vision_tower()
         # rank_print(modalities)
```
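
A self-contained sketch of the tensor reshaping these two new helpers perform, run on dummy inputs. The frame count, patch count, and hidden size are illustrative placeholders, and `image_newline` stands in for `self.model.image_newline`:

```python
import math
import torch

# Illustrative sizes (hypothetical): 4 frames, 576 patches per frame, hidden size 8.
num_frames, num_patches, hidden = 4, 576, 8
image_feature = torch.randn(num_frames, num_patches, hidden)   # (T, P, C), P = H*W
image_newline = torch.randn(hidden)                             # stand-in for model.image_newline
resize_h = int(math.sqrt(num_patches))                          # 24, mirroring add_token_per_grid

# add_token_per_grid: lay the T frames out as a (T*H) x W grid of patch features,
# append one newline feature per grid row, then flatten to a token sequence.
x = image_feature.view(num_frames, 1, resize_h, resize_h, -1)   # (T, 1, H, W, C)
x = x.permute(4, 0, 2, 1, 3).contiguous()                       # (C, T, H, 1, W)
x = x.flatten(1, 2).flatten(2, 3)                                # (C, T*H, W)
x = torch.cat((x, image_newline[:, None, None].expand(*x.shape[:-1], 1)), dim=-1)  # (C, T*H, W+1)
grid_tokens = x.flatten(1, 2).transpose(0, 1)                    # (T*H*(W+1), C)
assert grid_tokens.shape == (num_frames * resize_h * (resize_h + 1), hidden)

# add_token_per_frame: keep each frame's patches flat and append one newline per frame.
y = image_feature.permute(2, 0, 1).contiguous()                  # (C, T, P)
y = torch.cat((y, image_newline[:, None, None].expand(*y.shape[:-1], 1)), dim=-1)  # (C, T, P+1)
frame_tokens = y.permute(1, 2, 0).contiguous()                   # (T, P+1, C)
assert frame_tokens.shape == (num_frames, num_patches + 1, hidden)
```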

```diff
@@ -253,12 +269,31 @@ def prepare_inputs_labels_for_multimodal(self, input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities=["image"], image_sizes=None):
                 # rank0_print("At least we are reaching here")
                 if image_idx in video_idx_in_batch:  # video operations
                     # rank0_print("Video")
-                    if "unpad" in mm_patch_merge_type:
-                        # image_feature = image_feature.permute(2, 0, 1).contiguous()
-                        # image_feature = torch.cat((image_feature, self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device)), dim=-1)
-                        # image_feature = image_feature.permute(1, 2, 0).contiguous()
+                    if self.config.mm_newline_position == "grid":
+                        # Grid-wise
+                        image_feature = self.add_token_per_grid(image_feature)
+
+                        new_image_features.append(image_feature)
+                    elif self.config.mm_newline_position == "frame":
+                        # Frame-wise
+                        image_feature = self.add_token_per_frame(image_feature)
+
+                        new_image_features.append(image_feature.flatten(0, 1))
+
+                    elif self.config.mm_newline_position == "one_token":
+                        # one-token
                         image_feature = image_feature.flatten(0, 1)
-                        image_feature = torch.cat((image_feature, self.model.image_newline[None].to(image_feature.device)), dim=0)
+                        if 'unpad' in mm_patch_merge_type:
+                            image_feature = torch.cat((
+                                image_feature,
+                                self.model.image_newline[None].to(image_feature.device)
+                            ), dim=0)
+                        new_image_features.append(image_feature)
+                    elif self.config.mm_newline_position == "no_token":
+                        new_image_features.append(image_feature.flatten(0, 1))
+                    else:
+                        raise ValueError(f"Unexpected mm_newline_position: {self.config.mm_newline_position}")
+
 
                 elif image_feature.shape[0] > 1:  # multi patches and multi images operations
                     # rank0_print("Single-images")
```

llava/train/train.py

Lines changed: 4 additions & 0 deletions

```diff
@@ -108,6 +108,9 @@ class ModelArguments:
     pos_skipping_range: Optional[int] = field(default=4096)
 
 
+    mm_newline_position: Optional[str] = field(default="one_token")
+
+
 @dataclass
 class DataArguments:
     data_path: str = field(default=None, metadata={"help": "Path to the training data, in llava's instruction.json format. Supporting multiple json files via /path/to/{a,b,c}.json"})
@@ -1576,6 +1579,7 @@ def make_inputs_require_grad(module, input, output):
         model.config.image_split_resolution = data_args.image_split_resolution
         model.config.tokenizer_padding_side = tokenizer.padding_side
         model.config.tokenizer_model_max_length = tokenizer.model_max_length
+        model.config.mm_newline_position = model_args.mm_newline_position
 
         ### Deciding train which part of the model
         if model_args.mm_tunable_parts is None:  # traditional way of deciding which part to train
```
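
Assuming `ModelArguments` is parsed with `transformers.HfArgumentParser` (the usual pattern for such dataclasses in this codebase), the new field becomes a `--mm_newline_position` command-line option whose value is later copied onto `model.config`, as the second hunk shows. A minimal sketch under that assumption; the argv below is hypothetical and only the new field is reproduced:

```python
from dataclasses import dataclass, field
from typing import Optional

from transformers import HfArgumentParser

@dataclass
class ModelArguments:
    # Only the new field is reproduced here; train.py defines many more.
    mm_newline_position: Optional[str] = field(default="one_token")

# e.g. a training launch passing "--mm_newline_position grid" (hypothetical argv):
(model_args,) = HfArgumentParser(ModelArguments).parse_args_into_dataclasses(
    args=["--mm_newline_position", "grid"]
)
print(model_args.mm_newline_position)  # "grid", later written to model.config.mm_newline_position
```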
