@@ -93,6 +93,13 @@ def initialize_vision_modules(self, model_args, fsdp=None):
9393 self .config .mm_vision_select_feature = mm_vision_select_feature
9494 self .config .mm_patch_merge_type = mm_patch_merge_type
9595
96+ if not hasattr (self .config , 'add_faster_video' ):
97+ if model_args .add_faster_video :
98+ embed_std = 1 / torch .sqrt (torch .tensor (self .config .hidden_size , dtype = self .dtype ))
99+ self .faster_token = nn .Parameter (
100+ torch .randn (self .config .hidden_size , dtype = self .dtype ) * embed_std
101+ )
102+
96103 if getattr (self , "mm_projector" , None ) is None :
97104 self .mm_projector = build_vision_projector (self .config , vision_cfg = vision_tower .config )
98105
@@ -160,19 +167,19 @@ def get_model(self):
def get_vision_tower(self):
    """Return the vision tower held by the wrapped language model."""
    base_model = self.get_model()
    return base_model.get_vision_tower()
162169
163- def get_2dPool (self , image_feature ):
170+ def get_2dPool (self , image_feature , stride = 2 ):
164171 height = width = self .get_vision_tower ().num_patches_per_side
165172 num_frames , num_tokens , num_dim = image_feature .shape
166173 image_feature = image_feature .view (num_frames , height , width , - 1 )
167174 image_feature = image_feature .permute (0 , 3 , 1 , 2 ).contiguous ()
168175 # image_feature = nn.functional.max_pool2d(image_feature, self.config.mm_spatial_pool_stride)
169176 if self .config .mm_spatial_pool_mode == "average" :
170- image_feature = nn .functional .avg_pool2d (image_feature , self . config . mm_spatial_pool_stride )
177+ image_feature = nn .functional .avg_pool2d (image_feature , stride )
171178 elif self .config .mm_spatial_pool_mode == "max" :
172- image_feature = nn .functional .max_pool2d (image_feature , self . config . mm_spatial_pool_stride )
179+ image_feature = nn .functional .max_pool2d (image_feature , stride )
173180 elif self .config .mm_spatial_pool_mode == "bilinear" :
174181 height , weight = image_feature .shape [2 :]
175- scaled_shape = [math .ceil (height / 2 ), math .ceil (weight / 2 )]
182+ scaled_shape = [math .ceil (height / stride ), math .ceil (weight / stride )]
176183 image_feature = nn .functional .interpolate (image_feature , size = scaled_shape , mode = 'bilinear' )
177184
178185 else :
@@ -191,21 +198,46 @@ def encode_multimodals(self, videos_or_images, video_idx_in_batch, split_sizes=N
191198 videos_or_images_features = self .get_model ().get_vision_tower ()(videos_or_images )
192199 per_videos_or_images_features = torch .split (videos_or_images_features , split_sizes , dim = 0 ) # tuple, (dim_1, 576, 4096)
193200 all_videos_or_images_features = []
201+ all_faster_video_features = []
202+ cur_mm_spatial_pool_stride = self .config .mm_spatial_pool_stride
194203
195204 for idx , feat in enumerate (per_videos_or_images_features ):
205+
196206 feat = self .get_model ().mm_projector (feat )
197- if idx in video_idx_in_batch :
198- feat = self .get_2dPool (feat )
199- all_videos_or_images_features .append (feat )
200- return all_videos_or_images_features
207+ faster_video_feature = 0
208+            slower_img_feat = None
209+            if idx in video_idx_in_batch and cur_mm_spatial_pool_stride > 1:
210+                slower_img_feat = self.get_2dPool(feat, cur_mm_spatial_pool_stride)
211+                if self.config.add_faster_video:
212+                    # NOTE(review): pool with a local 2x stride; reassigning cur_mm_spatial_pool_stride here compounded the stride across batch items
213+                    faster_video_feature = self.get_2dPool(feat, cur_mm_spatial_pool_stride * 2)
214+            if slower_img_feat is not None:
215+ all_videos_or_images_features .append (slower_img_feat )
216+ else :
217+ all_videos_or_images_features .append (feat )
218+ all_faster_video_features .append (faster_video_feature )
219+ return all_videos_or_images_features ,all_faster_video_features
201220
def add_token_per_grid(self, image_feature):
    """Append the learned ``image_newline`` token after each grid row of frame features.

    Args:
        image_feature: tensor of shape (num_frames, num_patches, dim); num_patches is
            assumed to be a perfect square (resize_h ** 2) -- TODO confirm with callers.

    Returns:
        If ``config.add_faster_video`` is set: (num_frames, resize_h * (resize_h + 1), dim),
        keeping the frame axis so the caller can interleave slow/fast frame features.
        Otherwise: (num_frames * resize_h * (resize_h + 1), dim), all frames flattened
        into one token sequence.
    """
    resize_h = int(math.sqrt(image_feature.shape[1]))
    num_frames = image_feature.shape[0]
    feature_dim = image_feature.shape[-1]

    # (F, P, D) -> (D, F*resize_h, resize_h): move the feature dim to the front so the
    # newline vector can be concatenated along the last (grid-row) axis.
    image_feature = image_feature.view(num_frames, 1, resize_h, resize_h, -1)
    image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
    image_feature = image_feature.flatten(1, 2).flatten(2, 3)
    # Append one image_newline vector at the end of every grid row:
    # (D, F*resize_h, resize_h) -> (D, F*resize_h, resize_h + 1)
    image_feature = torch.cat((image_feature, self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device)), dim=-1)
    if self.config.add_faster_video:
        # Keep a per-frame layout: (D, F, resize_h, resize_h+1) -> (F, resize_h*(resize_h+1), D)
        image_feature = image_feature.view(feature_dim, num_frames, resize_h, -1)
        image_feature = image_feature.permute(1, 2, 3, 0).contiguous()
        return image_feature.flatten(1, 2)
    # Default path: flatten all frames into one (tokens, dim) sequence.
    return image_feature.flatten(1, 2).transpose(0, 1)
211243
@@ -246,6 +278,7 @@ def prepare_inputs_labels_for_multimodal(self, input_ids, position_ids, attentio
246278 concat_images = torch .cat ([image for image in images_list ], dim = 0 )
247279 split_sizes = [image .shape [0 ] for image in images_list ]
248280 encoded_image_features = self .encode_images (concat_images )
281+            # NOTE(review): disabled call -- but all_faster_video_features is read later when add_faster_video is set, and is undefined on this path: image_features,all_faster_video_features = self.encode_multimodals(concat_images, video_idx_in_batch, split_sizes)
249282
250283 # This is a list, each element is [num_images, patch * patch, dim]
251284 # rank_print(f"Concat images : {concat_images.shape}")
@@ -278,6 +311,20 @@ def prepare_inputs_labels_for_multimodal(self, input_ids, position_ids, attentio
278311 if self .config .mm_newline_position == "grid" :
279312 # Grid-wise
280313 image_feature = self .add_token_per_grid (image_feature )
314+ if self .config .add_faster_video :
315+ faster_video_feature = self .add_token_per_grid (all_faster_video_features [image_idx ])
316+ # Add a token for each frame
317+                    concat_slow_faster_token = []
318+                    # interleave slow frames with faster-path frames, appending one learned faster_token per frame
319+                    for frame_idx in range(image_feature.shape[0]):
320+                        if frame_idx % self.config.faster_token_stride == 0:
321+                            concat_slow_faster_token.append(torch.cat((image_feature[frame_idx], self.model.faster_token[None].to(image_feature.device)), dim=0))
322+                        else:
323+                            concat_slow_faster_token.append(torch.cat((faster_video_feature[frame_idx], self.model.faster_token[None].to(image_feature.device)), dim=0))
324+                    # frames off the stride grid fall back to the lower-resolution faster features
325+                    image_feature = torch.cat(concat_slow_faster_token)
326+
327+ # print("!!!!!!!!!!!!")
281328
282329 new_image_features .append (image_feature )
283330 elif self .config .mm_newline_position == "frame" :
@@ -357,12 +404,13 @@ def prepare_inputs_labels_for_multimodal(self, input_ids, position_ids, attentio
357404 pass
358405 else :
359406 image_feature = torch .cat ((base_image_feature , image_feature ), dim = 0 )
407+ new_image_features .append (image_feature )
360408 else : # single image operations
361409 image_feature = image_feature [0 ]
362410 if "unpad" in mm_patch_merge_type :
363411 image_feature = torch .cat ((image_feature , self .model .image_newline [None ]), dim = 0 )
364412
365- new_image_features .append (image_feature )
413+ new_image_features .append (image_feature )
366414 image_features = new_image_features
367415 else :
368416 raise ValueError (f"Unexpected mm_patch_merge_type: { self .config .mm_patch_merge_type } " )
0 commit comments