Commit afe35d4

Merge branch 'EK100' of github.com:HaozheQi/LLaVA-NeXT into shaokai_dev
2 parents 52f6d8e + 083bfad

5 files changed: 169 additions & 12 deletions

.vscode/launch.json

Lines changed: 3 additions & 2 deletions
@@ -7,7 +7,7 @@
             "request": "launch",
             "module": "torch.distributed.run",
             "env": {
-                "CUDA_VISIBLE_DEVICES": "1,2",
+                "CUDA_VISIBLE_DEVICES": "1,2,3",
                 "OMP_NUM_THREADS": "8",
                 "NCCL_IB_DISABLE": "0",
                 "NCCL_IB_GID_INDEX": "3",
@@ -18,7 +18,7 @@
                 "WANDB_API_KEY": "65aeda82a75f1eed29c8e9250b175fcc73dca0d7",
             },
             "args": [
-                "--nproc_per_node=2",
+                "--nproc_per_node=3",
                 "--nnodes=1",
                 "--node_rank=0",
                 "--master_addr=127.0.0.1",
@@ -31,6 +31,7 @@
                 // "--image_folder", "/mediaPFM/data/haozhe/onevision/llava_data",
                 "--image_folder", "/mediaPFM/data/haozhe/onevision/llava_data/geo3k/",
                 "--video_folder", "/mediaPFM/data/haozhe/onevision/llava_video",
+                // "--video_folder", "/home/haozhe/kitchen/AVION/datasets",
                 "--mm_tunable_parts", "mm_vision_tower,mm_mlp_adapter,mm_language_model",
                 "--mm_vision_tower_lr", "2e-6",
                 "--vision_tower", "google/siglip-so400m-patch14-384",

llava/train/train.py

Lines changed: 11 additions & 3 deletions
@@ -45,7 +45,7 @@
 from llava import conversation as conversation_lib
 from llava.model import *
 from llava.mm_utils import process_highres_image, process_anyres_image, process_highres_image_crop_split, tokenizer_image_token
-from llava.utils import rank0_print, process_video_with_pyav, process_video_with_decord
+from llava.utils import rank0_print, process_video_with_pyav, process_video_with_decord, process_EK100_video_with_decord
 
 torch.multiprocessing.set_sharing_strategy("file_system")
 
@@ -1152,9 +1152,13 @@ def _get_item(self, i) -> Dict[str, torch.Tensor]:
             sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args)
 
         elif "video" in sources[0]:
-            video_file = self.list_data_dict[i]["video"]
+            video_info = self.list_data_dict[i]["video"]
             video_folder = os.path.join(self.data_args.video_folder, sources[0]['dataset_name'])
-            video_file = os.path.join(video_folder, video_file)
+            if 'EK100' in video_folder:
+                video_file = os.path.join(video_folder, video_info.split("-")[0], video_info.split("-")[1]+".MP4")
+            else:
+                video_file = os.path.join(video_folder, video_info)
+
             suffix = video_file.split(".")[-1]
             if not os.path.exists(video_file):
                 print("File {} not exist!".format(video_file))
@@ -1191,6 +1195,10 @@ def _get_item(self, i) -> Dict[str, torch.Tensor]:
                                     video.append(frame)
                             except IOError:
                                 print(f"Failed to read frame at path: {frame_path}")
+                elif 'EK100' in video_file:
+                    start_second = float(self.list_data_dict[i]['start_timestamp'])
+                    end_second = float(self.list_data_dict[i]['end_timestamp'])
+                    video, video_time, frame_time, num_frames_to_sample = process_EK100_video_with_decord(video_file, self.data_args, start_second, end_second, 15)
                 else:
                     video, video_time, frame_time, num_frames_to_sample = process_video_with_decord(video_file, self.data_args)
 

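For context on the new EK100 branch in _get_item: an EK100 entry apparently stores the participant and clip id in a single dash-separated string, which is split into a nested path. The snippet below is a minimal sketch of that resolution; the id "P01-P01_103" and the folder are made up for illustration, and the real root comes from --video_folder joined with the dataset name.

import os

# Hypothetical values for illustration only; the real root is
# data_args.video_folder joined with the dataset name (e.g. ".../EK100").
video_folder = "/path/to/llava_video/EK100"
video_info = "P01-P01_103"  # "<participant>-<clip_id>", as stored in the "video" field

participant, clip_id = video_info.split("-")[0], video_info.split("-")[1]
# The EK100 branch points at a directory of chunked clips ("<clip_id>.MP4/<start_sec>.MP4"),
# which process_EK100_video_with_decord then indexes by chunk start time.
video_file = os.path.join(video_folder, participant, clip_id + ".MP4")
print(video_file)  # /path/to/llava_video/EK100/P01/P01_103.MP4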
llava/utils.py

Lines changed: 145 additions & 0 deletions
@@ -22,6 +22,151 @@
 except ImportError:
     print("Please install pyav to use video processing functions.")
 
+def get_frame_ids(start_frame, end_frame, num_segments=32, jitter=True):
+    frame_ids = np.convolve(np.linspace(start_frame, end_frame, num_segments + 1), [0.5, 0.5], mode='valid')
+    if jitter:
+        seg_size = float(end_frame - start_frame - 1) / num_segments
+        shift = (np.random.rand(num_segments) - 0.5) * seg_size
+        frame_ids += shift
+    return frame_ids.astype(int).tolist()
+
+# def get_video_reader(videoname, num_threads, fast_rrc, rrc_params, fast_rcc, rcc_params):
+#     video_reader = None
+#     if fast_rrc:
+#         video_reader = VideoReader(
+#             videoname,
+#             num_threads=num_threads,
+#             width=rrc_params[0], height=rrc_params[0],
+#             use_rrc=True, scale_min=rrc_params[1][0], scale_max=rrc_params[1][1],
+#         )
+#     elif fast_rcc:
+#         video_reader = VideoReader(
+#             videoname,
+#             num_threads=num_threads,
+#             width=rcc_params[0], height=rcc_params[0],
+#             use_rcc=True,
+#         )
+#     else:
+#         video_reader = VideoReader(videoname, num_threads=num_threads)
+#     return video_reader
+
+# def video_loader(root, vid, ext, second, end_second,
+#                  chunk_len=300, fps=30, clip_length=32,
+#                  threads=1,
+#                  fast_rrc=False, rrc_params=(224, (0.5, 1.0)),
+#                  fast_rcc=False, rcc_params=(224, ),
+#                  jitter=False):
+#     assert fps > 0, 'fps should be greater than 0'
+
+#     if chunk_len == -1:
+#         vr = get_video_reader(
+#             osp.join(root, '{}.{}'.format(vid, ext)),
+#             num_threads=threads,
+#             fast_rrc=fast_rrc, rrc_params=rrc_params,
+#             fast_rcc=fast_rcc, rcc_params=rcc_params,
+#         )
+#         end_second = min(end_second, len(vr) / fps)
+
+#         # calculate frame_ids
+#         frame_offset = int(np.round(second * fps))
+#         total_duration = max(int((end_second - second) * fps), clip_length)
+#         frame_ids = get_frame_ids(frame_offset, min(frame_offset + total_duration, len(vr)), num_segments=clip_length, jitter=jitter)
+
+#         # load frames
+#         assert max(frame_ids) < len(vr)
+#         try:
+#             frames = vr.get_batch(frame_ids).asnumpy()
+#         except decord.DECORDError as error:
+#             print(error)
+#             frames = vr.get_batch([0] * len(frame_ids)).asnumpy()
+
+#         return torch.from_numpy(frames.astype(np.float32))
+
+#     else:
+#         chunk_start = int(second) // chunk_len * chunk_len
+#         chunk_end = int(end_second) // chunk_len * chunk_len
+#         while True:
+#             video_filename = osp.join(root, '{}.{}'.format(vid, ext), '{}.{}'.format(chunk_end, ext))
+#             if not osp.exists(video_filename):
+#                 # print("{} does not exists!".format(video_filename))
+#                 chunk_end -= chunk_len
+#             else:
+#                 vr = decord.VideoReader(video_filename)
+#                 end_second = min(end_second, (len(vr) - 1) / fps + chunk_end)
+#                 assert chunk_start <= chunk_end
+#                 break
+#         # calculate frame_ids
+#         frame_ids = get_frame_ids(
+#             int(np.round(second * fps)),
+#             int(np.round(end_second * fps)),
+#             num_segments=clip_length, jitter=jitter
+#         )
+#         all_frames = []
+#         # allocate absolute frame-ids into the relative ones
+#         for chunk in range(chunk_start, chunk_end + chunk_len, chunk_len):
+#             rel_frame_ids = list(filter(lambda x: int(chunk * fps) <= x < int((chunk + chunk_len) * fps), frame_ids))
+#             rel_frame_ids = [int(frame_id - chunk * fps) for frame_id in rel_frame_ids]
+#             vr = get_video_reader(
+#                 osp.join(root, '{}.{}'.format(vid, ext), '{}.{}'.format(chunk, ext)),
+#                 num_threads=threads,
+#                 fast_rrc=fast_rrc, rrc_params=rrc_params,
+#                 fast_rcc=fast_rcc, rcc_params=rcc_params,
+#             )
+#             try:
+#                 frames = vr.get_batch(rel_frame_ids).asnumpy()
+#             except decord.DECORDError as error:
+#                 print(error)
+#                 frames = vr.get_batch([0] * len(rel_frame_ids)).asnumpy()
+#             except IndexError:
+#                 print(root, vid, ext, second, end_second)
+#             all_frames.append(frames)
+#             if sum(map(lambda x: x.shape[0], all_frames)) == clip_length:
+#                 break
+#         res = torch.from_numpy(np.concatenate(all_frames, axis=0).astype(np.float32))
+#         assert res.shape[0] == clip_length, "{}, {}, {}, {}, {}, {}, {}".format(root, vid, second, end_second, res.shape[0], rel_frame_ids, frame_ids)
+#         return res
+
+def process_EK100_video_with_decord(video_file, data_args, start_second, end_second, chunk_len):
+    fps = 30
+    start_frame = int(start_second * fps)
+    end_frame = int(end_second * fps)
+    chunk_start = int(start_second) // chunk_len * chunk_len
+    chunk_end = int(end_second) // chunk_len * chunk_len
+    video_time = end_second - start_second
+    while True:
+        video_filename = os.path.join(video_file, '{}.MP4'.format(chunk_end))
+        if not os.path.exists(video_filename):
+            # print("{} does not exists!".format(video_filename))
+            chunk_end -= chunk_len
+        else:
+            vr = VideoReader(video_filename, ctx=cpu(0), num_threads=1)
+            end_second = min(end_second, (len(vr) - 1) / fps + chunk_end)
+            assert chunk_start <= chunk_end
+            break
+
+    # calculate frame_ids
+    frame_ids = get_frame_ids(start_frame, end_frame, num_segments=data_args.frames_upbound, jitter=False)
+    frame_time = [i/fps for i in frame_ids]
+
+    all_frames = []
+    # allocate absolute frame-ids into the relative ones
+    for chunk in range(chunk_start, chunk_end + chunk_len, chunk_len):
+        rel_frame_ids = list(filter(lambda x: int(chunk * fps) <= x < int((chunk + chunk_len) * fps), frame_ids))
+        rel_frame_ids = [int(frame_id - chunk * fps) for frame_id in rel_frame_ids]
+        vr = VideoReader(os.path.join(video_file, '{}.MP4'.format(chunk)),ctx=cpu(0), num_threads=1)
+        frames = vr.get_batch(rel_frame_ids).asnumpy()
+        all_frames.append(frames)
+        vr.seek(0)
+        if sum(map(lambda x: x.shape[0], all_frames)) == data_args.frames_upbound:
+            break
+
+    video = np.concatenate(all_frames, axis=0).astype(np.float32)
+
+    frame_time = ",".join([f"{i:.2f}s" for i in frame_time])
+    num_frames_to_sample = len(frame_ids)
+
+    return video, video_time, frame_time, num_frames_to_sample
+
 def process_video_with_decord(video_file, data_args):
     vr = VideoReader(video_file, ctx=cpu(0), num_threads=1)
     total_frame_num = len(vr)

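As a quick sanity check of the new get_frame_ids helper (reproduced below so the snippet runs on its own): with jitter=False it returns the integer midpoints of num_segments equal segments between the two frame bounds. The example values are arbitrary.

import numpy as np

def get_frame_ids(start_frame, end_frame, num_segments=32, jitter=True):
    # Midpoints of num_segments equal segments between start_frame and end_frame.
    frame_ids = np.convolve(np.linspace(start_frame, end_frame, num_segments + 1), [0.5, 0.5], mode='valid')
    if jitter:
        # Shift each midpoint by up to half a segment in either direction.
        seg_size = float(end_frame - start_frame - 1) / num_segments
        shift = (np.random.rand(num_segments) - 0.5) * seg_size
        frame_ids += shift
    return frame_ids.astype(int).tolist()

# Arbitrary example: a 10-second clip at 30 fps (frames 0-300) sampled into 4 segments.
print(get_frame_ids(0, 300, num_segments=4, jitter=False))  # [37, 112, 187, 262]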
run.sh

Lines changed: 4 additions & 4 deletions
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 # Export environment variables
-export CUDA_VISIBLE_DEVICES="0,1"
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
 export OMP_NUM_THREADS="8"
 export NCCL_IB_DISABLE="0"
 export NCCL_IB_GID_INDEX="3"
@@ -12,14 +12,14 @@ export ACCELERATE_CPU_AFFINITY="1"
 export WANDB_API_KEY="65aeda82a75f1eed29c8e9250b175fcc73dca0d7"
 
 # Run the command using torchrun
-torchrun --nproc_per_node=2 \
+torchrun --nproc_per_node=4 \
     --nnodes=1 \
     --node_rank=0 \
     --master_addr=127.0.0.1 \
     --master_port=29500 \
     llava/train/train_mem.py \
     --deepspeed scripts/zero3.json \
-    --model_name_or_path lmms-lab/llava-onevision-qwen2-7b-ov \
+    --model_name_or_path lmms-lab/llava-onevision-qwen2-0.5b-ov \
     --version qwen_1_5 \
     --data_path scripts/train/onevision.yaml \
     --image_folder /media/data/haozhe/VFM/onevision/llava_data/geo3k/ \
@@ -60,4 +60,4 @@ torchrun --nproc_per_node=2 \
     --torch_compile True \
    --torch_compile_backend inductor \
    --dataloader_drop_last True \
-    --frames_upbound 32 > test7b.out 2>&1
+    --frames_upbound 32 > train_kitchen0.5b.out 2>&1

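Both launch.json and run.sh bump the visible GPUs together with --nproc_per_node. A small illustrative check (not part of the commit) of keeping the two values in sync:

import os

# Illustrative only: torchrun's --nproc_per_node should equal the number of
# devices listed in CUDA_VISIBLE_DEVICES ("0,1,2,3" -> 4 in the updated run.sh).
visible = os.environ.get("CUDA_VISIBLE_DEVICES", "0,1,2,3")
nproc_per_node = len([d for d in visible.split(",") if d != ""])
print(f"torchrun --nproc_per_node={nproc_per_node} --nnodes=1 llava/train/train_mem.py ...")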
scripts/train/onevision.yaml

Lines changed: 6 additions & 3 deletions
@@ -68,8 +68,8 @@ datasets:
   # - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/mathqa_29837.json
   #   sampling_strategy: "all"
   # - json_path: /media/data/haozhe/VFM/onevision/llava_instruct/geo3k.json
-  - json_path: /mediaPFM/data/haozhe/onevision/llava_instruct/geo3k.json
-    sampling_strategy: "all"
+  # - json_path: /mediaPFM/data/haozhe/onevision/llava_instruct/geo3k.json
+  #   sampling_strategy: "all"
   # - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/geo170k_qa_converted_67833.json
   #   sampling_strategy: "first:10%"
   # - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/geo170k_align_converted_60252.json
@@ -183,4 +183,7 @@ datasets:
   # - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/0718_0_30_s_academic_mc_v0_1_all.json # will be released in next version of LLaVA-NeXT-Video
   #   sampling_strategy: all
   # - json_path: /media/data/haozhe/VFM/onevision/llava_instruct/sharegpt4video.json # download from sharegpt4video
-  #   sampling_strategy: all
+  # - json_path: /mediaPFM/data/haozhe/onevision/llava_instruct/sharegpt4video.json
+  #   sampling_strategy: "first:10%"
+  - json_path: /media/data/haozhe/VFM/onevision/llava_instruct/train_convs_narration.jsonl
+    sampling_strategy: all

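Given the fields the updated _get_item reads ("video", "start_timestamp", "end_timestamp", plus the usual "conversations"), an entry in the newly listed train_convs_narration.jsonl presumably looks roughly like the sketch below. All values are invented for illustration, and the dataset name used to build the video folder (likely "EK100") may be injected by the dataset loader rather than stored per entry.

import json

# Hypothetical EK100 narration entry; field names follow what train.py reads,
# the values are made up for illustration only.
example_entry = {
    "video": "P01-P01_103",        # resolved to <video_folder>/EK100/P01/P01_103.MP4/ by _get_item
    "start_timestamp": "12.5",     # seconds; cast to float before process_EK100_video_with_decord
    "end_timestamp": "18.0",
    "conversations": [
        {"from": "human", "value": "<image>\nWhat is the person doing?"},
        {"from": "gpt", "value": "The person is chopping an onion."},
    ],
}
print(json.dumps(example_entry))  # one JSON object per line in the .jsonl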