
Commit a296605

fix the instruction generation
1 parent e22fd9e commit a296605

File tree

5 files changed: +204 additions, -119 deletions

.vscode/launch.json

Lines changed: 101 additions & 98 deletions
@@ -1,96 +1,11 @@
-// {
-//     "version": "0.2.0",
-//     "configurations": [
-//         {
-//             "name": "Run LLAVA Training with torchrun",
-//             "type": "debugpy",
-//             "request": "launch",
-//             "module": "torch.distributed.run",
-//             "env": {
-//                 "CUDA_VISIBLE_DEVICES": "0,1,2,3",
-//                 "OMP_NUM_THREADS": "8",
-//                 "NCCL_IB_DISABLE": "0",
-//                 "NCCL_IB_GID_INDEX": "3",
-//                 "NCCL_SOCKET_IFNAME": "eth0",
-//                 "NCCL_DEBUG": "INFO",
-//                 "ACCELERATE_CPU_AFFINITY": "1",
-//                 "LD_PRELOAD": "/usr/lib/x86_64-linux-gnu/libffi.so.7",
-//                 "WANDB_API_KEY": "65aeda82a75f1eed29c8e9250b175fcc73dca0d7",
-//                 "CUDA_LAUNCH_BLOCKING": "1",
-//                 "HF_HOME": "/media/data/haozhe/VFM/huggingface",
-//             },
-//             "args": [
-//                 "--nproc_per_node=4",
-//                 "--nnodes=1",
-//                 "llava/train/train_mem.py",
-//                 "--deepspeed", "scripts/zero3.json",
-//                 "--model_name_or_path", "lmms-lab/llava-onevision-qwen2-0.5b-ov",
-//                 "--version", "qwen_1_5",
-//                 "--data_path", "scripts/train/llava_video.yaml",
-//                 "--video_folder", "/media/data/haozhe/VFM/onevision/llava_video",
-//                 "--mm_tunable_parts", "mm_vision_tower,mm_mlp_adapter,mm_language_model",
-//                 "--mm_vision_tower_lr", "2e-6",
-//                 "--vision_tower", "google/siglip-so400m-patch14-384",
-//                 "--mm_projector_type", "mlp2x_gelu",
-//                 "--mm_vision_select_layer", "-2",
-//                 "--mm_use_im_start_end", "False",
-//                 "--mm_use_im_patch_token", "False",
-//                 "--group_by_modality_length", "True",
-//                 "--image_aspect_ratio", "anyres_max_9",
-//                 "--image_grid_pinpoints", "(1x1),...,(6x6)",
-//                 "--mm_patch_merge_type", "spatial_unpad",
-//                 "--bf16", "True",
-//                 "--run_name", "dev_0.5b_4f_llavavideo_haozhe",
-//                 "--output_dir", "experiments/dev_0.5b_4f_llavavideo_haozhe",
-//                 "--num_train_epochs", "1",
-//                 "--per_device_train_batch_size", "8",
-//                 "--per_device_eval_batch_size", "4",
-//                 "--gradient_accumulation_steps", "2",
-//                 "--evaluation_strategy", "epoch",
-//                 "--eval_steps", "1",
-//                 "--save_strategy", "steps",
-//                 "--save_steps", "2000",
-//                 "--learning_rate", "1e-5",
-//                 "--weight_decay", "0.",
-//                 "--warmup_ratio", "0.03",
-//                 "--lr_scheduler_type", "cosine",
-//                 "--logging_steps", "1",
-//                 "--tf32", "True",
-//                 "--model_max_length", "32768",
-//                 "--gradient_checkpointing", "True",
-//                 "--dataloader_num_workers", "4",
-//                 "--lazy_preprocess", "True",
-//                 "--report_to", "wandb",
-//                 "--torch_compile", "True",
-//                 "--torch_compile_backend", "inductor",
-//                 "--dataloader_drop_last", "True",
-//                 "--frames_upbound", "4",
-//                 "--root", "/media/data/haozhe/VFM/onevision/llava_video/EK100",
-//                 "--action_predictions", "/media/data/haozhe/VFM/EK100/EK100_in_LLAVA/TIM/tim_pred_ids_val.json",
-//                 "--val_metadata", "/media/data/haozhe/VFM/EK100/epic-kitchens-100-annotations/EPIC_100_validation.csv",
-//                 "--add_time_instruction", "False",
-//                 "--llava_num_frames", "4",
-//                 "--clip_length", "4",
-//                 "--action_representation", "official_key",
-//                 "--topk_predictions", "5"
-//             ],
-//             "console": "integratedTerminal",
-//             "justMyCode": false,
-//             "cwd": "${workspaceFolder}"
-//         }
-//     ]
-// }
-
-
 {
     "version": "0.2.0",
     "configurations": [
         {
             "name": "Run LLAVA Training with torchrun",
             "type": "debugpy",
             "request": "launch",
-            "python": "/media/data/haozhe/VFM/llmseval-venv/bin/python",
-            "module": "accelerate.commands.launch",
+            "module": "torch.distributed.run",
             "env": {
                 "CUDA_VISIBLE_DEVICES": "0,1,2,3",
                 "OMP_NUM_THREADS": "8",
@@ -103,20 +18,64 @@
                 "WANDB_API_KEY": "65aeda82a75f1eed29c8e9250b175fcc73dca0d7",
                 "CUDA_LAUNCH_BLOCKING": "1",
                 "HF_HOME": "/media/data/haozhe/VFM/huggingface",
-                "OPENAI_API_KEY": "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"
             },
             "args": [
-                "--num_processes", "4",
-                "-m", "lmms_eval",
-                // "--model", "llava_vid",
-                "--model", "llava_onevision",
-                // "--model_args", "pretrained=experiments/dev_LLaVA-Video-7B-Qwen2_4f_test_haozhe,conv_template=qwen_1_5,max_frames_num=64,mm_spatial_pool_mode=average",
-                "--model_args", "pretrained=lmms-lab/llava-onevision-qwen2-0.5b-ov,conv_template=qwen_1_5,model_name=llava_qwen",
-                "--tasks", "video_dc499",
-                "--batch_size", "1",
-                "--log_samples",
-                "--log_samples_suffix", "llava_onevision",
-                "--output_path", "./logs/"
+                "--nproc_per_node=4",
+                "--nnodes=1",
+                "llava/train/train_mem.py",
+                "--deepspeed", "scripts/zero3.json",
+                "--model_name_or_path", "lmms-lab/llava-onevision-qwen2-0.5b-ov",
+                "--version", "qwen_1_5",
+                "--data_path", "scripts/train/llava_video_RCP.yaml",
+                "--video_folder", "/media/data/haozhe/VFM/onevision/llava_video",
+                "--mm_tunable_parts", "mm_vision_tower,mm_mlp_adapter,mm_language_model",
+                "--mm_vision_tower_lr", "2e-6",
+                "--vision_tower", "google/siglip-so400m-patch14-384",
+                "--mm_projector_type", "mlp2x_gelu",
+                "--mm_vision_select_layer", "-2",
+                "--mm_use_im_start_end", "False",
+                "--mm_use_im_patch_token", "False",
+                "--group_by_modality_length", "True",
+                "--image_aspect_ratio", "anyres_max_9",
+                "--image_grid_pinpoints", "(1x1),...,(6x6)",
+                "--mm_patch_merge_type", "spatial_unpad",
+                "--bf16", "True",
+                "--run_name", "dev_0.5b_llavavideo_haozhe",
+                "--output_dir", "experiments/dev_0.5b_llavavideo_haozhe",
+                "--num_train_epochs", "1",
+                "--per_device_train_batch_size", "1",
+                "--per_device_eval_batch_size", "4",
+                "--gradient_accumulation_steps", "2",
+                "--evaluation_strategy", "epoch",
+                "--eval_steps", "1",
+                "--save_strategy", "steps",
+                "--save_steps", "2000",
+                "--learning_rate", "1e-5",
+                "--weight_decay", "0.",
+                "--warmup_ratio", "0.03",
+                "--lr_scheduler_type", "cosine",
+                "--logging_steps", "1",
+                "--tf32", "True",
+                "--model_max_length", "32768",
+                "--gradient_checkpointing", "True",
+                "--dataloader_num_workers", "4",
+                "--lazy_preprocess", "True",
+                "--report_to", "wandb",
+                "--torch_compile", "True",
+                "--torch_compile_backend", "inductor",
+                "--dataloader_drop_last", "True",
+                "--frames_upbound", "64",
+                "--mm_newline_position", "grid",
+                "--add_time_instruction", "True",
+                "--force_sample", "True",
+                "--mm_spatial_pool_stride", "2",
+                "--root", "/media/data/haozhe/VFM/onevision/llava_video/EK100",
+                "--action_predictions", "/media/data/haozhe/VFM/EK100/EK100_in_LLAVA/TIM/tim_pred_ids_val.json",
+                "--val_metadata", "/media/data/haozhe/VFM/EK100/epic-kitchens-100-annotations/EPIC_100_validation.csv",
+                "--llava_num_frames", "64",
+                "--clip_length", "64",
+                "--action_representation", "official_key",
+                "--topk_predictions", "5"
             ],
             "console": "integratedTerminal",
             "justMyCode": false,
@@ -125,6 +84,50 @@
     ]
 }
 
+
+// {
+//     "version": "0.2.0",
+//     "configurations": [
+//         {
+//             "name": "Run LLAVA Training with torchrun",
+//             "type": "debugpy",
+//             "request": "launch",
+//             "python": "/media/data/haozhe/VFM/llmseval-venv/bin/python",
+//             "module": "accelerate.commands.launch",
+//             "env": {
+//                 "CUDA_VISIBLE_DEVICES": "0,1,2,3",
+//                 "OMP_NUM_THREADS": "8",
+//                 "NCCL_IB_DISABLE": "0",
+//                 "NCCL_IB_GID_INDEX": "3",
+//                 "NCCL_SOCKET_IFNAME": "eth0",
+//                 "NCCL_DEBUG": "INFO",
+//                 "ACCELERATE_CPU_AFFINITY": "1",
+//                 "LD_PRELOAD": "/usr/lib/x86_64-linux-gnu/libffi.so.7",
+//                 "WANDB_API_KEY": "65aeda82a75f1eed29c8e9250b175fcc73dca0d7",
+//                 "CUDA_LAUNCH_BLOCKING": "1",
+//                 "HF_HOME": "/media/data/haozhe/VFM/huggingface",
+//                 "OPENAI_API_KEY": "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"
+//             },
+//             "args": [
+//                 "--num_processes", "4",
+//                 "-m", "lmms_eval",
+//                 // "--model", "llava_vid",
+//                 "--model", "llava_onevision",
+//                 // "--model_args", "pretrained=experiments/dev_LLaVA-Video-7B-Qwen2_4f_test_haozhe,conv_template=qwen_1_5,max_frames_num=64,mm_spatial_pool_mode=average",
+//                 "--model_args", "pretrained=lmms-lab/llava-onevision-qwen2-0.5b-ov,conv_template=qwen_1_5,model_name=llava_qwen",
+//                 "--tasks", "video_dc499",
+//                 "--batch_size", "1",
+//                 "--log_samples",
+//                 "--log_samples_suffix", "llava_onevision",
+//                 "--output_path", "./logs/"
+//             ],
+//             "console": "integratedTerminal",
+//             "justMyCode": false,
+//             "cwd": "${workspaceFolder}"
+//         }
+//     ]
+// }
+
 // {
 //     // Use IntelliSense to learn about possible attributes.
 //     // Hover to view descriptions of existing attributes.

llava/train/train.py

Lines changed: 14 additions & 12 deletions
@@ -1206,7 +1206,8 @@ def _get_item(self, i) -> Dict[str, torch.Tensor]:
             if not os.path.exists(video_file):
                 print("File {} not exist!".format(video_file))
 
-            try:
+            # try:
+            if True:
                 if "sharegpt4video" in video_folder:
                     frame_files = [os.path.join(video_file, f) for f in os.listdir(video_file) if os.path.isfile(os.path.join(video_file, f))]
                     frame_files.sort()  # Ensure the frames are sorted if they are named sequentially
@@ -1279,11 +1280,7 @@ def _get_item(self, i) -> Dict[str, torch.Tensor]:
 
                 processor = self.data_args.image_processor
                 image = processor.preprocess(video, return_tensors="pt")["pixel_values"]
-                if 'EK100' not in video_file and 'EKframes' not in video_folder:
-                    if self.data_args.add_time_instruction:
-                        time_instruciton = f"The video lasts for {video_time:.2f} seconds, and {num_frames_to_sample} frames are uniformly sampled from it. Please answer the following questions related to this video."
-                        sources[0]["conversations"][0]["value"] = f'{DEFAULT_IMAGE_TOKEN}\n{time_instruciton}\n{sources[0]["conversations"][0]["value"].replace(DEFAULT_IMAGE_TOKEN, "")}'
-                else:
+                if 'EK100' in video_file or 'EKframes' in video_folder:
                     # We use our own prompting logic when it's EK100
                     # We turn a string of list to a python list
                     question_type = sources[0]['question_type']
@@ -1306,16 +1303,21 @@ def _get_item(self, i) -> Dict[str, torch.Tensor]:
                                             include_frame_time = False)
                     sources[0]["conversations"][0]["value"] = llava_prompt
                     # rank0_print (sources[0])
+
+                if self.data_args.add_time_instruction:
+                    time_instruciton = f"The video lasts for {video_time:.2f} seconds, and {num_frames_to_sample} frames are uniformly sampled from it. Please answer the following questions related to this video."
+                    sources[0]["conversations"][0]["value"] = f'{DEFAULT_IMAGE_TOKEN}\n{time_instruciton}\n{sources[0]["conversations"][0]["value"].replace(DEFAULT_IMAGE_TOKEN, "")}'
+
                 action = torch.tensor([sources[0]['verb_id'], sources[0]['noun_id'], sources[0]['action_id']] if 'verb_id' in sources[0] else [-1, -1, -1]).long()
                 image = [(image, video[0].size, "video", action)]
                 sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args)
                 # print(sources)
-            except Exception as e:
-                import traceback
-                traceback.print_exc()
-                print(f"Error: {e}")
-                print(f"Failed to read video file: {video_file}")
-                return self._get_item(i + 1)
+            # except Exception as e:
+            #     import traceback
+            #     traceback.print_exc()
+            #     print(f"Error: {e}")
+            #     print(f"Failed to read video file: {video_file}")
+            #     return self._get_item(i + 1)
         else:
             sources = copy.deepcopy([e["conversations"] for e in sources])
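Note on the train.py hunk, which is the change behind the commit message: before this commit the time instruction was prepended only in the non-EK100 branch, so EK100/EKframes samples went through the custom prompt builder without it. After the change, the EK100-specific prompt is built first and the time instruction (when --add_time_instruction is enabled) is prepended for every video. A minimal sketch of the resulting flow, with simplified names; build_ek100_prompt is a hypothetical stand-in for the repo's own prompting logic, not its real API:

# Sketch only: mirrors the reordered logic in _get_item after this commit.
DEFAULT_IMAGE_TOKEN = "<image>"

def apply_instructions(sources, video_file, video_folder, video_time,
                       num_frames_to_sample, add_time_instruction, build_ek100_prompt):
    conv = sources[0]["conversations"][0]

    # 1) EK100 / EKframes samples get the custom prompt first.
    if "EK100" in video_file or "EKframes" in video_folder:
        conv["value"] = build_ek100_prompt(sources[0])  # hypothetical helper

    # 2) The time instruction now applies to all videos, EK100 included
    #    (previously it lived only in the non-EK100 branch).
    if add_time_instruction:
        time_instruction = (
            f"The video lasts for {video_time:.2f} seconds, and {num_frames_to_sample} "
            "frames are uniformly sampled from it. Please answer the following questions related to this video."
        )
        stripped = conv["value"].replace(DEFAULT_IMAGE_TOKEN, "")
        conv["value"] = f"{DEFAULT_IMAGE_TOKEN}\n{time_instruction}\n{stripped}"

    return sources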

run_clariden.sbatch

Lines changed: 6 additions & 7 deletions
@@ -8,8 +8,8 @@
 #SBATCH --ntasks-per-node 1 # number of MP tasks. IMPORTANT: torchrun represents just 1 Slurm task
 #SBATCH --gres gpu:4 # Number of GPUs
 #SBATCH --time 23:00:00 # maximum execution time (DD-HH:MM:SS). Mandatory field in MN5
-#SBATCH --output logs/R-%x.%j-dev_7b_4f_llavavideo_test_haozhe.out
-#SBATCH --error logs/R-%x.%j-dev_7b_4f_llavavideo_test_haozhe.err
+#SBATCH --output logs/R-%x.%j-dev_7b_64f_10llavavideo_EK100_haozhe.out
+#SBATCH --error logs/R-%x.%j-dev_7b_64f_10llavavideo_EK100_haozhe.err
 
 mkdir -p logs
 
@@ -71,8 +71,8 @@ PYTHON_ARGS=" \
     --image_grid_pinpoints \"(1x1),...,(6x6)\" \
     --mm_patch_merge_type spatial_unpad \
     --bf16 True \
-    --run_name dev_7b_4f_llavavideo_test_haozhe \
-    --output_dir experiments/dev_7b_4f_llavavideo_test_haozhe \
+    --run_name dev_7b_64f_10llavavideo_EK100_haozhe \
+    --output_dir experiments/dev_7b_64f_10llavavideo_EK100_haozhe \
     --num_train_epochs 1 \
     --per_device_train_batch_size 1 \
     --per_device_eval_batch_size 4 \
@@ -103,9 +103,8 @@ PYTHON_ARGS=" \
     --root /iopsstor/scratch/cscs/hqi/VFM/onevision/llava_video/EK100 \
     --action_predictions /iopsstor/scratch/cscs/hqi/VFM/llava_data/TIM_PREDS/tim_pred_ids_val.json \
     --val_metadata /iopsstor/scratch/cscs/hqi/VFM/EK100/epic-kitchens-100-annotations/EPIC_100_validation.csv \
-    --add_time_instruction False \
-    --llava_num_frames 4 \
-    --clip_length 4 \
+    --llava_num_frames 64 \
+    --clip_length 64 \
     --action_representation official_key \
     --topk_predictions 5 \
     "

scripts/train/llava_video.yaml

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 datasets:
   - json_path: /iopsstor/scratch/cscs/hqi/VFM/onevision/llava_video/LLaVA-Video-178K/0_30_s_academic_v0_1/0_30_s_academic_v0_1_cap_processed.json
-    sampling_strategy: "first:10%"
+    sampling_strategy: "first:1%"
   - json_path: /iopsstor/scratch/cscs/hqi/VFM/onevision/llava_video/LLaVA-Video-178K/0_30_s_youtube_v0_1/0_30_s_youtube_v0_1_cap_processed.json
     sampling_strategy: "first:10%"
   - json_path: /iopsstor/scratch/cscs/hqi/VFM/onevision/llava_video/LLaVA-Video-178K/30_60_s_academic_v0_1/30_60_s_academic_v0_1_cap_processed.json
@@ -78,4 +78,4 @@ datasets:
   # - json_path: /iopsstor/scratch/cscs/hqi/VFM/onevision/llava_video/LLaVA-Video-178K/llava_hound/sharegptvideo_qa_255k_processed.json
   #   sampling_strategy: "first:10%"
   - json_path: /media/data/haozhe/VFM/EK100/EK100_in_LLAVA/TIM/tim_mc_top5_official_key/train_convs_narration_actionids.jsonl
-    sampling_strategy: all
+    sampling_strategy: "first:1%"
