Commit 7665a5e ("update")
1 parent 8891c50

7 files changed: 25 additions, 23 deletions


.vscode/launch.json

Lines changed: 2 additions & 1 deletion
@@ -299,7 +299,8 @@
       "--vision_supervision", "three_tokens",
       "--vision_token_training", "all_layers",
       "--action_types", "97,300,3806",
-      "--perspective", "first_person"
+      "--learn_neighbor_actions", "prior",
+      "--test_type", "temporal_cot"
     ],
     "console": "integratedTerminal",
     "justMyCode": false,

llava/action/ek_eval.py

Lines changed: 2 additions & 2 deletions
@@ -125,7 +125,7 @@ def get_args_parser():
                                  'GT_key', 'GT_random_narration', 'GT_random_narration_cut', 'gpt_narration'])
     parser.add_argument('--n_narrations', default = -1, type = int)
     parser.add_argument('--test_type', default = 'base', type = str, choices = ['caption', 'base', 'temporal_cot', 'caption_then_answer', 'direct_narration'])
-    parser.add_argument('--learn_neighbor_actions', action='store_true', default = False)
+    parser.add_argument('--learn_neighbor_actions', type= str, default = "")
     parser.add_argument('--pseudo_folder', default = None, type = str)
     parser.add_argument('--output_dir', default = None, type = str)
     parser.add_argument("--perspective", default = "first_person", type = str)
@@ -168,7 +168,7 @@ def ensemble_llava_evaluation(
     clip_length,
     num_frames,
     test_type = 'base',
-    learn_neighbor_actions = False,
+    learn_neighbor_actions = "",
     time_meta = None,
     meta_data = None,
     perspective = "first_person"
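
Since --learn_neighbor_actions now takes a string mode rather than acting as a boolean switch, an empty string leaves the feature off and a value such as "prior" selects a mode. A minimal standalone argparse sketch of that behavior (not code from this repo; the argument list passed to parse_args is illustrative only):

# Standalone sketch; only the two arguments mirror this commit.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--test_type', default='base', type=str,
                    choices=['caption', 'base', 'temporal_cot', 'caption_then_answer', 'direct_narration'])
parser.add_argument('--learn_neighbor_actions', type=str, default="")

args = parser.parse_args(['--test_type', 'temporal_cot', '--learn_neighbor_actions', 'prior'])
print(args.learn_neighbor_actions)  # "prior"; an empty string (the default) disables the behavior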

llava/action/llava_inference.py

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@ def llava_inference(
     temperature = 0,
     test_type = 'base',
     time_meta = None,
-    learn_neighbor_actions = False,
+    learn_neighbor_actions = "",
     meta_data = None,
     perspective = "first_person"
 ):

llava/action/make_visualizations.py

Lines changed: 1 addition & 1 deletion
@@ -206,7 +206,7 @@ def visualize_with_llava(pretrained_path, uid, question_type, gen_type):
         num_frames=n_frames,
         temperature = 0,
         time_meta = time_meta,
-        learn_neighbor_actions = False,
+        learn_neighbor_actions = "",
         meta_data = None,
         perspective = perspective
     )

llava/action/utils.py

Lines changed: 17 additions & 15 deletions
@@ -224,7 +224,7 @@ def remove_sub_nouns_with_doc(doc, verb: str, noun: str) -> str:
     return processed_text


-def format_task_related_prompt(question, question_type, meta_data = None, perspective = "first_person", learn_neighbor_actions = False):
+def format_task_related_prompt(question, question_type, meta_data = None, perspective = "first_person", learn_neighbor_actions = ""):
     """
     Task related prompt is impacted by the question_type.
     We currently support mc_{action_representation} and gpt-gt-reason
@@ -238,7 +238,20 @@ def format_task_related_prompt(question, question_type, meta_data = None, perspe

     if question_type.startswith("mc_") or question_type == 'temporal_cot':

-        if learn_neighbor_actions and meta_data:
+        if question_type.startswith("mc_") and learn_neighbor_actions == "prior" and meta_data and random.random() < 0.3:
+            # this means it's training time and we are learning the prior actions
+            prefix = f"{perspective_prefix}\n"
+            assert isinstance(question, list)
+            suffix = ", ".join(question)
+            prev2_narration = meta_data['prev2_narration']
+            prev2_offset = meta_data['prev2_offset']
+            prev1_narration = meta_data['prev1_narration']
+            prev1_offset = meta_data['prev1_offset']
+            cur_narration = meta_data['cur_narration']
+            suffix = f"{prev2_offset} seconds ago, you started an action {prev2_narration}. {prev1_offset} seconds ago, you started an action {prev1_narration}. What action are you currently performing? Here are the options of actions you can select:\n" + suffix
+            ret = prefix + suffix
+        elif question_type == "temporal_cot" and learn_neighbor_actions == "prior" and meta_data:
+            # means it's test time
             prefix = f"{perspective_prefix}\n"
             assert isinstance(question, list)
             suffix = ", ".join(question)
@@ -264,18 +277,7 @@ def format_task_related_prompt(question, question_type, meta_data = None, perspe
         ret = question
     elif question_type == "gpt-gt-reason" or question_type == "caption":
         ret = f"{perspective_prefix} Describe in details what you see from the video frames. You must talk in the first person perspective. Try to focus on what you are doing. "
-
-    elif question_type == "triple_direct_answer":
-        assert meta_data
-        duration1 = meta_data[0]['duration']
-        duration2 = meta_data[1]['duration']
-        duration3 = meta_data[2]['duration']
-        prompt = f"The video consists of 3 sequential actions. What are the actions? Format your answer as action1, action2, action3."
-        ret = f"{perspective_prefix}{prompt}"
-
-
-    elif question_type == "validation":
-        ret = f"Ask yourself questions to validate your notes."
+

     elif question_type == "gpt-gt-strong-reason":
         ret = f"{perspective_prefix} Describe in details what you see and answer the multi-choice question. Explain why wrong answers are wrong and why the correct answer is correct. "
@@ -328,7 +330,7 @@ def format_llava_prompt(image_token,
                         include_time_instruction = False,
                         include_frame_time = False,
                         meta_data = None,
-                        learn_neighbor_actions = False,
+                        learn_neighbor_actions = "",
                         perspective = "first_person"
                         ):
     """

llava/model/language_model/llava_qwen.py

Lines changed: 1 addition & 1 deletion
@@ -244,7 +244,7 @@ def forward(
                 pass
             # by default, distilaltion uses all layers
             # First check if any process has valid examples across all triples
-            world_has_valid = torch.tensor(actions[:, 0].any() > 0, device=actions.device)
+            world_has_valid = torch.tensor(actions[:, 0].any() >= 0, device=actions.device)
             torch.distributed.all_reduce(world_has_valid, op=torch.distributed.ReduceOp.MAX)

             if world_has_valid: # If any process has valid examples
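
The switch from > 0 to >= 0 changes the local check: actions[:, 0].any() is a boolean tensor, and while comparing it with > 0 is true only when the first column has a nonzero entry, comparing with >= 0 is always true, so world_has_valid comes out true after the MAX all-reduce. A standalone illustration of the comparison semantics (not code from this repo; the tensor values are made up):

# Standalone check of the boolean-tensor comparison; placeholder data.
import torch

actions = torch.tensor([[0, 1], [0, 2]])
flag = actions[:, 0].any()   # tensor(False): no nonzero entries in column 0
print(flag > 0)              # tensor(False): the old check
print(flag >= 0)             # tensor(True): the new check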

llava/train/train.py

Lines changed: 1 addition & 2 deletions
@@ -201,7 +201,7 @@ class EK100EvalArguments:
     action_representation: str = "GT_random_narration_cut"
     n_narrations: int = -1
     test_type: str = 'base'
-    learn_neighbor_actions: bool = False
+    learn_neighbor_actions: str = "" # "prior", "triple_direct"
     perspective: str = "first_person"
     pseudo_folder: str = ""
     benchmark_testing: bool = False
@@ -990,7 +990,6 @@ def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer,
         from llava.action.generate_interval_pred import get_lookup_dict

         self.train_triple_lookup = get_lookup_dict(os.path.join(self.EK100_anno_root, 'EPIC_100_train.csv'))
-        #self.val_triple_lookup = get_lookup_dict(os.path.join(self.EK100_anno_root, 'EPIC_100_validation.csv'))

         # Handle multiple JSON files specified in the data_path
         if "{" in data_path and "}" in data_path:
