
Commit 6c3db97

Author: Ye Shaokai
Fixed a recently introduced bug in action model prediction evaluation.
1 parent: 7665a5e

File tree

2 files changed: 140 additions, 140 deletions


.vscode/launch.json
Lines changed: 134 additions & 135 deletions
@@ -219,145 +219,144 @@
 // }

 //shaokai
-{
-    "version": "0.2.0",
-    "configurations": [
-        {
-            "name": "Run LLAVA Training with torchrun",
-            "type": "debugpy",
-            "request": "launch",
-            "module": "torch.distributed.run",
-            "env": {
-                "CUDA_VISIBLE_DEVICES": "0",
-                "OMP_NUM_THREADS": "8",
-                "NCCL_IB_DISABLE": "0",
-                "NCCL_IB_GID_INDEX": "3",
-                "NCCL_SOCKET_IFNAME": "eth0",
-                "HF_HOME": "/data/shaokai",
-                "NCCL_DEBUG": "INFO",
-                "ACCELERATE_CPU_AFFINITY": "1",
-                "WANDB_API_KEY": "4474ec79de023b0c3ffb43588ab6163264f875db",
-                "PYTHONPATH": "/data/shaokai/LLaVA-NeXT:/usr/local/lib/python3.10/site-packages/decord-0.6.0-py3.10-linux-x86_64.egg/"
-            },
-            "args": [
-                "--nproc_per_node=1",
-                "--nnodes=1",
-                "--node_rank=0",
-                "--master_addr=127.0.0.1",
-                "--master_port=29500",
-                "llava/train/train_mem.py",
-                "--deepspeed", "scripts/zero3.json",
-                "--model_name_or_path", "lmms-lab/llava-onevision-qwen2-0.5b-ov",
-                "--version", "qwen_1_5",
-                "--data_path", "scripts/train/simple_avion_top5_gt_and_direct.yaml",
-                "--video_folder", "/data/shaokai/EK100_512/",
-                "--mm_tunable_parts", "mm_vision_tower,mm_mlp_adapter,mm_language_model",
-                "--mm_vision_tower_lr", "2e-6",
-                "--vision_tower", "google/siglip-so400m-patch14-384",
-                "--mm_projector_type", "mlp2x_gelu",
-                "--mm_vision_select_layer", "-2",
-                "--mm_use_im_start_end", "False",
-                "--mm_use_im_patch_token", "False",
-                "--group_by_modality_length", "True",
-                "--image_aspect_ratio", "anyres_max_9",
-                "--image_grid_pinpoints", "(1x1),...,(6x6)",
-                "--mm_patch_merge_type", "spatial_unpad",
-                "--bf16", "True",
-                "--run_name", "dpo_test",
-                "--output_dir", "experiments/dpo_test",
-                "--num_train_epochs", "1",
-                "--per_device_train_batch_size", "4",
-                "--per_device_eval_batch_size", "4",
-                "--gradient_accumulation_steps", "2",
-                "--evaluation_strategy", "steps",
-                "--save_strategy", "steps",
-                "--save_steps", "1000",
-                "--save_total_limit", "1",
-                "--learning_rate", "1e-5",
-                "--weight_decay", "0.",
-                "--warmup_ratio", "0.03",
-                "--lr_scheduler_type", "cosine",
-                "--logging_steps", "1",
-                "--tf32", "True",
-                "--model_max_length", "32768",
-                "--gradient_checkpointing", "True",
-                "--dataloader_num_workers", "4",
-                "--lazy_preprocess", "True",
-                "--report_to", "wandb",
-                "--torch_compile", "True",
-                "--torch_compile_backend", "inductor",
-                "--dataloader_drop_last", "True",
-                "--frames_upbound", "4",
-                "--root", "/data/shaokai/EK100_512/EK100",
-                "--action_predictions", "/data/shaokai/AVION_PREDS/avion_pred_ids_val.json",
-                "--val_metadata", "/data/shaokai/epic-kitchens-100-annotations/EPIC_100_validation.csv",
-                "--llava_num_frames", "4",
-                "--clip_length", "4",
-                "--action_representation", "official_key",
-                "--topk_predictions", "5",
-                "--eval_steps", "1",
-                "--vision_supervision", "three_tokens",
-                "--vision_token_training", "all_layers",
-                "--action_types", "97,300,3806",
-                "--learn_neighbor_actions", "prior",
-                "--test_type", "temporal_cot"
-            ],
-            "console": "integratedTerminal",
-            "justMyCode": false,
-            "cwd": "${workspaceFolder}"
-        }
-    ]
-}
-
-
 // {
-//     "version": "0.2.0",
-//     "configurations": [
-//         {
-//             "name": "Run LLAVA Training with torchrun",
-//             "type": "debugpy",
-//             "request": "launch",
-//             "module": "torch.distributed.run",
-//             "env": {
-//                 "CUDA_VISIBLE_DEVICES": "0",
-//                 "OMP_NUM_THREADS": "8",
-//                 "NCCL_IB_DISABLE": "0",
-//                 "NCCL_IB_GID_INDEX": "3",
-//                 "NCCL_SOCKET_IFNAME": "eth0",
-//                 "HF_HOME": "/data/shaokai",
-//                 "NCCL_DEBUG": "INFO",
-//                 "ACCELERATE_CPU_AFFINITY": "1",
-//                 "WANDB_API_KEY": "4474ec79de023b0c3ffb43588ab6163264f875db",
-//                 "PYTHONPATH": "/data/shaokai/LLaVA-NeXT:/usr/local/lib/python3.10/site-packages/decord-0.6.0-py3.10-linux-x86_64.egg/"
-//             },
-//             "args": [
-//                 "--nproc_per_node=1",
-//                 "--nnodes=1",
-//                 "--node_rank=0",
-//                 "--master_addr=127.0.0.1",
-//                 "--master_port=29500",
-//                 "llava/action/ek_eval.py",
-//                 "--pretrained_name", "experiments/dev_0.5b_4f_avion_top5_and_direct_neighbor",
-//                 "--root", "/data/shaokai/EK100",
-//                 "--train-metadata", "/data/shaokai/epic-kitchens-100-annotations/EPIC_100_train.csv",
-//                 "--val-metadata", "/data/shaokai/epic-kitchens-100-annotations/EPIC_100_validation.csv",
-//                 "--llava_num_frames", "4",
-//                 "--clip-length", "4",
-//                 "--action_predictions", "/data/shaokai/TIM_PREDS/tim_pred_ids_val.json",
-//                 "--action_representation", "official_key",
-//                 "--topk_predictions", "5",
-//                 "--test_type", "temporal_cot",
-//                 "--output_dir", "test_0.5b_direct",
-//                 "--learn_neighbor_actions"
-//             ],
-//             "console": "integratedTerminal",
-//             "justMyCode": false,
-//             "cwd": "${workspaceFolder}"
-//         }
-//     ]
+//     "version": "0.2.0",
+//     "configurations": [
+//         {
+//             "name": "Run LLAVA Training with torchrun",
+//             "type": "debugpy",
+//             "request": "launch",
+//             "module": "torch.distributed.run",
+//             "env": {
+//                 "CUDA_VISIBLE_DEVICES": "0",
+//                 "OMP_NUM_THREADS": "8",
+//                 "NCCL_IB_DISABLE": "0",
+//                 "NCCL_IB_GID_INDEX": "3",
+//                 "NCCL_SOCKET_IFNAME": "eth0",
+//                 "HF_HOME": "/data/shaokai",
+//                 "NCCL_DEBUG": "INFO",
+//                 "ACCELERATE_CPU_AFFINITY": "1",
+//                 "WANDB_API_KEY": "4474ec79de023b0c3ffb43588ab6163264f875db",
+//                 "PYTHONPATH": "/data/shaokai/LLaVA-NeXT:/usr/local/lib/python3.10/site-packages/decord-0.6.0-py3.10-linux-x86_64.egg/"
+//             },
+//             "args": [
+//                 "--nproc_per_node=1",
+//                 "--nnodes=1",
+//                 "--node_rank=0",
+//                 "--master_addr=127.0.0.1",
+//                 "--master_port=29500",
+//                 "llava/train/train_mem.py",
+//                 "--deepspeed", "scripts/zero3.json",
+//                 "--model_name_or_path", "lmms-lab/llava-onevision-qwen2-0.5b-ov",
+//                 "--version", "qwen_1_5",
+//                 "--data_path", "scripts/train/simple_avion_top5_gt_and_direct.yaml",
+//                 "--video_folder", "/data/shaokai/EK100_512/",
+//                 "--mm_tunable_parts", "mm_vision_tower,mm_mlp_adapter,mm_language_model",
+//                 "--mm_vision_tower_lr", "2e-6",
+//                 "--vision_tower", "google/siglip-so400m-patch14-384",
+//                 "--mm_projector_type", "mlp2x_gelu",
+//                 "--mm_vision_select_layer", "-2",
+//                 "--mm_use_im_start_end", "False",
+//                 "--mm_use_im_patch_token", "False",
+//                 "--group_by_modality_length", "True",
+//                 "--image_aspect_ratio", "anyres_max_9",
+//                 "--image_grid_pinpoints", "(1x1),...,(6x6)",
+//                 "--mm_patch_merge_type", "spatial_unpad",
+//                 "--bf16", "True",
+//                 "--run_name", "dpo_test",
+//                 "--output_dir", "experiments/dpo_test",
+//                 "--num_train_epochs", "1",
+//                 "--per_device_train_batch_size", "4",
+//                 "--per_device_eval_batch_size", "4",
+//                 "--gradient_accumulation_steps", "2",
+//                 "--evaluation_strategy", "steps",
+//                 "--save_strategy", "steps",
+//                 "--save_steps", "1000",
+//                 "--save_total_limit", "1",
+//                 "--learning_rate", "1e-5",
+//                 "--weight_decay", "0.",
+//                 "--warmup_ratio", "0.03",
+//                 "--lr_scheduler_type", "cosine",
+//                 "--logging_steps", "1",
+//                 "--tf32", "True",
+//                 "--model_max_length", "32768",
+//                 "--gradient_checkpointing", "True",
+//                 "--dataloader_num_workers", "4",
+//                 "--lazy_preprocess", "True",
+//                 "--report_to", "wandb",
+//                 "--torch_compile", "True",
+//                 "--torch_compile_backend", "inductor",
+//                 "--dataloader_drop_last", "True",
+//                 "--frames_upbound", "4",
+//                 "--root", "/data/shaokai/EK100_512/EK100",
+//                 "--action_predictions", "/data/shaokai/AVION_PREDS/avion_pred_ids_val.json",
+//                 "--val_metadata", "/data/shaokai/epic-kitchens-100-annotations/EPIC_100_validation.csv",
+//                 "--llava_num_frames", "4",
+//                 "--clip_length", "4",
+//                 "--action_representation", "official_key",
+//                 "--topk_predictions", "5",
+//                 "--eval_steps", "1",
+//                 "--vision_supervision", "three_tokens",
+//                 "--vision_token_training", "all_layers",
+//                 "--action_types", "97,300,3806",
+//                 "--learn_neighbor_actions", "prior",
+//                 "--test_type", "base"
+//             ],
+//             "console": "integratedTerminal",
+//             "justMyCode": false,
+//             "cwd": "${workspaceFolder}"
+//         }
+//     ]
 // }


+{
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Run LLAVA Training with torchrun",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "torch.distributed.run",
+            "env": {
+                "CUDA_VISIBLE_DEVICES": "0",
+                "OMP_NUM_THREADS": "8",
+                "NCCL_IB_DISABLE": "0",
+                "NCCL_IB_GID_INDEX": "3",
+                "NCCL_SOCKET_IFNAME": "eth0",
+                "HF_HOME": "/data/shaokai",
+                "NCCL_DEBUG": "INFO",
+                "ACCELERATE_CPU_AFFINITY": "1",
+                "WANDB_API_KEY": "4474ec79de023b0c3ffb43588ab6163264f875db",
+                "PYTHONPATH": "/data/shaokai/LLaVA-NeXT:/usr/local/lib/python3.10/site-packages/decord-0.6.0-py3.10-linux-x86_64.egg/"
+            },
+            "args": [
+                "--nproc_per_node=1",
+                "--nnodes=1",
+                "--node_rank=0",
+                "--master_addr=127.0.0.1",
+                "--master_port=29500",
+                "llava/action/ek_eval.py",
+                "--pretrained_name", "experiments/dev_0.5b_4f_avion_top5_and_direct_neighbor",
+                "--root", "/data/shaokai/EK100",
+                "--train-metadata", "/data/shaokai/epic-kitchens-100-annotations/EPIC_100_train.csv",
+                "--val-metadata", "/data/shaokai/epic-kitchens-100-annotations/EPIC_100_validation.csv",
+                "--llava_num_frames", "4",
+                "--clip-length", "4",
+                "--action_predictions", "/data/shaokai/TIM_PREDS/tim_pred_ids_val.json",
+                "--action_representation", "GT_random_narration",
+                "--topk_predictions", "5",
+                "--test_type", "base",
+                "--output_dir", "test_0.5b_direct",
+            ],
+            "console": "integratedTerminal",
+            "justMyCode": false,
+            "cwd": "${workspaceFolder}"
+        }
+    ]
+}
+
+
 // {
 //     "version": "0.2.0",
 //     "configurations": [

llava/action/utils.py
Lines changed: 6 additions & 5 deletions
@@ -550,17 +550,18 @@ def test_generate(self,
         answer_ids.pop()
         answer_ids.append(gt_vn)

-        # let's shuffle answer_ids so that the gt_vn is not always at the end
-        random.shuffle(answer_ids)
-
+
         answers = []
         for answer_id in answer_ids:
             answer = parse_vn_ids(answer_id, gt_vn, narration, action_representation, n_narrations, labels, mapping_vn2narration, verb_maps, noun_maps)
             answers.append(answer)
+        avion_pred = answers[0]
+
+        random.shuffle(answers)

         letters = [chr(65+i) for i in range(26)][:k]
         options = list(range(26))[:k]
-
+
         options = []
         for answer, letter in zip(answers, letters):
             options.append(f'{letter}. {answer}')
@@ -572,7 +573,7 @@ def test_generate(self,
             'options': {0: options},
             'gt_answer_name': {0: gt_answer},
             'valid_letters': letters,
-            'avion_pred': answers[0],
+            'avion_pred': avion_pred,
             'all_avion_preds': answers
         }
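This is the bug named in the commit message: a recently added random.shuffle(answer_ids) ran before the parsed answers were built, so the value stored under 'avion_pred' (answers[0]) was whatever option happened to land first after shuffling, not the action model's top-1 prediction. The fix records answers[0] before shuffling and shuffles the parsed answers instead, which still keeps the ground-truth option from always sitting in the last slot. A minimal sketch of the before/after logic, with hypothetical helper names and toy data rather than the repository code:

import random

def build_mcq_buggy(ranked_answers):
    # Bug: shuffling first makes index 0 an arbitrary option.
    answers = list(ranked_answers)
    random.shuffle(answers)
    avion_pred = answers[0]  # usually NOT the top-1 prediction
    return avion_pred, answers

def build_mcq_fixed(ranked_answers):
    # Fix: record the top-1 prediction, then shuffle the displayed options.
    answers = list(ranked_answers)
    avion_pred = answers[0]
    random.shuffle(answers)
    letters = [chr(65 + i) for i in range(len(answers))]  # 'A', 'B', 'C', ...
    options = [f"{letter}. {answer}" for letter, answer in zip(letters, answers)]
    return avion_pred, options

ranked = ["take plate", "wash plate", "open drawer"]  # toy ranked predictions
pred, options = build_mcq_fixed(ranked)
assert pred == "take plate"  # the top-1 prediction survives the shuffle

With the buggy version, the assert would fail two times out of three on average, and the downstream evaluation would score the wrong string as the action model's prediction.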
