Commit 381dd49: "runable version on AMGm0"
1 parent 2e7cd5a

14 files changed: +589 / -252 lines

.gitignore

Lines changed: 4 additions & 1 deletion
@@ -15,7 +15,6 @@ dist
 # Editor
 .idea
 *.swp
-.vscode

 # Other
 .DS_Store
@@ -71,3 +70,7 @@ playground/*.json
 mlx_configs/
 data_processing/
 # demo/
+
+
+experiments/
+*.out

.vscode/launch.json

Lines changed: 101 additions & 0 deletions
{
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Run LLAVA Training with torchrun",
            "type": "debugpy",
            "request": "launch",
            "module": "torch.distributed.run",
            "env": {
                "CUDA_VISIBLE_DEVICES": "1,2",
                "OMP_NUM_THREADS": "8",
                "NCCL_IB_DISABLE": "0",
                "NCCL_IB_GID_INDEX": "3",
                "NCCL_SOCKET_IFNAME": "eth0",
                "NCCL_DEBUG": "INFO",
                "ACCELERATE_CPU_AFFINITY": "1",
                "LD_PRELOAD": "/usr/lib/x86_64-linux-gnu/libffi.so.7"
            },
            "args": [
                "--nproc_per_node=2",
                "--nnodes=1",
                "--node_rank=0",
                "--master_addr=127.0.0.1",
                "--master_port=29500",
                "llava/train/train_mem.py",
                "--deepspeed", "scripts/zero3.json",
                "--model_name_or_path", "lmms-lab/llava-onevision-qwen2-0.5b-ov",
                "--version", "qwen_1_5",
                "--data_path", "scripts/train/onevision.yaml",
                // "--image_folder", "/mediaPFM/data/haozhe/onevision/llava_data",
                "--image_folder", "/mediaPFM/data/haozhe/onevision/llava_data/geo3k/",
                "--video_folder", "/mediaPFM/data/haozhe/onevision/llava_video",
                "--mm_tunable_parts", "mm_vision_tower,mm_mlp_adapter,mm_language_model",
                "--mm_vision_tower_lr", "2e-6",
                "--vision_tower", "google/siglip-so400m-patch14-384",
                "--mm_projector_type", "mlp2x_gelu",
                "--mm_vision_select_layer", "-2",
                "--mm_use_im_start_end", "False",
                "--mm_use_im_patch_token", "False",
                "--group_by_modality_length", "True",
                "--image_aspect_ratio", "anyres_max_9",
                "--image_grid_pinpoints", "(1x1),...,(6x6)",
                "--mm_patch_merge_type", "spatial_unpad",
                "--bf16", "True",
                "--run_name", "test",
                "--output_dir", "experiments/test",
                "--num_train_epochs", "1",
                "--per_device_train_batch_size", "1",
                "--per_device_eval_batch_size", "4",
                "--gradient_accumulation_steps", "2",
                "--evaluation_strategy", "no",
                "--save_strategy", "steps",
                "--save_steps", "1000",
                "--save_total_limit", "1",
                "--learning_rate", "1e-5",
                "--weight_decay", "0.",
                "--warmup_ratio", "0.03",
                "--lr_scheduler_type", "cosine",
                "--logging_steps", "1",
                "--tf32", "True",
                "--model_max_length", "32768",
                "--gradient_checkpointing", "True",
                "--dataloader_num_workers", "4",
                "--lazy_preprocess", "True",
                "--report_to", "wandb",
                "--torch_compile", "True",
                "--torch_compile_backend", "inductor",
                "--dataloader_drop_last", "True",
                "--frames_upbound", "32"
            ],
            "console": "integratedTerminal",
            "justMyCode": false,
            "cwd": "${workspaceFolder}"
        }
    ]
}

// {
//     // Use IntelliSense to learn about possible attributes.
//     // Hover to view descriptions of existing attributes.
//     // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
//     "version": "0.2.0",
//     "configurations": [
//         {
//             "name": "Python: Current File",
//             "type": "debugpy",
//             "request": "launch",
//             "program": "docs/LLaVA_OneVision_Tutorials.py",
//             "console": "integratedTerminal",
//             "env": {
//                 "CUDA_VISIBLE_DEVICES": "0",
//                 "LD_PRELOAD": "/usr/lib/x86_64-linux-gnu/libffi.so.7",
//                 "LD_LIBRARY_PATH": "/home/haozhe/miniconda3/envs/llava/lib"
//             },
//             "justMyCode": false,
//             // "args": [
//             //     "--run_dir_name", "test",
//             //     // "--use_big_decoder"
//             // ]
//         }
//     ]
// }
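The configuration above simply wraps a torchrun command, so the same job can also be started outside VS Code. Below is a minimal sketch (illustrative, not part of the commit) that rebuilds that command with subprocess; the environment variables, ports, and paths are copied from the config and the flag list is abbreviated, so adjust both for your machine.

# Minimal sketch: launch the same torchrun job without the debugger.
# Env vars, ports, and paths mirror .vscode/launch.json above; adapt as needed.
import os
import subprocess

env = dict(os.environ,
           CUDA_VISIBLE_DEVICES="1,2",
           OMP_NUM_THREADS="8",
           NCCL_SOCKET_IFNAME="eth0",
           NCCL_DEBUG="INFO",
           ACCELERATE_CPU_AFFINITY="1")

cmd = [
    "torchrun", "--nproc_per_node=2", "--nnodes=1", "--node_rank=0",
    "--master_addr=127.0.0.1", "--master_port=29500",
    "llava/train/train_mem.py",
    "--deepspeed", "scripts/zero3.json",
    "--model_name_or_path", "lmms-lab/llava-onevision-qwen2-0.5b-ov",
    "--version", "qwen_1_5",
    "--data_path", "scripts/train/onevision.yaml",
    "--output_dir", "experiments/test",
    # ...remaining flags exactly as listed in the launch configuration above...
]
subprocess.run(cmd, env=env, check=True)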

add_dataset_name.py

Lines changed: 33 additions & 0 deletions
import json
import os

json_root = '/mediaPFM/data/haozhe/onevision/llava_instruct_old'
save_root = '/mediaPFM/data/haozhe/onevision/llava_instruct'

json_list = os.listdir(json_root)
for json_name in json_list:
    json_path = os.path.join(json_root, json_name)
    # Load either a .jsonl (one record per line) or a plain .json list.
    if json_path.endswith(".jsonl"):
        cur_data_dict = []
        with open(json_path, "r") as json_file:
            for line in json_file:
                cur_data_dict.append(json.loads(line.strip()))
    elif json_path.endswith(".json"):
        with open(json_path, "r") as json_file:
            cur_data_dict = json.load(json_file)
    else:
        raise ValueError(f"Unsupported file type: {json_path}")

    # Tag every record with the name of the file it came from.
    dataset_name = json_path.split('/')[-1].split('.')[0]
    for data in cur_data_dict:
        data['dataset_name'] = dataset_name

    # Save back in the same format under the new root.
    save_path = os.path.join(save_root, json_name)
    with open(save_path, "w") as json_file:
        if json_path.endswith(".jsonl"):
            for data in cur_data_dict:
                json_file.write(json.dumps(data) + "\n")
        elif json_path.endswith(".json"):
            json.dump(cur_data_dict, json_file, indent=4)

aa = 1  # leftover debug stop point
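To make the effect concrete, here is a tiny illustrative example (the record fields are hypothetical; only the added dataset_name key reflects what the script does):

# Illustrative only: what the tagging loop above does to one record from geo3k.json.
record = {"id": "geo3k_0", "image": "geo3k_0.png", "conversations": []}
record["dataset_name"] = "geo3k"  # derived from the source file name
print(record)
# {'id': 'geo3k_0', 'image': 'geo3k_0.png', 'conversations': [], 'dataset_name': 'geo3k'}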

docs/LLaVA_OneVision_Tutorials.ipynb

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,23 @@
2424
},
2525
{
2626
"cell_type": "code",
27-
"execution_count": null,
27+
"execution_count": 2,
2828
"metadata": {},
29-
"outputs": [],
29+
"outputs": [
30+
{
31+
"ename": "ImportError",
32+
"evalue": "cannot import name 'LlavaLlamaForCausalLM' from 'llava.model' (/media1/data/haozhe/VFM/LLaVA-NeXT/llava/model/__init__.py)",
33+
"output_type": "error",
34+
"traceback": [
35+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
36+
"\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
37+
"Cell \u001b[0;32mIn[2], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mllava\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbuilder\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m load_pretrained_model\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mllava\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmm_utils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m get_model_name_from_path, process_images, tokenizer_image_token\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mllava\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconstants\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX\n",
38+
"File \u001b[0;32m/media1/data/haozhe/VFM/LLaVA-NeXT/llava/model/builder.py:24\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mllava\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mllava\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconstants\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN\n\u001b[0;32m---> 24\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mllava\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m rank0_print\n\u001b[1;32m 27\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload_pretrained_model\u001b[39m(model_path, model_base, model_name, load_8bit\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m, load_4bit\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m, device_map\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mauto\u001b[39m\u001b[38;5;124m\"\u001b[39m, attn_implementation\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mflash_attention_2\u001b[39m\u001b[38;5;124m\"\u001b[39m, customized_config\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, overwrite_config\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 28\u001b[0m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdevice_map\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m device_map\n",
39+
"File \u001b[0;32m/media1/data/haozhe/VFM/LLaVA-NeXT/llava/__init__.py:1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m LlavaLlamaForCausalLM\n",
40+
"\u001b[0;31mImportError\u001b[0m: cannot import name 'LlavaLlamaForCausalLM' from 'llava.model' (/media1/data/haozhe/VFM/LLaVA-NeXT/llava/model/__init__.py)"
41+
]
42+
}
43+
],
3044
"source": [
3145
"from llava.model.builder import load_pretrained_model\n",
3246
"from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token\n",

docs/LLaVA_OneVision_Tutorials.py

Lines changed: 125 additions & 0 deletions
# from llava.model.builder import load_pretrained_model
# from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
# from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX
# from llava.conversation import conv_templates, SeparatorStyle

# from PIL import Image
# import requests
# import copy
# import torch

# import sys
# import warnings


# warnings.filterwarnings("ignore")
# pretrained = "lmms-lab/llava-onevision-qwen2-0.5b-si"
# model_name = "llava_qwen"
# device = "cuda"
# device_map = "auto"
# tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, device_map=device_map)  # Add any other thing you want to pass in llava_model_args

# model.eval()

# url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true"
# image = Image.open(requests.get(url, stream=True).raw)
# image_tensor = process_images([image], image_processor, model.config)
# image_tensor = [_image.to(dtype=torch.float16, device=device) for _image in image_tensor]

# conv_template = "qwen_1_5"  # Make sure you use correct chat template for different models
# question = DEFAULT_IMAGE_TOKEN + "\nWhat is shown in this image?"
# conv = copy.deepcopy(conv_templates[conv_template])
# conv.append_message(conv.roles[0], question)
# conv.append_message(conv.roles[1], None)
# prompt_question = conv.get_prompt()

# input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
# image_sizes = [image.size]

# cont = model.generate(
#     input_ids,
#     images=image_tensor,
#     image_sizes=image_sizes,
#     do_sample=False,
#     temperature=0,
#     max_new_tokens=4096,
# )
# text_outputs = tokenizer.batch_decode(cont, skip_special_tokens=True)
# print(text_outputs)


from operator import attrgetter
from llava.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX
from llava.conversation import conv_templates, SeparatorStyle

import torch
import cv2
import numpy as np
from PIL import Image
import requests
import copy
import warnings
from decord import VideoReader, cpu

warnings.filterwarnings("ignore")

# Load the OneVision model
pretrained = "lmms-lab/llava-onevision-qwen2-7b-ov"
model_name = "llava_qwen"
device = "cuda"
device_map = "auto"
tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, device_map=device_map, attn_implementation="sdpa")

model.eval()


# Function to extract frames from video
def load_video(video_path, max_frames_num):
    if type(video_path) == str:
        vr = VideoReader(video_path, ctx=cpu(0))
    else:
        vr = VideoReader(video_path[0], ctx=cpu(0))
    total_frame_num = len(vr)
    uniform_sampled_frames = np.linspace(0, total_frame_num - 1, max_frames_num, dtype=int)
    frame_idx = uniform_sampled_frames.tolist()
    spare_frames = vr.get_batch(frame_idx).asnumpy()
    return spare_frames  # (frames, height, width, channels)


# Load and process video
video_path = "docs/jobs.mp4"
video_frames = load_video(video_path, 16)
print(video_frames.shape)  # (16, 1024, 576, 3)
image_tensors = []
frames = image_processor.preprocess(video_frames, return_tensors="pt")["pixel_values"].half().cuda()
image_tensors.append(frames)

# Prepare conversation input
conv_template = "qwen_1_5"
question = f"{DEFAULT_IMAGE_TOKEN}\nDescribe what's happening in this video."

conv = copy.deepcopy(conv_templates[conv_template])
conv.append_message(conv.roles[0], question)
conv.append_message(conv.roles[1], None)
prompt_question = conv.get_prompt()

input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
image_sizes = [frame.size for frame in video_frames]

# Generate response
cont = model.generate(
    input_ids,
    images=image_tensors,
    image_sizes=image_sizes,
    do_sample=False,
    temperature=0,
    max_new_tokens=4096,
    modalities=["video"],
)
text_outputs = tokenizer.batch_decode(cont, skip_special_tokens=True)
print(text_outputs[0])

docs/download_data.py

Lines changed: 64 additions & 0 deletions
import os
from datasets import load_dataset
from tqdm import tqdm
import json
import yaml

# Dataset subsets available in lmms-lab/LLaVA-OneVision-Data.
available_datasets = ['CLEVR-Math(MathV360K)', 'FigureQA(MathV360K)', 'GEOS(MathV360K)', 'GeoQA+(MathV360K)',
                      'Geometry3K(MathV360K)', 'IconQA(MathV360K)', 'MapQA(MathV360K)', 'PMC-VQA(MathV360K)',
                      'Super-CLEVR(MathV360K)', 'TabMWP(MathV360K)', 'UniGeo(MathV360K)', 'VisualWebInstruct(filtered)',
                      'VizWiz(MathV360K)', 'ai2d(cauldron,llava_format)', 'ai2d(gpt4v)', 'ai2d(internvl)',
                      'allava_instruct_laion4v', 'allava_instruct_vflan4v', 'aokvqa(cauldron,llava_format)',
                      'chart2text(cauldron)', 'chartqa(cauldron,llava_format)', 'chrome_writting',
                      'clevr(cauldron,llava_format)', 'diagram_image_to_text(cauldron)', 'dvqa(cauldron,llava_format)',
                      'figureqa(cauldron,llava_format)', 'geo170k(align)', 'geo170k(qa)', 'geo3k', 'geomverse(cauldron)',
                      'hateful_memes(cauldron,llava_format)', 'hitab(cauldron,llava_format)', 'hme100k',
                      'iam(cauldron)', 'iconqa(cauldron,llava_format)', 'iiit5k', 'image_textualization(filtered)',
                      'infographic(gpt4v)', 'infographic_vqa', 'infographic_vqa_llava_format',
                      'intergps(cauldron,llava_format)', 'k12_printing', 'llavar_gpt4_20k', 'lrv_chart',
                      'lrv_normal(filtered)', 'magpie_pro(l3_80b_mt)', 'magpie_pro(l3_80b_st)',
                      'magpie_pro(qwen2_72b_st)', 'mapqa(cauldron,llava_format)', 'mathqa', 'mavis_math_metagen',
                      'mavis_math_rule_geo', 'multihiertt(cauldron)', 'orand_car_a', 'raven(cauldron)',
                      'rendered_text(cauldron)', 'robut_sqa(cauldron)', 'robut_wikisql(cauldron)',
                      'robut_wtq(cauldron,llava_format)', 'scienceqa(cauldron,llava_format)', 'scienceqa(nona_context)',
                      'screen2words(cauldron)', 'sharegpt4o', 'sharegpt4v(coco)', 'sharegpt4v(knowledge)',
                      'sharegpt4v(llava)', 'sharegpt4v(sam)', 'sroie', 'st_vqa(cauldron,llava_format)',
                      'tabmwp(cauldron)', 'tallyqa(cauldron,llava_format)', 'textcaps', 'textocr(gpt4v)',
                      'tqa(cauldron,llava_format)', 'ureader_cap', 'ureader_ie', 'vision_flan(filtered)',
                      'vistext(cauldron)', 'visual7w(cauldron,llava_format)', 'visualmrc(cauldron)',
                      'vqarad(cauldron,llava_format)', 'vsr(cauldron,llava_format)', 'websight(cauldron)']

chosen_datasets = ['sharegpt4v(sam)', 'sharegpt4v(llava)']

image_base = "/mediaPFM/data/haozhe/onevision/llava_data"
json_base = "/mediaPFM/data/haozhe/onevision/llava_instruct"
dataset_yaml = 'scripts/train/onevision.yaml'

# # open the yaml file
# with open(dataset_yaml, 'r') as f:
#     dataset_config = yaml.safe_load(f)

# dataset_paths = {}
# for data_info in dataset_config['datasets']:
#     dataset_paths[data_info['json_path'].split('/')[-1]] = data_info['json_path']


# Download each chosen subset, save its images to disk, and convert the
# annotations into LLaVA-style json records.
for dataset_name in chosen_datasets:
    data = load_dataset("lmms-lab/LLaVA-OneVision-Data", dataset_name, split="train")
    converted_data = []

    image_folder = os.path.join(image_base, dataset_name)
    os.makedirs(image_folder, exist_ok=True)

    for da in tqdm(data):
        json_data = {}
        json_data["id"] = da["id"]
        if da["image"] is not None:
            json_data["image"] = f"{da['id']}.png"
            da["image"].save(os.path.join(image_folder, json_data["image"]))
        json_data["conversations"] = da["conversations"]
        converted_data.append(json_data)

    with open(os.path.join(json_base, '{}.json'.format(dataset_name)), "w") as f:
        json.dump(converted_data, f, indent=4, ensure_ascii=False)
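A quick sanity check after conversion can catch records whose image was never written. The sketch below is illustrative, not part of the commit; it reuses the json_base and image_base paths from the script and assumes the converted file already exists.

# Sanity-check sketch: reload one converted annotation file and count missing images.
import json
import os

dataset_name = "sharegpt4v(llava)"
json_base = "/mediaPFM/data/haozhe/onevision/llava_instruct"
image_base = "/mediaPFM/data/haozhe/onevision/llava_data"

with open(os.path.join(json_base, f"{dataset_name}.json")) as f:
    records = json.load(f)

missing = [r["image"] for r in records
           if "image" in r and not os.path.exists(os.path.join(image_base, dataset_name, r["image"]))]
print(f"{len(records)} records, {len(missing)} missing image files")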
