Commit 9356ac9

Merge pull request #3 from yeshaokai/shaokai/dev
Shaokai/dev
2 parents 2e1d3a9 + dfa22ed commit 9356ac9

File tree

70 files changed (+11039, -1270 lines)


.gitignore

Lines changed: 3 additions & 1 deletion
@@ -74,4 +74,6 @@ data_processing/
 
 experiments/
 *.out
-pretrained_models/
+pretrained_models/
+
+huggingface/

.vscode/launch.json

Lines changed: 407 additions & 46 deletions
Large diffs are not rendered by default.

docs/LLaVA_OneVision_Tutorials.ipynb

Lines changed: 9 additions & 2 deletions
@@ -60,7 +60,11 @@
 "model_name = \"llava_qwen\"\n",
 "device = \"cuda\"\n",
 "device_map = \"auto\"\n",
-"tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, device_map=device_map) # Add any other thing you want to pass in llava_model_args\n",
+"llava_model_args = {\n",
+"    \"multimodal\": True,\n",
+"    \"attn_implementation\": \"sdpa\",\n",
+"}\n",
+"tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, device_map=device_map, **llava_model_args) # Add any other thing you want to pass in llava_model_args\n",
 "\n",
 "model.eval()\n",
 "\n",
@@ -322,7 +326,10 @@
 "model_name = \"llava_qwen\"\n",
 "device = \"cuda\"\n",
 "device_map = \"auto\"\n",
-"tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, device_map=device_map, attn_implementation=\"sdpa\")\n",
+"llava_model_args = {\n",
+"    \"multimodal\": True,\n",
+"}\n",
+"tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, device_map=device_map, attn_implementation=\"sdpa\", **llava_model_args)\n",
 "\n",
 "model.eval()\n",
 "\n",

docs/LLaVA_OneVision_Tutorials.py

Lines changed: 1 addition & 1 deletion
@@ -75,7 +75,7 @@
 
 warnings.filterwarnings("ignore")
 # Load the OneVision model
-pretrained = "lmms-lab/LLaVA-Video-72B-Qwen2"
+pretrained = "lmms-lab/llava-onevision-qwen2-0.5b-ov"
 # pretrained = "/mnt/SV_storage/VFM/LLaVA-NeXT/experiments/EK100_quick_config"
 model_base = None
 model_name = "llava_qwen"

docs/LLaVA_OneVision_debug.py

Lines changed: 198 additions & 0 deletions
New file (all 198 lines added):

import os
import sys
sys.path[0] = os.path.dirname(sys.path[0])

# from llava.model.builder import load_pretrained_model
# from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
# from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX
# from llava.conversation import conv_templates, SeparatorStyle

# from PIL import Image
# import requests
# import copy
# import torch

# import sys
# import warnings


# warnings.filterwarnings("ignore")
# pretrained = "lmms-lab/llava-onevision-qwen2-0.5b-si"
# model_name = "llava_qwen"
# device = "cuda"
# device_map = "auto"
# tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, device_map=device_map)  # Add any other thing you want to pass in llava_model_args

# model.eval()

# url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true"
# image = Image.open(requests.get(url, stream=True).raw)
# image_tensor = process_images([image], image_processor, model.config)
# image_tensor = [_image.to(dtype=torch.float16, device=device) for _image in image_tensor]

# conv_template = "qwen_1_5"  # Make sure you use correct chat template for different models
# question = DEFAULT_IMAGE_TOKEN + "\nWhat is shown in this image?"
# conv = copy.deepcopy(conv_templates[conv_template])
# conv.append_message(conv.roles[0], question)
# conv.append_message(conv.roles[1], None)
# prompt_question = conv.get_prompt()

# input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
# image_sizes = [image.size]


# cont = model.generate(
#     input_ids,
#     images=image_tensor,
#     image_sizes=image_sizes,
#     do_sample=False,
#     temperature=0,
#     max_new_tokens=4096,
# )
# text_outputs = tokenizer.batch_decode(cont, skip_special_tokens=True)
# print(text_outputs)


from operator import attrgetter
from llava.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX
from llava.conversation import conv_templates, SeparatorStyle

import torch
import cv2
import numpy as np
from PIL import Image
import requests
import copy
import warnings
from decord import VideoReader, cpu
import transformers
import ast
import re

from llava.train.train import ModelArguments, DataArguments, TrainingArguments, EK100EvalArguments, LazySupervisedDataset

parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments, EK100EvalArguments))
model_args, data_args, training_args, eval_args = parser.parse_args_into_dataclasses()


os.environ["HF_HOME"] = "huggingface"

warnings.filterwarnings("ignore")
# Load the OneVision model
pretrained = "lmms-lab/llava-onevision-qwen2-0.5b-ov"
# pretrained = "/mnt/SV_storage/VFM/LLaVA-NeXT/experiments/EK100_quick_config"
model_base = None
model_name = "llava_qwen"

# pretrained = "/mnt/SV_storage/VFM/LLaVA-NeXT/experiments/EK100_lora_quick_check"
# model_base = "/mnt/SV_storage/VFM/huggingface/hub/models--lmms-lab--llava-onevision-qwen2-0.5b-ov/snapshots/381d9947148efb1e58a577f451c05705ceec666e"
# model_name = "lora_llava_qwen"
device = "cuda"
device_map = "auto"
# tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, model_base, model_name, device_map=device_map, attn_implementation="sdpa")
overwrite_config = {}
if model_args.vision_supervision is not None:
    overwrite_config["vision_supervision"] = model_args.vision_supervision
    overwrite_config["action_types"] = model_args.action_types
tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, model_base, model_name,
                                                                      device_map=device_map, attn_implementation="flash_attention_2", overwrite_config=overwrite_config)
# model.eval()


vision_tower = model.get_vision_tower()
data_args.image_processor = vision_tower.image_processor
data_args.is_multimodal = True
data_args.mm_use_im_start_end = False
if data_args.image_grid_pinpoints is not None:
    if isinstance(data_args.image_grid_pinpoints, str) and "x" in data_args.image_grid_pinpoints:
        try:
            patch_size = data_args.image_processor.size[0]
        except Exception as e:
            patch_size = data_args.image_processor.size["shortest_edge"]

        assert patch_size in [224, 336, 384, 448, 512], "patch_size should be in [224, 336, 384, 448, 512]"
        # Use regex to extract the range from the input string
        matches = re.findall(r"\((\d+)x(\d+)\)", data_args.image_grid_pinpoints)
        range_start = tuple(map(int, matches[0]))
        range_end = tuple(map(int, matches[-1]))
        # Generate a matrix of tuples from (range_start[0], range_start[1]) to (range_end[0], range_end[1])
        grid_pinpoints = [(i, j) for i in range(range_start[0], range_end[0] + 1) for j in range(range_start[1], range_end[1] + 1)]
        # Multiply all elements by patch_size
        data_args.image_grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints]
    elif isinstance(data_args.image_grid_pinpoints, str):
        data_args.image_grid_pinpoints = ast.literal_eval(data_args.image_grid_pinpoints)
train_dataset = LazySupervisedDataset(tokenizer=tokenizer, data_path=data_args.data_path, data_args=data_args, eval_args=eval_args)

data = train_dataset[0]

input_ids = data["input_ids"].unsqueeze(0).to(device)
labels = data["labels"].unsqueeze(0).to(device)
images = [data["image"][0][0].half().to(device)]
image_sizes = [data["image"][0][1]]
actions = torch.stack([data["image"][0][3].to(device)])
attention_mask = torch.ones_like(input_ids).bool().to(device)
modalities = ["video"]

cont = model(
    input_ids=input_ids,
    attention_mask=attention_mask,
    images=images,
    image_sizes=image_sizes,
    modalities=modalities,
    labels=labels,
    actions=actions,
)

aa = 2


# # Function to extract frames from video
# def load_video(video_path, max_frames_num):
#     if type(video_path) == str:
#         vr = VideoReader(video_path, ctx=cpu(0))
#     else:
#         vr = VideoReader(video_path[0], ctx=cpu(0))
#     total_frame_num = len(vr)
#     uniform_sampled_frames = np.linspace(0, total_frame_num - 1, max_frames_num, dtype=int)
#     frame_idx = uniform_sampled_frames.tolist()
#     spare_frames = vr.get_batch(frame_idx).asnumpy()
#     return spare_frames  # (frames, height, width, channels)


# # Load and process video
# video_path = "docs/jobs.mp4"
# video_frames = load_video(video_path, 16)
# print(video_frames.shape)  # (16, 1024, 576, 3)
# image_tensors = []
# frames = image_processor.preprocess(video_frames, return_tensors="pt")["pixel_values"].half().cuda()
# image_tensors.append(frames)

# # Prepare conversation input
# conv_template = "qwen_1_5"
# question = f"{DEFAULT_IMAGE_TOKEN}\nDescribe what's happening in this video."

# conv = copy.deepcopy(conv_templates[conv_template])
# conv.append_message(conv.roles[0], question)
# conv.append_message(conv.roles[1], None)
# prompt_question = conv.get_prompt()

# input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
# image_sizes = [frame.size for frame in video_frames]

# # Generate response
# cont = model.generate(
#     input_ids,
#     images=image_tensors,
#     image_sizes=image_sizes,
#     do_sample=False,
#     temperature=0,
#     max_new_tokens=4096,
#     modalities=["video"],
# )
# text_outputs = tokenizer.batch_decode(cont, skip_special_tokens=True)
# print(text_outputs[0])
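
The new debug script collects its configuration through transformers.HfArgumentParser, so it expects the dataclass fields as command-line options. The sketch below shows one hypothetical way to drive it programmatically; the values are placeholders, and --data_path and --vision_supervision are assumed to map to the DataArguments/ModelArguments fields the script reads (their definitions are not part of this commit excerpt).

import transformers
from llava.train.train import ModelArguments, DataArguments, TrainingArguments, EK100EvalArguments

parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments, EK100EvalArguments))
# parse_args_into_dataclasses accepts an explicit argv-style list, which is handy in notebooks or debuggers.
model_args, data_args, training_args, eval_args = parser.parse_args_into_dataclasses(args=[
    "--output_dir", "experiments/debug",        # placeholder TrainingArguments destination
    "--data_path", "/path/to/ek100_data.yaml",  # placeholder annotation file for LazySupervisedDataset
    "--vision_supervision", "action",           # placeholder value; the script only checks that it is set
])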

docs/LLaVA_Video_1003.md

Lines changed: 1 addition & 1 deletion
@@ -84,7 +84,7 @@ print(text_outputs)
 
 ## Training
 
-[[Scripts]](/Users/zhangyuanhan/Desktop/LLaVA-NeXT/scripts/video/train): Start training models on your single-image/multi-image/video data.
+[[Scripts]](https://github.com/LLaVA-VL/LLaVA-NeXT/blob/yhzhang/video_dev/scripts/video/train/SO400M_Qwen2_72B_ov_to_video_am9_aug6.sh): Start training models on your single-image/multi-image/video data.
 
 
 ## Evaluation Guidance

docs/download_data.py

Lines changed: 32 additions & 0 deletions
@@ -1,9 +1,41 @@
 import os
+os.environ["HF_HOME"] = "/mnt/SV_storage/VFM/huggingface"
 from datasets import load_dataset
+from datasets import get_dataset_config_names, get_dataset_split_names
 from tqdm import tqdm
 import json
 import yaml
 
+dataset_name = "lmms-lab/LLaVA-Video-178K"
+
+save_root = "/mnt/SV_storage/VFM/onevision/llava_video_178k"
+
+subsets = get_dataset_config_names(dataset_name)
+for subset in subsets:
+    # download the dataset
+    data = load_dataset(dataset_name, subset)
+    for da in tqdm(data):
+        json_data = {}
+        json_data["id"] = da["id"]
+        json_data["video"] = da["video"]
+        json_data["conversations"] = da["conversations"]
+        with open(os.path.join(save_root, '{}.json'.format(da["id"])), "w") as f:
+            json.dump(json_data, f, indent=4, ensure_ascii=False)
+    aa = 1
+
+# splits = get_dataset_split_names(dataset_name, subset)
+
+
+# aa = 1
+
+
+# data = load_dataset("lmms-lab/LLaVA-Video-178K", '0_30_s_academic_v0_1', split="caption")
+
+# for da in tqdm(data):
+#     json_data = {}
+#     json_data["id"] = da["id"]
+#     aa = 2
+
 avaliable_datasets = ['CLEVR-Math(MathV360K)', 'FigureQA(MathV360K)', 'GEOS(MathV360K)', 'GeoQA+(MathV360K)',
                       'Geometry3K(MathV360K)', 'IconQA(MathV360K)', 'MapQA(MathV360K)', 'PMC-VQA(MathV360K)',
                       'Super-CLEVR(MathV360K)', 'TabMWP(MathV360K)', 'UniGeo(MathV360K)', 'VisualWebInstruct(filtered)',
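
One note on the new download loop: load_dataset(dataset_name, subset) without a split argument returns a DatasetDict, so iterating it with "for da in tqdm(data)" yields split names rather than examples. A minimal sketch of the per-example export the script appears to intend is below; it reuses the names from the diff (data, save_root, json, tqdm), and the split handling is an assumption based on the commented-out split="caption" example.

# Sketch only: iterate each split of the DatasetDict so that da is a row dict.
for split in data:                      # DatasetDict keys are split names
    for da in tqdm(data[split]):        # rows expose "id", "video", "conversations" per the diff
        json_data = {
            "id": da["id"],
            "video": da["video"],
            "conversations": da["conversations"],
        }
        with open(os.path.join(save_root, "{}.json".format(da["id"])), "w") as f:
            json.dump(json_data, f, indent=4, ensure_ascii=False)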
