import os
import sys
sys.path[0] = os.path.dirname(sys.path[0])
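# The line above replaces sys.path[0] (this script's directory) with its parent, so the
# `llava` package at the repository root can be imported when this file is run directly.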

# from llava.model.builder import load_pretrained_model
# from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
# from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX
# from llava.conversation import conv_templates, SeparatorStyle

# from PIL import Image
# import requests
# import copy
# import torch

# import sys
# import warnings



# warnings.filterwarnings("ignore")
# pretrained = "lmms-lab/llava-onevision-qwen2-0.5b-si"
# model_name = "llava_qwen"
# device = "cuda"
# device_map = "auto"
# tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, device_map=device_map)  # Add anything else you want to pass via llava_model_args

# model.eval()

# url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true"
# image = Image.open(requests.get(url, stream=True).raw)
# image_tensor = process_images([image], image_processor, model.config)
# image_tensor = [_image.to(dtype=torch.float16, device=device) for _image in image_tensor]

# conv_template = "qwen_1_5"  # Make sure you use the correct chat template for your model
# question = DEFAULT_IMAGE_TOKEN + "\nWhat is shown in this image?"
# conv = copy.deepcopy(conv_templates[conv_template])
# conv.append_message(conv.roles[0], question)
# conv.append_message(conv.roles[1], None)
# prompt_question = conv.get_prompt()

# input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
# image_sizes = [image.size]


# cont = model.generate(
#     input_ids,
#     images=image_tensor,
#     image_sizes=image_sizes,
#     do_sample=False,
#     temperature=0,
#     max_new_tokens=4096,
# )
# text_outputs = tokenizer.batch_decode(cont, skip_special_tokens=True)
# print(text_outputs)




from operator import attrgetter
from llava.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX
from llava.conversation import conv_templates, SeparatorStyle

import torch
import cv2
import numpy as np
from PIL import Image
import requests
import copy
import warnings
from decord import VideoReader, cpu
import transformers
import ast
import re

from llava.train.train import ModelArguments, DataArguments, TrainingArguments, EK100EvalArguments, LazySupervisedDataset

parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments, EK100EvalArguments))
model_args, data_args, training_args, eval_args = parser.parse_args_into_dataclasses()
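# Parse command-line arguments into the same dataclasses used by llava.train.train,
# so this script accepts the usual model/data/training/eval flags.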


os.environ["HF_HOME"] = "huggingface"

warnings.filterwarnings("ignore")
# Load the OneVision model
pretrained = "lmms-lab/llava-onevision-qwen2-0.5b-ov"
# pretrained = "/mnt/SV_storage/VFM/LLaVA-NeXT/experiments/EK100_quick_config"
model_base = None
model_name = "llava_qwen"

# pretrained = "/mnt/SV_storage/VFM/LLaVA-NeXT/experiments/EK100_lora_quick_check"
# model_base = "/mnt/SV_storage/VFM/huggingface/hub/models--lmms-lab--llava-onevision-qwen2-0.5b-ov/snapshots/381d9947148efb1e58a577f451c05705ceec666e"
# model_name = "lora_llava_qwen"
device = "cuda"
device_map = "auto"
# tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, model_base, model_name, device_map=device_map, attn_implementation="sdpa")
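# overwrite_config injects the EK100-specific fields (vision-supervision mode and action
# types) into the loaded model's config; attn_implementation="flash_attention_2" assumes
# the flash-attn package is installed (the commented "sdpa" variant above avoids that dependency).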
overwrite_config = {}
if model_args.vision_supervision is not None:
    overwrite_config["vision_supervision"] = model_args.vision_supervision
    overwrite_config["action_types"] = model_args.action_types
tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, model_base, model_name,
    device_map=device_map, attn_implementation="flash_attention_2", overwrite_config=overwrite_config)
# model.eval()


vision_tower = model.get_vision_tower()
data_args.image_processor = vision_tower.image_processor
data_args.is_multimodal = True
data_args.mm_use_im_start_end = False
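# Expand grid-pinpoint strings of the form "(AxB),...,(CxD)" into pixel-level [width, height]
# pairs by multiplying each grid cell count by the image processor's patch size, e.g.
# "(1x1),...,(6x6)" with a 384-px processor becomes [[384, 384], ..., [2304, 2304]].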
if data_args.image_grid_pinpoints is not None:
    if isinstance(data_args.image_grid_pinpoints, str) and "x" in data_args.image_grid_pinpoints:
        try:
            patch_size = data_args.image_processor.size[0]
        except Exception as e:
            patch_size = data_args.image_processor.size["shortest_edge"]

        assert patch_size in [224, 336, 384, 448, 512], "patch_size should be in [224, 336, 384, 448, 512]"
        # Use regex to extract the range from the input string
        matches = re.findall(r"\((\d+)x(\d+)\)", data_args.image_grid_pinpoints)
        range_start = tuple(map(int, matches[0]))
        range_end = tuple(map(int, matches[-1]))
        # Generate a matrix of tuples from (range_start[0], range_start[1]) to (range_end[0], range_end[1])
        grid_pinpoints = [(i, j) for i in range(range_start[0], range_end[0] + 1) for j in range(range_start[1], range_end[1] + 1)]
        # Multiply all elements by patch_size
        data_args.image_grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints]
    elif isinstance(data_args.image_grid_pinpoints, str):
        data_args.image_grid_pinpoints = ast.literal_eval(data_args.image_grid_pinpoints)
train_dataset = LazySupervisedDataset(tokenizer=tokenizer, data_path=data_args.data_path, data_args=data_args, eval_args=eval_args)

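# Pull a single sample to exercise the full pipeline. Each item holds the tokenized
# conversation (input_ids / labels) plus, under "image", a per-video tuple from which the
# frame tensor [0], the original frame sizes [1], and the action labels [3] are used below.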
data = train_dataset[0]

input_ids = data["input_ids"].unsqueeze(0).to(device)
labels = data["labels"].unsqueeze(0).to(device)
images = [data["image"][0][0].half().to(device)]
image_sizes = [data["image"][0][1]]
actions = torch.stack([data["image"][0][3].to(device)])
attention_mask = torch.ones_like(input_ids).bool().to(device)
modalities = ["video"]

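# Single forward pass (not generation): with `labels` supplied the model also computes the
# training loss, which makes this a quick end-to-end sanity check of the data pipeline.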
cont = model(
    input_ids=input_ids,
    attention_mask=attention_mask,
    images=images,
    image_sizes=image_sizes,
    modalities=modalities,
    labels=labels,
    actions=actions,
)
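# NOTE: assuming the EK100-modified model keeps the standard HF causal-LM output class,
# the loss and logits can be inspected here via `cont.loss` / `cont.logits`.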

aa = 2  # no-op placeholder (e.g. a convenient line for a debugger breakpoint)


# # Function to extract frames from video
# def load_video(video_path, max_frames_num):
#     if type(video_path) == str:
#         vr = VideoReader(video_path, ctx=cpu(0))
#     else:
#         vr = VideoReader(video_path[0], ctx=cpu(0))
#     total_frame_num = len(vr)
#     uniform_sampled_frames = np.linspace(0, total_frame_num - 1, max_frames_num, dtype=int)
#     frame_idx = uniform_sampled_frames.tolist()
#     spare_frames = vr.get_batch(frame_idx).asnumpy()
#     return spare_frames  # (frames, height, width, channels)


# # Load and process video
# video_path = "docs/jobs.mp4"
# video_frames = load_video(video_path, 16)
# print(video_frames.shape)  # (16, 1024, 576, 3)
# image_tensors = []
# frames = image_processor.preprocess(video_frames, return_tensors="pt")["pixel_values"].half().cuda()
# image_tensors.append(frames)

# # Prepare conversation input
# conv_template = "qwen_1_5"
# question = f"{DEFAULT_IMAGE_TOKEN}\nDescribe what's happening in this video."

# conv = copy.deepcopy(conv_templates[conv_template])
# conv.append_message(conv.roles[0], question)
# conv.append_message(conv.roles[1], None)
# prompt_question = conv.get_prompt()

# input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
# image_sizes = [frame.size for frame in video_frames]

# # Generate response
# cont = model.generate(
#     input_ids,
#     images=image_tensors,
#     image_sizes=image_sizes,
#     do_sample=False,
#     temperature=0,
#     max_new_tokens=4096,
#     modalities=["video"],
# )
# text_outputs = tokenizer.batch_decode(cont, skip_special_tokens=True)
# print(text_outputs[0])