|
176 | 176 | "model_name = \"llava_qwen\"\n", |
177 | 177 | "device = \"cuda\"\n", |
178 | 178 | "device_map = \"auto\"\n", |
179 | | - "tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, device_map=device_map)\n", |
| 179 | + "llava_model_args = {\n", |
| 180 | + " \"multimodal\": True,\n", |
| 181 | + " }\n", |
| 182 | + "overwrite_config = {}\n", |
| 183 | + "overwrite_config[\"image_aspect_ratio\"] = \"pad\"\n", |
| 184 | + "llava_model_args[\"overwrite_config\"] = overwrite_config\n", |
| 185 | + "tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, device_map=device_map, **llava_model_args)\n", |
180 | 186 | "\n", |
181 | 187 | "model.eval()\n", |
182 | 188 | "\n", |
|
299 | 305 | " do_sample=False,\n", |
300 | 306 | " temperature=0,\n", |
301 | 307 | " max_new_tokens=4096,\n", |
| 308 | + " modalities=[\"video\"],\n", |
302 | 309 | ")\n", |
303 | 310 | "text_outputs = tokenizer.batch_decode(cont, skip_special_tokens=True)\n", |
304 | 311 | "print(text_outputs[0])" |
|
307 | 314 | ], |
308 | 315 | "metadata": { |
309 | 316 | "kernelspec": { |
310 | | - "display_name": "llava", |
| 317 | + "display_name": "Python 3.9.2 64-bit", |
311 | 318 | "language": "python", |
312 | 319 | "name": "python3" |
313 | 320 | }, |
|
321 | 328 | "name": "python", |
322 | 329 | "nbconvert_exporter": "python", |
323 | 330 | "pygments_lexer": "ipython3", |
324 | | - "version": "3.10.14" |
| 331 | + "version": "3.9.2" |
| 332 | + }, |
| 333 | + "vscode": { |
| 334 | + "interpreter": { |
| 335 | + "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" |
| 336 | + } |
325 | 337 | } |
326 | 338 | }, |
327 | 339 | "nbformat": 4, |
|
0 commit comments