|
400 | 400 | "trust_remote_code": True, |
401 | 401 | } |
402 | 402 |
|
403 | | - # budget for image processor, since the compression ratio is 32 for Qwen3-VL, we can set the number of visual tokens of a single image to 256-1280 |
404 | | - # processor.image_processor.size = { |
405 | | - # "longest_edge": VLM_MAX_IMAGE_SIZE, |
406 | | - # "shortest_edge": VLM_MIN_IMAGE_SIZE, |
407 | | - # } |
| 403 | + if quantization_config is not None: |
| 404 | + load_kwargs["quantization_config"] = quantization_config |
| 405 | + else: |
| 406 | + load_kwargs["dtype"] = "auto" |
| 407 | + model = Qwen3VLMoeForConditionalGeneration.from_pretrained( |
| 408 | + MODEL_ID, **load_kwargs |
| 409 | + ).eval() |
| 410 | + |
| 411 | + model_default_prompt = """Read all the text in the image.""" |
| 412 | + model_default_do_sample = False |
| 413 | + model_default_top_p = 0.8 |
| 414 | + model_default_min_p = 0.0 |
| 415 | + model_default_top_k = 20 |
| 416 | + model_default_temperature = 0.7 |
| 417 | + model_default_repetition_penalty = 1.0 |
| 418 | + model_default_presence_penalty = 1.5 |
| 419 | + model_default_max_new_tokens = MAX_NEW_TOKENS |
| 420 | + model_supports_presence_penalty = ( |
False # NOTE: presence_penalty appears to have no effect when generating via transformers — verify against GenerationConfig support
| 422 | + ) |
| 423 | + |
| 424 | + elif SELECTED_MODEL == "Qwen3-VL-235B-A22B-Instruct": |
| 425 | + MODEL_ID = "Qwen/Qwen3-VL-235B-A22B-Instruct" |
| 426 | + from transformers import Qwen3VLMoeForConditionalGeneration |
| 427 | + |
| 428 | + processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True) |
| 429 | + load_kwargs = { |
| 430 | + "attn_implementation": attn_implementation, |
| 431 | + "device_map": "auto", |
| 432 | + "trust_remote_code": True, |
| 433 | + } |
408 | 434 |
|
409 | 435 | if quantization_config is not None: |
410 | 436 | load_kwargs["quantization_config"] = quantization_config |
|
0 commit comments