|
400 | 400 | "trust_remote_code": True, |
401 | 401 | } |
402 | 402 |
|
403 | | - # budget for image processor, since the compression ratio is 32 for Qwen3-VL, we can set the number of visual tokens of a single image to 256-1280 |
404 | | - # processor.image_processor.size = { |
405 | | - # "longest_edge": VLM_MAX_IMAGE_SIZE, |
406 | | - # "shortest_edge": VLM_MIN_IMAGE_SIZE, |
407 | | - # } |
| 403 | + if quantization_config is not None: |
| 404 | + load_kwargs["quantization_config"] = quantization_config |
| 405 | + else: |
| 406 | + load_kwargs["dtype"] = "auto" |
| 407 | + model = Qwen3VLMoeForConditionalGeneration.from_pretrained( |
| 408 | + MODEL_ID, **load_kwargs |
| 409 | + ).eval() |
| 410 | + |
| 411 | + model_default_prompt = """Read all the text in the image.""" |
| 412 | + model_default_do_sample = False |
| 413 | + model_default_top_p = 0.8 |
| 414 | + model_default_min_p = 0.0 |
| 415 | + model_default_top_k = 20 |
| 416 | + model_default_temperature = 0.7 |
| 417 | + model_default_repetition_penalty = 1.0 |
| 418 | + model_default_presence_penalty = 1.5 |
| 419 | + model_default_max_new_tokens = MAX_NEW_TOKENS |
| 420 | + model_supports_presence_penalty = ( |
False # NOTE: presence_penalty appears to have no effect when generating via transformers — verify against GenerationConfig support
| 422 | + ) |
| 423 | + |
| 424 | + elif SELECTED_MODEL == "Qwen3-VL-235B-A22B-Instruct": |
| 425 | + MODEL_ID = "Qwen/Qwen3-VL-235B-A22B-Instruct" |
| 426 | + from transformers import Qwen3VLMoeForConditionalGeneration |
| 427 | + |
| 428 | + processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True) |
| 429 | + load_kwargs = { |
| 430 | + "attn_implementation": attn_implementation, |
| 431 | + "device_map": "auto", |
| 432 | + "trust_remote_code": True, |
| 433 | + } |
408 | 434 |
|
409 | 435 | if quantization_config is not None: |
410 | 436 | load_kwargs["quantization_config"] = quantization_config |
|
0 commit comments