 Supported models:
 - Qwen2.5-VL, Qwen3-VL series
 - Kimi VL series
+- GLM VL series

 Provides functions to:
 1. Parse prompts with media tags (<image>/<video>)
 4. Construct model-compatible message formats

 Note:
-    Only processors with class names containing both ("Qwen" OR "Kimi") AND "Processor" are supported.
+    Only processors with class names containing ("Qwen", "Kimi" OR "Glm") AND "Processor" are supported.
     Relies on `qwen_vl_utils.process_vision_info` for media extraction.
 """
 import re
 from typing import Any, Dict, List, Union


+def is_qwen_like_processor(processor: Any) -> bool:
+    # Class-name heuristic: one of the supported family tags must appear
+    # before "Processor" (e.g. "Qwen2_5_VLProcessor", "Glm4vProcessor").
+    return re.search(r"(Qwen|Kimi|Glm).*Processor", processor.__class__.__name__) is not None
+
+
 def build_multi_modal_data(
     processor: Any,
     messages: List[Dict],
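
For reference, a quick sanity check of the new helper's regex (a sketch; the class names below are illustrative of the three supported families, not a verified list):

import re

pattern = re.compile(r"(Qwen|Kimi|Glm).*Processor")

# The family tag must appear before "Processor" in the class name.
assert pattern.search("Qwen2_5_VLProcessor") is not None
assert pattern.search("KimiVLProcessor") is not None
assert pattern.search("Glm4vProcessor") is not None
assert pattern.search("ProcessorForQwen") is None  # wrong order, rejected
assert pattern.search("LlavaProcessor") is None    # unsupported family
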
@@ -29,7 +34,7 @@ def build_multi_modal_data(

     Args:
         processor: Vision-language processor instance (must have class name containing
-            ("Qwen" OR "Kimi") AND "Processor").
+            ("Qwen", "Kimi" OR "Glm") AND "Processor").
         messages: List of conversation messages in model-expected format. Each message's "content"
             may be a string or list of content items (text/image/video dictionaries).

@@ -49,9 +54,7 @@ def build_multi_modal_data(
4954 {"image": [processed_image]}
5055 """
5156 processor_class_name = processor .__class__ .__name__
52- if (
53- "Qwen" in processor_class_name or "Kimi" in processor_class_name
54- ) and "Processor" in processor_class_name :
57+ if is_qwen_like_processor (processor ):
5558 from qwen_vl_utils import process_vision_info
5659
5760 image_inputs , video_inputs = process_vision_info (messages )
@@ -63,7 +66,7 @@ def build_multi_modal_data(

         return multi_modal_data
     raise NotImplementedError(
-        f"Processor '{processor_class_name}' not supported. Only Qwen/Kimi VL processors are supported."
+        f"Processor '{processor_class_name}' not supported. Only Qwen/Kimi/Glm VL processors are supported."
     )

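
A minimal usage sketch for the function above, assuming `transformers` and `qwen-vl-utils` are installed; the checkpoint id and image URI are placeholders:

from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "file:///path/to/demo.jpg"},
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

# For an image-only conversation this returns {"image": [processed_image]},
# matching the docstring example.
multi_modal_data = build_multi_modal_data(processor, messages)
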
6972
@@ -77,7 +80,7 @@ def build_mm_input_for_training(
7780
7881 Args:
7982 processor: Vision-language processor instance (must have class name containing
80- ("Qwen" OR "Kimi ") AND "Processor").
83+ ("Qwen", "Kimi" OR "Glm ") AND "Processor").
8184 prompt: Plain text prompt WITHOUT media tags (e.g., "Describe this image").
8285 Media placement is handled via `multi_modal_data`, not prompt tags.
8386 multi_modal_data: Dictionary from `build_multi_modal_data()` containing:
@@ -100,9 +103,7 @@ def build_mm_input_for_training(
         through the structured `multi_modal_data` dictionary.
     """
     processor_class_name = processor.__class__.__name__
-    if (
-        "Qwen" in processor_class_name or "Kimi" in processor_class_name
-    ) and "Processor" in processor_class_name:
+    if is_qwen_like_processor(processor):
         inputs = processor(
             text=[prompt],
             images=multi_modal_data.get("image", None),
@@ -112,7 +113,7 @@ def build_mm_input_for_training(
         )
         return dict(inputs)
     raise NotImplementedError(
-        f"Processor '{processor_class_name}' not supported. Only Qwen/Kimi VL processors are supported."
+        f"Processor '{processor_class_name}' not supported. Only Qwen/Kimi/Glm VL processors are supported."
     )


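Continuing the sketch for the training path (same placeholder checkpoint and `messages` as in the earlier snippet; passing the arguments positionally in the order shown in the Args section is an assumption about the full signature):

# Render the conversation to a plain prompt string. For Qwen-style chat
# templates this inserts the model's internal vision placeholder tokens,
# which the processor call inside build_mm_input_for_training expands;
# no <image>/<video> tags are needed in the prompt itself.
prompt = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

inputs = build_mm_input_for_training(processor, prompt, multi_modal_data)
# `inputs` is a plain dict, e.g. input_ids / attention_mask / pixel_values.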