2 changes: 1 addition & 1 deletion llmc/compression/quantization/awq.py
@@ -156,7 +156,7 @@ def search_scale_subset(self, layers_dict, input, inspect_module, subset_kwargs)
 
         # Identify the rank with the minimum loss
         global_best_rank = torch.tensor([dist.get_rank()
-                                         if best_error == global_best_error
+                                         if abs(best_error - global_best_error) < 1e-5
                                          else -1],
                                         device='cuda')
        dist.all_reduce(global_best_rank, op=dist.ReduceOp.MAX)
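Why the tolerance comparison: `global_best_error` is presumably the per-rank `best_error` after an all-reduce minimum (the diff context does not show that line), and floating-point reductions are not guaranteed to reproduce a locally computed value bit-for-bit. With exact `==`, it is possible that no rank matches and `global_best_rank` stays at -1 on every rank. A minimal illustration of the failure mode, plain Python with no distributed setup, using the same 1e-5 threshold as the patch:

    # Floating-point addition is not associative, so a reduced value can
    # differ from a locally computed one in the last few bits.
    a = (0.1 + 0.2) + 0.3   # 0.6000000000000001
    b = 0.1 + (0.2 + 0.3)   # 0.6
    print(a == b)              # False: exact equality misses the match
    print(abs(a - b) < 1e-5)   # True: the tolerance check used in the patch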
2 changes: 1 addition & 1 deletion llmc/models/mllama.py
@@ -7,7 +7,7 @@
 except Exception:
     logger.warning(
         'Can not import MllamaForConditionalGeneration. '
-        'Please upgrade transformers.'
+        'If you need it, please upgrade transformers.'
     )
 
 from llmc.utils.registry_factory import MODEL_REGISTRY
43 changes: 30 additions & 13 deletions llmc/models/qwen2vl.py
@@ -6,7 +6,15 @@
 except Exception:
     logger.warning(
         'Can not import Qwen2VLForConditionalGeneration. '
-        'Please upgrade transformers.'
+        'If you need it, please upgrade transformers.'
     )
 
+try:
+    from qwen_vl_utils import process_vision_info
+except Exception:
+    logger.warning(
+        'Can not import qwen_vl_utils. '
+        'If you need it, please pip install qwen-vl-utils'
+    )
+
 from llmc.utils.registry_factory import MODEL_REGISTRY
@@ -40,24 +48,33 @@ def build_model(self):
         self.model = self.vlm_model
         self.model_config = self.vlm_model_config
 
+        self.min_pixels = 256 * 28 * 28
+        self.max_pixels = 1280 * 28 * 28
+        logger.warning(f'min_pixels is set to: {self.min_pixels}')
+        logger.warning(f'max_pixels is set to: {self.max_pixels}')
+        logger.warning('You can refer the link https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct '
+                       'to get more info of image Resolution for performance boost.')
+        self.processor = AutoProcessor.from_pretrained(
+            self.model_path,
+            min_pixels=self.min_pixels,
+            max_pixels=self.max_pixels
+        )
 
     def batch_process(self, img_qas):
-        from qwen_vl_utils import process_vision_info
-        processor = AutoProcessor.from_pretrained(self.model_path)
         messages = []
         for idx in range(len(img_qas)):
             img_path = img_qas[idx]['img']
             if img_path is not None:
+                content = []
+                if not isinstance(img_path, list):
+                    img_path = [img_path]
+                for img_idx in range(len(img_path)):
+                    content.append({'type': 'image', 'image': img_path[img_idx]})
+                content.append({'type': 'text', 'text': img_qas[idx]['question']})
                 message = [
                     {
                         'role': 'user',
-                        'content': [
-                            {
-                                'type': 'image', 'image': img_path,
-                                'resized_height': 280, 'resized_width': 420
-                                # default: original resolution
-                            },
-                            {'type': 'text', 'text': img_qas[idx]['question']}
-                        ]
+                        'content': content
                     }
                 ]
             else:
@@ -71,11 +88,11 @@ def batch_process(self, img_qas):
             ]
             messages.append(message)
         texts = [
-            processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
+            self.processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
             for msg in messages
         ]
         image_inputs, video_inputs = process_vision_info(messages)
-        inputs = processor(
+        inputs = self.processor(
             text=texts,
             images=image_inputs,
             videos=video_inputs,
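Net effect of the qwen2vl.py changes: the processor is now built once in `build_model` with explicit pixel bounds (256 * 28 * 28 = 200704 to 1280 * 28 * 28 = 1003520) instead of being re-created on every `batch_process` call, images keep their native resolution within those bounds rather than being forced to 280x420, and the `'img'` field of each sample may now be a list of paths as well as a single path or None. A sketch of the input shapes the new code accepts; the file names here are made up for illustration:

    # Hypothetical img_qas payloads accepted after this change:
    img_qas = [
        {'img': 'examples/cat.jpg',                   # single image
         'question': 'What is in this picture?'},
        {'img': ['examples/a.jpg', 'examples/b.jpg'],  # multiple images in one turn
         'question': 'What differs between these two images?'},
        {'img': None,                                  # text-only question
         'question': 'Describe a typical street scene.'},
    ]
    # inputs = model.batch_process(img_qas)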
1 change: 1 addition & 0 deletions requirements/runtime.txt
@@ -27,3 +27,4 @@ word2number
 more_itertools
 qtorch
 einops
+qwen-vl-utils