
Commit c038265
update vispruner (#425)
1 parent aa80886

9 files changed: +298 −31 lines
Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+base:
+    seed: &seed 42
+model:
+    type: Llava
+    path: model path
+    torch_dtype: auto
+eval:
+
+    eval_pos: [pretrain, transformed]
+    type: vqa
+    name: [mme]
+    download: False
+    path: MME dataset path
+    bs: 1
+    inference_per_block: False
+sparse:
+    vision:
+        method: TokenReduction
+        special:
+            method: VisPruner
+            prune_ratio: 0.778 # 0.667 0.778 0.889
+            important_ratio: 0.5
+save:
+    save_trans: False
+    save_fake: False
+    save_path: /path/to/save/
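For orientation, the VisPruner module added below computes its keep budget as round(vision_token_length * (1 - prune_ratio)) and then splits it with important_ratio. A small sketch of that arithmetic, assuming the 576 visual tokens of a LLaVA-1.5 vision tower (the model path in this config is only a placeholder):

# Token-budget arithmetic implied by the config above; 576 visual tokens is an
# assumption about the Llava model being pointed to, not something the config states.
vision_token_length = 576
important_ratio = 0.5
for prune_ratio in (0.667, 0.778, 0.889):
    kept = round(vision_token_length * (1 - prune_ratio))
    important = int(kept * important_ratio)
    diverse = kept - important
    print(f'{prune_ratio}: keep {kept} ({important} important + {diverse} diverse)')
# 0.667: keep 192 (96 important + 96 diverse)
# 0.778: keep 128 (64 important + 64 diverse)
# 0.889: keep 64 (32 important + 32 diverse)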

llmc/compression/token_reduction/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -13,3 +13,5 @@
 from .sparsevlm import SparseVLM
 from .tome import ToMe
 from .visionzip import VisionZip
+from .vispruner import VisPruner
+from .visualizer import Visualizer
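These two imports are what make the new methods selectable from a config: the @TOKEN_REDUCTION_REGISTRY.register('VisPruner') decorator in the new module runs at import time and maps the string used under special.method to the class. A minimal sketch of a decorator-based registry of this kind, assuming a plain dict design rather than llmc's actual implementation:

# Hypothetical registry sketch; llmc's TOKEN_REDUCTION_REGISTRY may differ in detail.
class Registry:
    def __init__(self):
        self._classes = {}

    def register(self, name):
        def decorator(cls):
            self._classes[name] = cls  # executed when the module is imported
            return cls
        return decorator

    def build(self, name, *args, **kwargs):
        return self._classes[name](*args, **kwargs)


TOKEN_REDUCTION_REGISTRY = Registry()


@TOKEN_REDUCTION_REGISTRY.register('VisPruner')
class DummyPruner:  # stand-in for the real class
    pass


print(type(TOKEN_REDUCTION_REGISTRY.build('VisPruner')).__name__)  # DummyPruner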

llmc/compression/token_reduction/visionzip.py

Lines changed: 16 additions & 5 deletions
@@ -454,7 +454,7 @@ def wrapper(self, *args, **kwargs):

 def merger_hook(module, inputs, kwargs, layer_outs, pruning_paras):
     with torch.no_grad():
-        attn_mean = pruning_paras['attn_logits'].mean(dim=0)
+        attn_mean = pruning_paras['attn_logits'].mean(dim=0)  # 16 1120, 1120 -> 1120, 1120
         attn_key = pruning_paras['attn_key']

         window_index, _ = module.get_window_index(kwargs['grid_thw'])

@@ -539,10 +539,21 @@ def prune_qwenv25vl_hook(module, args, kwargs, pruning_paras):
     st_idx = torch.nonzero(img_mask, as_tuple=True)[0]

     if st_idx.numel() > 0:
-        first, last = st_idx[0].item(), st_idx[-1].item()
-        img_mask[first: last + 1] = ~select_mask
+        discontinuities = torch.where(st_idx[1:] - st_idx[:-1] != 1)[0]
+        if discontinuities.numel() > 0:
+            raise ValueError('Visual tokens are not contiguous in input_ids!')
+        segment_starts = [st_idx[0].item()] + [st_idx[i + 1].item() for i in discontinuities.tolist()]  # noqa
+        segment_ends = [st_idx[i].item() for i in discontinuities.tolist()] + [st_idx[-1].item()]  # noqa
+        offset = 0
+        for first, last in zip(segment_starts, segment_ends):
+            length = last - first + 1
+            # [15 1502] [1505 3289]
+            img_mask[first: last + 1] = ~select_mask[offset: offset + length]
+    else:
+        first, last = st_idx[0].item(), st_idx[-1].item()
+        img_mask[first: last + 1] = ~select_mask
     img_mask = ~img_mask
-    contexual_input_idx = false_pos[target_indices] + first
+    contextual_input_idx = false_pos[target_indices] + first

     hidden_states_filtered = inputs_embeds[:, first: last + 1][:, contextual_mask]
     hidden_to_merge = hidden_states_filtered[

@@ -562,7 +573,7 @@ def prune_qwenv25vl_hook(module, args, kwargs, pruning_paras):

     kwargs['position_ids'] = position_ids[:, :, img_mask]
     kwargs['attention_mask'] = attention_mask[:, img_mask]
-    inputs_embeds[:, contexual_input_idx] = contextual_tokens
+    inputs_embeds[:, contextual_input_idx] = contextual_tokens
     kwargs['inputs_embeds'] = inputs_embeds[:, img_mask]
     del contextual_tokens, hidden_states_filtered, hidden_to_merge, aggregated_hidden
     torch.cuda.empty_cache()
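The rewritten branch in prune_qwenv25vl_hook splits the visual-token positions into contiguous runs before applying the keep/drop mask, instead of assuming one [first, last] span; as written, a detected gap raises, so the segment loop effectively guards the single-run case. A standalone sketch of the discontinuity test on a dummy index tensor (the values are illustrative, not from the repository):

import torch

# Positions of visual tokens in input_ids: two contiguous runs, with a gap after 6.
st_idx = torch.tensor([3, 4, 5, 6, 9, 10, 11])

# A gap exists wherever consecutive positions differ by more than 1.
discontinuities = torch.where(st_idx[1:] - st_idx[:-1] != 1)[0]  # tensor([3])

segment_starts = [st_idx[0].item()] + [st_idx[i + 1].item() for i in discontinuities.tolist()]
segment_ends = [st_idx[i].item() for i in discontinuities.tolist()] + [st_idx[-1].item()]

print(list(zip(segment_starts, segment_ends)))  # [(3, 6), (9, 11)]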
Lines changed: 119 additions & 0 deletions
@@ -0,0 +1,119 @@
+import functools
+
+import torch
+
+from llmc.utils.registry_factory import TOKEN_REDUCTION_REGISTRY
+
+from .token_reduction_module import TokenReductionModule
+
+
+@TOKEN_REDUCTION_REGISTRY.register('VisPruner')
+class VisPruner(TokenReductionModule):
+    def __init__(self, config, model, blocks):
+        super().__init__(config, model, blocks)
+        self.add_sparse_config()
+        self.register_reduction_modules()
+
+    def add_sparse_config(self):
+        self.special_config['select_layer'] = self.model.pruning_config.get(
+            'select_layer', -1
+        )
+        self.special_config['select_feature'] = self.model.pruning_config.get(
+            'select_feature', None
+        )
+
+        self.pruning_paras = self.special_config
+
+    def register_reduction_modules(self):
+
+        def update_output_attentions_hook(module, args, kwargs):
+            kwargs['output_attentions'] = True
+
+        def store_attention_hook(module, inps, outs, pruning_paras):
+            image_attentions = outs.attentions[pruning_paras['select_layer']]
+            if pruning_paras['select_feature'] == 'patch':
+                image_attentions = image_attentions[:, :, 0, 1:]
+            elif pruning_paras['select_feature'] == 'cls_patch':
+                image_attentions = image_attentions
+                raise ValueError(f'Unexpected select feature: {self.select_feature}')
+
+            pruning_paras['image_attentions'] = image_attentions.to(inps[0].dtype)
+
+        def get_index_masks_hook(module, args, pruning_paras):
+            image_features = args[0]
+            image_attentions = pruning_paras['image_attentions']
+
+            B, N, C = image_features.shape
+            device = image_features.device
+            index_masks = torch.ones(B, N, dtype=torch.bool, device=device)
+
+            visual_token_num = round(
+                self.special_config['vision_token_length'] * (
+                    1 - self.special_config['prune_ratio']
+                )
+            )  # T
+            important_ratio = self.pruning_paras['important_ratio']  # r
+            important_token_num = int(visual_token_num * important_ratio)  # T_imp = T * r
+            diverse_token_num = visual_token_num - important_token_num  # T_div = T * (1 - r)
+
+            # [VisPruner] Select important tokens using attention scores
+            image_attentions = image_attentions.mean(dim=1)  # (B, N)
+            token_indices = image_attentions.argsort(dim=-1, descending=True)  # (B, N)
+            important_indices = token_indices[:, :important_token_num]  # (B, T_imp)
+            residual_indices = token_indices[:, important_token_num:]  # (B, N - T_imp)
+
+            # [VisPruner] Remove duplicate tokens by iterative matching and pruning
+            image_normalized = image_features / image_features.norm(dim=-1, keepdim=True)
+            while diverse_token_num > 0:
+                R = residual_indices.shape[1]
+                r = min(8, R - diverse_token_num)
+                if r <= 0:
+                    break
+
+                residual_tokens = image_normalized[
+                    torch.arange(B).unsqueeze(-1).expand(-1, R),
+                    residual_indices
+                ]  # (B, R, C)
+                a, b = residual_tokens[..., ::2, :], residual_tokens[..., 1::2, :]  # (B, R // 2, C)
+                scores = a @ b.transpose(-1, -2)  # (B, R // 2, R // 2)
+                scores = scores.max(dim=-1).values  # (B, R // 2)
+
+                distinct_indices = scores.argsort(dim=-1, descending=True)[:, r:]  # (B, R // 2 - r)
+                residual_indices = torch.cat([
+                    residual_indices[..., ::2][
+                        torch.arange(B).unsqueeze(-1).expand(-1, R // 2 - r),
+                        distinct_indices
+                    ],
+                    residual_indices[..., 1::2]
+                ], dim=-1)  # (B, R - r)
+
+            if diverse_token_num > 0:
+                selected_indices = torch.cat([important_indices, residual_indices], dim=-1)
+            else:
+                selected_indices = important_indices  # (B, T)
+            index_masks = torch.zeros(B, N, dtype=torch.bool, device=device)
+            index_masks.scatter_(1, selected_indices, True)
+
+            pruning_paras['index_masks'] = index_masks
+
+        def prune_hook(module, inputs, outputs, pruning_paras):
+            image_features = outputs
+            index_masks = pruning_paras['index_masks']
+            return image_features[index_masks].unsqueeze(0)
+
+        self.model.vision_model.vision_tower.register_forward_pre_hook(
+            update_output_attentions_hook,
+            with_kwargs=True
+        )
+
+        self.model.vision_model.vision_tower.register_forward_hook(
+            functools.partial(store_attention_hook, pruning_paras=self.pruning_paras),
+        )
+
+        self.model.vision_projector.register_forward_pre_hook(
+            functools.partial(get_index_masks_hook, pruning_paras=self.pruning_paras),
+        )
+
+        self.model.vision_projector.register_forward_hook(
+            functools.partial(prune_hook, pruning_paras=self.pruning_paras),
+        )
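To make the two-stage selection in get_index_masks_hook concrete, here is a standalone sketch on random tensors. It is lightly restructured (the while condition replaces the break), and the shapes are assumptions: one image, 576 visual tokens, a 64-dim feature, 16 attention heads; prune_ratio and important_ratio come from the config above.

import torch

B, N, C = 1, 576, 64                      # batch, visual tokens, feature dim (dummy values)
prune_ratio, important_ratio = 0.778, 0.5

image_features = torch.randn(B, N, C)
image_attentions = torch.rand(B, 16, N)   # dummy per-head [CLS]-to-patch attention

visual_token_num = round(N * (1 - prune_ratio))                  # T = 128
important_token_num = int(visual_token_num * important_ratio)    # T_imp = 64
diverse_token_num = visual_token_num - important_token_num       # T_div = 64

# Stage 1: keep the tokens ranked highest by mean attention.
attn = image_attentions.mean(dim=1)                              # (B, N)
order = attn.argsort(dim=-1, descending=True)
important_indices = order[:, :important_token_num]
residual_indices = order[:, important_token_num:]

# Stage 2: thin the remainder by repeatedly dropping the tokens most similar
# to another residual token, until T_div diverse tokens are left.
feats = image_features / image_features.norm(dim=-1, keepdim=True)
while residual_indices.shape[1] > diverse_token_num:
    R = residual_indices.shape[1]
    r = min(8, R - diverse_token_num)
    tokens = feats[torch.arange(B).unsqueeze(-1).expand(-1, R), residual_indices]
    a, b = tokens[..., ::2, :], tokens[..., 1::2, :]
    sim = (a @ b.transpose(-1, -2)).max(dim=-1).values           # (B, R // 2)
    keep = sim.argsort(dim=-1, descending=True)[:, r:]           # drop the r most redundant
    residual_indices = torch.cat([
        residual_indices[..., ::2].gather(-1, keep),
        residual_indices[..., 1::2],
    ], dim=-1)

selected = torch.cat([important_indices, residual_indices], dim=-1)      # (B, 128)
mask = torch.zeros(B, N, dtype=torch.bool).scatter_(1, selected, True)
print(mask.sum(dim=1))  # tensor([128])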
Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
+import functools
+
+from llmc.utils.registry_factory import TOKEN_REDUCTION_REGISTRY
+from llmc.utils.visualizer import (visualize_grid_to_grid, visualize_heads,
+                                   visualize_kept_patches)
+
+from .token_reduction_module import TokenReductionModule
+from .utils import prefill_wrapper
+
+
+@TOKEN_REDUCTION_REGISTRY.register('Visualizer')
+class Visualizer(TokenReductionModule):
+    def __init__(self, config, model, blocks):
+        super().__init__(config, model, blocks)
+        self.add_sparse_config()
+        self.register_reduction_modules()
+
+    def add_sparse_config(self):
+        self.pruning_paras = self.special_config
+        self.pruning_paras['attentions'] = []
+
+    def register_reduction_modules(self):
+
+        @prefill_wrapper
+        def update_attentions_hook(module, args, kwargs):
+            kwargs['output_attentions'] = True
+            return args, kwargs
+
+        @prefill_wrapper
+        def get_images_hook(module, input_args, pruning_paras):
+            pruning_paras['images'] = input_args[0]
+            return input_args
+
+        @prefill_wrapper
+        def get_attentions_hook(module, inps, layer_outs, pruning_paras):
+            pruning_paras['attentions'].append(layer_outs[1])
+            return layer_outs
+
+        @prefill_wrapper
+        def visualizer_hook(module, inps, layer_outs, pruning_paras):
+            attention_maps = pruning_paras['attentions'][0]
+            visual_attention_maps = attention_maps[:, :, 35: 35 + 576, 35: 35 + 576]
+            image = pruning_paras['images'][0]
+
+            visualize_heads(
+                visual_attention_maps[:, :6],
+                cols=4,
+                save_path=''
+            )
+            visualize_grid_to_grid(
+                visual_attention_maps[0, 4, :, :],
+                300,
+                image,
+                grid_size=24,
+                save_path=''
+            )
+            visualize_kept_patches(
+                pruning_paras['images'][0],
+                pruning_paras['visual_keep_indexs'],
+                save_path='',
+            )
+            return layer_outs
+
+        self.model.vision_model.register_forward_pre_hook(
+            functools.partial(get_images_hook, pruning_paras=self.pruning_paras),
+        )
+
+        for idx, blk in enumerate(self.blocks):
+            if idx == 5:
+                blk.register_forward_pre_hook(update_attentions_hook, with_kwargs=True)
+                blk.register_forward_hook(
+                    functools.partial(get_attentions_hook, pruning_paras=self.pruning_paras),
+                )
+            if idx == (len(self.blocks) - 1):
+                blk.register_forward_hook(
+                    functools.partial(visualizer_hook, pruning_paras=self.pruning_paras),
+                )
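Both new modules do their work entirely through PyTorch forward hooks rather than by editing model code: a pre-hook forces output_attentions on, forward hooks capture or replace module outputs, and functools.partial threads the shared pruning_paras dict into each hook. A minimal sketch of that pattern on a toy module (the Toy class and its scale argument are assumptions for illustration; with_kwargs=True needs PyTorch 2.0 or newer):

import functools

import torch
import torch.nn as nn


class Toy(nn.Module):
    def forward(self, x, scale=1.0):
        return x * scale


def force_kwarg_hook(module, args, kwargs):
    # pre-hook registered with with_kwargs=True may rewrite kwargs before forward runs
    kwargs['scale'] = 2.0
    return args, kwargs


def capture_output_hook(module, inputs, output, store):
    # forward hook observes (or could replace) the module output
    store['out'] = output.detach()


captured = {}
toy = Toy()
toy.register_forward_pre_hook(force_kwarg_hook, with_kwargs=True)
toy.register_forward_hook(functools.partial(capture_output_hook, store=captured))

print(toy(torch.ones(3)), captured['out'])  # tensor([2., 2., 2.]) twice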

llmc/eval/eval_vqa.py

Lines changed: 4 additions & 0 deletions
@@ -89,6 +89,10 @@ def eval(
         datetime_str: str = get_datetime_str(),
         cli_args=None,
     ):
+        import argparse
+        cli_args = argparse.Namespace(
+            process_with_media=True,
+        )

         model = llmc_model.eval_name
         model_args = 'pretrained=' + self.model_path + ',device_map=auto'

llmc/models/llava.py

Lines changed: 3 additions & 8 deletions
@@ -39,15 +39,10 @@ def build_tokenizer(self):
         pass

     def build_model(self):
-        self.llava_config = LlavaConfig.from_pretrained(
-            self.model_path, trust_remote_code=True
-        )
+
         self.vlm_model_config = AutoConfig.from_pretrained(
             self.model_path, trust_remote_code=True
         )
-        # llava need: use_cache
-        self.llava_config.use_cache = True
-        self.vlm_model_config.use_cache = True
         logger.info(f'self.vlm_model_config : {self.vlm_model_config}')

         self.tokenizer, self.vlm_model, self.image_processor, context_len = load_pretrained_model(

@@ -70,8 +65,8 @@ def build_model(self):
         self.pruning_config = {
             'is_video_model': False,
             'image_token_length': self.vlm_model_config.image_seq_length,
-            'select_layer': self.vlm_model_config.vision_feature_layer,
-            'select_feature': self.vlm_model_config.vision_feature_select_strategy,
+            'select_layer': self.vision_model.select_layer,
+            'select_feature': self.vision_model.select_feature,
             'image_token_index': IMAGE_TOKEN_INDEX,
             'IMAGE_TOKEN_INDEX': IMAGE_TOKEN_INDEX,  # for llava
             'vision_token_start_index': 35,

llmc/models/qwen2_5vl.py

Lines changed: 1 addition & 2 deletions
@@ -62,8 +62,6 @@ def build_model(self):

         self.min_pixels = 256 * 28 * 28
         self.max_pixels = 1280 * 28 * 28
-        logger.warning(f'min_pixels is set to: {self.min_pixels}')
-        logger.warning(f'max_pixels is set to: {self.max_pixels}')
         self.processor = AutoProcessor.from_pretrained(
             self.model_path,
             min_pixels=self.min_pixels,

@@ -76,6 +74,7 @@ def build_model(self):
             'vision_start_token_id': self.vlm_model_config.vision_start_token_id,
             'vision_token_start_index': 15
         }
+        self.first_turn_question = True

     # todo: check
     def get_subsets_in_block(self, block):
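The retained min_pixels / max_pixels values bound the visual-token count per image: Qwen2.5-VL's processor yields roughly one visual token per 28x28-pixel patch, so these settings correspond to about 256 to 1280 image tokens (a back-of-the-envelope check, not something the diff states):

# Rough token-budget check for the pixel bounds kept above
# (assumes ~one visual token per 28x28 patch, as in the Qwen2-VL / Qwen2.5-VL processors).
patch_area = 28 * 28
min_pixels = 256 * patch_area    # 200704
max_pixels = 1280 * patch_area   # 1003520
print(min_pixels // patch_area, max_pixels // patch_area)  # 256 1280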
