@@ -1,4 +1,5 @@
 import functools
+
 import torch
 
 from llmc.utils.registry_factory import TOKEN_REDUCTION_REGISTRY
@@ -17,40 +18,51 @@ def add_sparse_config(self):
         self.pruning_paras = self.special_config
 
     def register_reduction_modules(self):
-
+
         import math
         from typing import Callable, Tuple
 
+        import numpy as np
         import torch.nn.functional as F
         from einops import rearrange
-        import numpy as np
 
         def conditional_pooling(
             feat: torch.Tensor,
-            threshold:float,
+            threshold: float,
             window_size: Tuple[int, int],
         ) -> Tuple[Callable, Callable]:
-
+
             with torch.no_grad():
-
-                ws_h, ws_w = int(window_size[0]), int(window_size[1])  #window size, 2x2
+
+                ws_h, ws_w = int(window_size[0]), int(window_size[1])  # window size, 2x2
                 stride_h, stride_w = ws_h, ws_w
-                num_token_window = stride_h * stride_w  #number of tokens per window, 4
-
-                x_cls, feat = feat[:, :1, :], feat[:, 1:, :]  # all tokens except the cls token, 576 vision tokens in total
+                num_token_window = stride_h * stride_w  # number of tokens per window, 4
+
+                _, feat = (
+                    feat[:, :1, :],
+                    feat[:, 1:, :],
+                )  # all tokens except the cls token, 576 vision tokens in total
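+                # The cls token is dropped here; only the patch-grid tokens are
+                # scored for merging, and merge() re-attaches the cls token later.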
                 B, N, D = feat.size()
                 base_grid_H = int(math.sqrt(N))
                 base_grid_W = base_grid_H
-                assert base_grid_H * base_grid_W == N and base_grid_H % ws_h == 0 and base_grid_W % ws_w == 0
-
-                feat = rearrange(feat, "b (h w) c -> b c h w", h=base_grid_H)
-
-                feat = rearrange(feat, 'b c (gh ps_h) (gw ps_w) -> b gh gw c ps_h ps_w', gh=base_grid_H // ws_h, gw=base_grid_W // ws_w)
+                assert (
+                    base_grid_H * base_grid_W == N
+                    and base_grid_H % ws_h == 0
+                    and base_grid_W % ws_w == 0
+                )
+
+                feat = rearrange(feat, 'b (h w) c -> b c h w', h=base_grid_H)
+
+                feat = rearrange(
+                    feat,
+                    'b c (gh ps_h) (gw ps_w) -> b gh gw c ps_h ps_w',
+                    gh=base_grid_H // ws_h,
+                    gw=base_grid_W // ws_w,
+                )
                 b, gh, gw, c, ps_h, ps_w = feat.shape
 
                 # Flatten mxm window for pairwise operations
                 tensor_flattened = feat.reshape(b, gh, gw, c, -1)
-
 
                 # Expand dims for pairwise operations
                 tensor_1 = tensor_flattened.unsqueeze(-1)
@@ -64,65 +76,95 @@ def conditional_pooling(
                 sims = sims * sims_mask
 
                 # Average similarities (excluding the self-similarity)
-                similarity_map = sims.sum(-1).sum(-1) / ((ps_h * ps_w) * (ps_h * ps_w - 1))
-
-                similarity_map = rearrange(similarity_map.unsqueeze(1), 'b c h w-> b (c h w)')
-
-                #--- adaptive section ---#
-
+                similarity_map = sims.sum(-1).sum(-1) / (
+                    (ps_h * ps_w) * (ps_h * ps_w - 1)
+                )
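+                # A window of ps_h * ps_w tokens has ps_h*ps_w * (ps_h*ps_w - 1)
+                # ordered off-diagonal pairs, which is the divisor used above.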
+
+                similarity_map = rearrange(
+                    similarity_map.unsqueeze(1), 'b c h w-> b (c h w)'
+                )
+
+                # --- adaptive section ---#
+
                 n_B, n_H = similarity_map.shape
                 node_mean = torch.tensor(threshold).cuda(sims.device)
-                node_mean = node_mean.repeat(1,n_H)
+                node_mean = node_mean.repeat(1, n_H)
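+                # r = number of windows whose mean similarity clears the threshold,
+                # taken as the minimum over the batch so every sample merges the
+                # same number of windows.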
                 r = torch.ge(similarity_map, node_mean).sum(dim=1).min()
-                # -------------#
-
-                # get top k similar super patches
-                _, sim_super_patch_idxs = similarity_map.topk(r,dim=-1)
-
-                # --- creating the mergeable and unmergeable super pathes
-                tensor = torch.arange(base_grid_H * base_grid_W).reshape(base_grid_H, base_grid_W).to(feat.device)
+                # -------------#
+
+                # get top k similar super patches
+                _, sim_super_patch_idxs = similarity_map.topk(r, dim=-1)
+
+                # --- creating the mergeable and unmergeable super patches
+                tensor = (
+                    torch.arange(base_grid_H * base_grid_W)
+                    .reshape(base_grid_H, base_grid_W)
+                    .to(feat.device)
+                )
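+                # Flat token ids laid out on the base grid; unfolding this grid
+                # yields, per window, the ids of its member tokens.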
 
                 # Repeat the tensor to create a batch of size B
                 tensor = tensor.unsqueeze(0).repeat(B, 1, 1)
-
 
                 # Apply unfold operation on last two dimensions to create the sliding window
-                windowed_tensor = tensor.unfold(1, ws_h, stride_h).unfold(2, ws_w, stride_w)
+                windowed_tensor = tensor.unfold(1, ws_h, stride_h).unfold(
+                    2, ws_w, stride_w
+                )
 
-                # Reshape the tensor to the desired shape
+                # Reshape the tensor to the desired shape
                 windowed_tensor = windowed_tensor.reshape(B, -1, num_token_window)
-
-                # Use torch.gather to collect the desired elements
-                gathered_tensor = torch.gather(windowed_tensor, 1, sim_super_patch_idxs.unsqueeze(-1).expand(-1, -1, num_token_window))
 
+                # Use torch.gather to collect the desired elements
+                gathered_tensor = torch.gather(
+                    windowed_tensor,
+                    1,
+                    sim_super_patch_idxs.unsqueeze(-1).expand(-1, -1, num_token_window),
+                )
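+                # gathered_tensor: (B, r, num_token_window) token ids of the
+                # r most self-similar windows selected for merging.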
 
                 # Create a mask for all indices, for each batch
-                mask = torch.ones((B, windowed_tensor.shape[1]), dtype=bool).to(feat.device)
+                mask = torch.ones((B, windowed_tensor.shape[1]), dtype=bool).to(
+                    feat.device
+                )
 
                 # Create a tensor that matches the shape of indices and fill it with False
-                mask_values = torch.zeros_like(sim_super_patch_idxs, dtype=torch.bool).to(feat.device)
+                mask_values = torch.zeros_like(
+                    sim_super_patch_idxs, dtype=torch.bool
+                ).to(feat.device)
 
-                # Use scatter_ to update the mask. This will set mask[b, indices[b]] = False for all b
+                # Use scatter_ to update the mask.
+                # This will set mask[b, indices[b]] = False for all b
                 mask.scatter_(1, sim_super_patch_idxs, mask_values)
 
                 # Get the remaining tensor
-                remaining_tensor = windowed_tensor[mask.unsqueeze(-1).expand(-1, -1, num_token_window)].reshape(B, -1, num_token_window)
-                unm_idx = remaining_tensor.reshape(B, -1).sort(dim=-1).values.unsqueeze(-1)
-                dim_index = (num_token_window)-1
-                src_idx = gathered_tensor[:, :, :dim_index].reshape(B, -1).unsqueeze(-1)
-                dst_idx = gathered_tensor[:, :, dim_index].reshape(B, -1).unsqueeze(-1)
-                merge_idx = torch.arange(src_idx.shape[1]//dim_index).repeat_interleave(dim_index).repeat(B, 1).unsqueeze(-1).to(feat.device)
-
-
-            def merge(x: torch.Tensor, mode="mean") -> torch.Tensor:
-                # TODO: num_token_window can be undefined
-
-                x_cls, x_feat = x[:, :1, :], x[:, 1:, :]
+                remaining_tensor = windowed_tensor[
+                    mask.unsqueeze(-1).expand(-1, -1, num_token_window)
+                ].reshape(B, -1, num_token_window)
+                unm_idx = (
+                    remaining_tensor.reshape(B, -1).sort(dim=-1).values.unsqueeze(-1)
+                )
+                dim_index = (num_token_window) - 1
+                src_idx = gathered_tensor[:, :, :dim_index].reshape(B, -1).unsqueeze(-1)
+                dst_idx = gathered_tensor[:, :, dim_index].reshape(B, -1).unsqueeze(-1)
+                merge_idx = (
+                    torch.arange(src_idx.shape[1] // dim_index)
+                    .repeat_interleave(dim_index)
+                    .repeat(B, 1)
+                    .unsqueeze(-1)
+                    .to(feat.device)
+                )
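+                # Within each selected window, the first num_token_window - 1
+                # token ids are merge sources; the last id is the destination.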
+
+            def merge(x: torch.Tensor, mode='mean') -> torch.Tensor:
+                # TODO: num_token_window can be undefined
+
+                x_cls, x_feat = x[:, :1, :], x[:, 1:, :]
                 n, t1, c = x_feat.shape
-                src = x_feat.gather(dim=-2, index=src_idx.expand(n, r * dim_index, c))
+                src = x_feat.gather(dim=-2, index=src_idx.expand(n, r * dim_index, c))
                 dst = x_feat.gather(dim=-2, index=dst_idx.expand(n, r, c))
-                unm = x_feat.gather(dim=-2, index=unm_idx.expand(n, t1 - (r * num_token_window), c))
-                dst = dst.scatter_reduce(-2, merge_idx.expand(n,r * dim_index, c), src, reduce=mode)
+                unm = x_feat.gather(
+                    dim=-2, index=unm_idx.expand(n, t1 - (r * num_token_window), c)
+                )
+                dst = dst.scatter_reduce(
+                    -2, merge_idx.expand(n, r * dim_index, c), src, reduce=mode
+                )
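+                # scatter_reduce folds every source token into its window's
+                # destination token using the requested reduction (mean or sum).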
                 x = torch.cat([dst, unm], dim=1)
                 x = torch.cat((x_cls, x), dim=1)
                 return x
@@ -132,27 +174,27 @@ def merge(x: torch.Tensor, mode="mean") -> torch.Tensor:
         def merge_wavg(
             merge: Callable, x: torch.Tensor, size: torch.Tensor = None
         ) -> Tuple[torch.Tensor, torch.Tensor]:
-
+
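+            # size counts how many original tokens each kept token represents,
+            # so merging computes a size-weighted average rather than a plain mean.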
             if size is None:
                 size = torch.ones_like(x[..., 0, None])
 
-            x = merge(x * size, mode="sum")
-            size = merge(size, mode="sum")
+            x = merge(x * size, mode='sum')
+            size = merge(size, mode='sum')
             x = x / size
-
+
             return x, size
-
+
         def spatial_merge_hook(module, args, kwargs, pruning_paras):
             spatial_threshold = pruning_paras['spatial_threshold']
             window_size = pruning_paras['window_size']
             hidden_states = args[0]
             merge = conditional_pooling(hidden_states, spatial_threshold, window_size)
-            hidden_states, size = merge_wavg(merge, hidden_states, None)
+            hidden_states, size = merge_wavg(merge, hidden_states, None)
             return (hidden_states,) + args[1:], kwargs
-
+
         self.model.set_modality('vision')
         self.model.find_blocks()
         self.model.blocks[1].register_forward_pre_hook(
             functools.partial(spatial_merge_hook, pruning_paras=self.pruning_paras),
-            with_kwargs=True
+            with_kwargs=True,
         )
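
As a quick sanity check on what this hook does to the sequence length, the sketch below walks the token-count arithmetic by hand. The 576-token grid and the 2x2 window come from the comments in the diff; the value of `r` is batch-dependent at runtime, so the number used here is purely illustrative:

# Minimal standalone sketch; r is a hypothetical value for illustration.
N = 576                  # vision tokens on the 24 x 24 grid (cls token excluded)
num_token_window = 4     # tokens per 2 x 2 window
r = 100                  # windows whose mean similarity clears the threshold

# merge() keeps one dst token per selected window plus every token of the
# unselected windows, so each merged window sheds num_token_window - 1 tokens.
tokens_after = N - r * (num_token_window - 1)   # 576 - 300 = 276
print(tokens_after + 1)  # re-attach the cls token -> 277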