from lightllm.models.qwen2_vl.qwen2_visual import PatchEmbed, VisionRotaryEmbedding
from lightllm.models.vit.triton_kernel.flashattention_nopad import flash_attention_fwd
from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
- from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb

# adapted from
# https://github.com/huggingface/transformers/blob/
# be37d34f44ff1bc928e59ffb8a30adecab8835a8/src
# /transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py#L30C1-L31C1
+ class Qwen2_5_VLVisionConfig(PretrainedConfig):
+     model_type = "qwen2_5_vl"
+
+     def __init__(
+         self,
+         depth=32,
+         hidden_size=3584,
+         hidden_act="silu",
+         intermediate_size=3420,
+         num_heads=16,
+         in_channels=3,
+         patch_size=14,
+         spatial_merge_size=2,
+         temporal_patch_size=2,
+         tokens_per_second=4,
+         window_size=112,
+         out_hidden_size=3584,
+         fullatt_block_indexes=[7, 15, 23, 31],
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+
+         self.depth = depth
+         self.hidden_size = hidden_size
+         self.hidden_act = hidden_act
+         self.intermediate_size = intermediate_size
+         self.num_heads = num_heads
+         self.in_channels = in_channels
+         self.patch_size = patch_size
+         self.spatial_merge_size = spatial_merge_size
+         self.temporal_patch_size = temporal_patch_size
+         self.tokens_per_second = tokens_per_second
+         self.window_size = window_size
+         self.fullatt_block_indexes = fullatt_block_indexes
+         self.out_hidden_size = out_hidden_size
+
+
class Qwen2RMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
@@ -68,6 +104,27 @@ def forward(self, hidden_state):
        return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state))


+ def rotate_half(x):
+     """Rotates half the hidden dims of the input."""
+     x1 = x[..., : x.shape[-1] // 2]
+     x2 = x[..., x.shape[-1] // 2 :]
+     return torch.cat((-x2, x1), dim=-1)
+
+
+ def apply_rotary_pos_emb_vision(
+     q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+     orig_q_dtype = q.dtype
+     orig_k_dtype = k.dtype
+     q, k = q.float(), k.float()
+     cos, sin = cos.unsqueeze(-2).float(), sin.unsqueeze(-2).float()
+     q_embed = (q * cos) + (rotate_half(q) * sin)
+     k_embed = (k * cos) + (rotate_half(k) * sin)
+     q_embed = q_embed.to(orig_q_dtype)
+     k_embed = k_embed.to(orig_k_dtype)
+     return q_embed, k_embed
+
+
class Qwen2_5_VLVisionFlashAttention(nn.Module):
    def __init__(self, dim: int, num_heads: int = 16) -> None:
        super().__init__()
@@ -76,27 +133,26 @@ def __init__(self, dim: int, num_heads: int = 16) -> None:
        self.qkv = nn.Linear(dim, dim * 3, bias=True)
        self.proj = nn.Linear(dim, dim)

-     def apply_rotary_pos_emb_vision(self, t: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
-         t_ = t.float()
-         cos = freqs.cos()
-         sin = freqs.sin()
-         output = apply_rotary_emb(t_, cos, sin).type_as(t)
-         return output
-
    def forward(
        self,
        hidden_states: torch.Tensor,
        cu_seqlens: torch.Tensor,
-         max_seqlen: int = 0,
        rotary_pos_emb: Optional[torch.Tensor] = None,
+         position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ) -> torch.Tensor:
        seq_length = hidden_states.shape[0]
        q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
-         q = self.apply_rotary_pos_emb_vision(q, rotary_pos_emb)
-         k = self.apply_rotary_pos_emb_vision(k, rotary_pos_emb)
+         if position_embeddings is None:
+             emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+             cos = emb.cos()
+             sin = emb.sin()
+         else:
+             cos, sin = position_embeddings
+         q, k = apply_rotary_pos_emb_vision(q, k, cos, sin)

+         cu_seqlens = cu_seqlens.to(q.device, torch.int32)
+         max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
        attn_output = g_cache_manager.alloc_tensor(q.shape, q.dtype, device=q.device)
-
        flash_attention_fwd(q, k, v, attn_output, cu_seqlens, max_seqlen)
        attn_output = attn_output.reshape(seq_length, -1)
        attn_output = self.proj(attn_output)
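
For reference, here is a minimal standalone sketch of what the cu_seqlens / max_seqlen pair consumed by flash_attention_fwd encodes; the toy sizes are invented for illustration, and plain scaled_dot_product_attention stands in for the Triton varlen kernel:

    import torch
    import torch.nn.functional as F

    # Three images packed along one token axis: 4, 6 and 2 patches.
    seq_lens = torch.tensor([4, 6, 2], dtype=torch.int32)
    cu_seqlens = F.pad(seq_lens.cumsum(0, dtype=torch.int32), (1, 0))  # [0, 4, 10, 12]
    max_seqlen = int((cu_seqlens[1:] - cu_seqlens[:-1]).max())         # 6, as recomputed in forward()

    num_heads, head_dim = 2, 8
    q = k = v = torch.randn(int(cu_seqlens[-1]), num_heads, head_dim)

    # Emulate the varlen kernel: attend within each [start, end) segment only.
    out = torch.empty_like(q)
    for start, end in zip(cu_seqlens[:-1].tolist(), cu_seqlens[1:].tolist()):
        qi, ki, vi = (t[start:end].transpose(0, 1) for t in (q, k, v))  # (heads, tokens, dim)
        out[start:end] = F.scaled_dot_product_attention(qi, ki, vi).transpose(0, 1)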
@@ -127,14 +183,12 @@ def forward(
        self,
        hidden_states: torch.Tensor,
        cu_seqlens: torch.Tensor,
-         max_seqlen: int = 0,
        rotary_pos_emb: Optional[torch.Tensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ) -> torch.Tensor:
        hidden_states = hidden_states + self.attn(
            self.norm1(hidden_states),
            cu_seqlens=cu_seqlens,
-             max_seqlen=max_seqlen,
            rotary_pos_emb=rotary_pos_emb,
            position_embeddings=position_embeddings,
        )
@@ -178,7 +232,6 @@ def __init__(
        **kwargs,
    ):
        super().__init__()
-         self.data_type = kvargs.get("data_type", "bfloat16")

        self.depth = depth
        self.hidden_size = hidden_size
@@ -204,7 +257,7 @@ def __init__(
        )

        head_dim = self.hidden_size // self.num_heads
-         self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2).to("cuda", non_blocking=True)
+         self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2).to("cuda", dtype=self.get_dtype(), non_blocking=True)

        self.blocks = nn.ModuleList(
            [
@@ -226,62 +279,41 @@ def __init__(

        self.gradient_checkpointing = False

-         self._init_datatype()
-
-     def _init_datatype(self):
-         if isinstance(self.data_type, torch.dtype):
-             return
-         if self.data_type in ["fp16", "float16"]:
-             self.data_type = torch.float16
-         elif self.data_type in ["bf16", "bfloat16"]:
-             self.data_type = torch.bfloat16
-         elif self.data_type in ["fp32", "float32"]:
-             self.data_type = torch.float32
-         else:
-             raise ValueError(f"Unsupport datatype {self.data_type}!")
-         return
+         self.device = self.get_device()
+         self.dtype = self.get_dtype()

-     def load_model(self, weight_dir):
+     def get_dtype(self) -> torch.dtype:
+         return self.blocks[0].mlp.down_proj.weight.dtype

-         processor_config_path = os.path.join(weight_dir, "preprocessor_config.json")
-         with open(processor_config_path, "r") as f:
-             processor_config_dict = json.load(f)
-         self.processor = Qwen2VLImageProcessor(**processor_config_dict)
-
-         bin_weight_files = [file_ for file_ in os.listdir(weight_dir) if file_.endswith(".bin")]
-         if bin_weight_files:
-             weight_dict = {}
-             for file_ in bin_weight_files:
-                 f = torch.load(os.path.join(weight_dir, file_), "cpu")
-                 for k, v in f.items():
-                     if "visual" in k:
-                         weight_dict[k[len("visual.") :]] = v
-
-         else:
-             hf_weight_files = [file_ for file_ in os.listdir(weight_dir) if file_.endswith(".safetensors")]
-             weight_dict = {}
-             for file_ in hf_weight_files:
-                 f = safe_open(os.path.join(weight_dir, file_), "pt", "cpu")
-                 for k in f.keys():
-                     if "visual" in k:
-                         weight_dict[k[len("visual.") :]] = f.get_tensor(k)
-
-         self.load_state_dict(weight_dict)
+     def get_device(self) -> torch.device:
+         return self.blocks[0].mlp.down_proj.weight.device

    def rot_pos_emb(self, grid_thw):
        pos_ids = []
-         s = self.spatial_merge_size
-         for _, h, w in grid_thw:
-             pos_shape = (h // s, s, w // s, s)
+         for t, h, w in grid_thw:
            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
-             wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
-             hpos_ids = hpos_ids.reshape(pos_shape).permute(0, 2, 1, 3).flatten()
-             wpos_ids = wpos_ids.reshape(pos_shape).permute(0, 2, 1, 3).flatten()
+             hpos_ids = hpos_ids.reshape(
+                 h // self.spatial_merge_size,
+                 self.spatial_merge_size,
+                 w // self.spatial_merge_size,
+                 self.spatial_merge_size,
+             )
+             hpos_ids = hpos_ids.permute(0, 2, 1, 3)
+             hpos_ids = hpos_ids.flatten()

-             pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1))
+             wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
+             wpos_ids = wpos_ids.reshape(
+                 h // self.spatial_merge_size,
+                 self.spatial_merge_size,
+                 w // self.spatial_merge_size,
+                 self.spatial_merge_size,
+             )
+             wpos_ids = wpos_ids.permute(0, 2, 1, 3)
+             wpos_ids = wpos_ids.flatten()
+             pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
        pos_ids = torch.cat(pos_ids, dim=0)
        max_grid_size = grid_thw[:, 1:].max()
-         rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size).type(torch.float32)
+         rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
        return rotary_pos_emb

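As a standalone illustration of the merge-unit ordering that rot_pos_emb produces, here is a toy version assuming one frame of a 4x4 patch grid and spatial_merge_size = 2 (all names below are local to the example):

    import torch

    spatial_merge_size = 2
    t, h, w = 1, 4, 4  # one frame of a 4x4 patch grid (toy sizes)

    def merge_order(x):
        # Regroup an (h, w) index map into 2x2 merge windows so that the four
        # patches that get merged downstream are adjacent in the flat order.
        return (
            x.reshape(h // spatial_merge_size, spatial_merge_size,
                      w // spatial_merge_size, spatial_merge_size)
            .permute(0, 2, 1, 3)
            .flatten()
        )

    hpos = torch.arange(h).unsqueeze(1).expand(-1, w)
    wpos = torch.arange(w).unsqueeze(0).expand(h, -1)
    pos_ids = torch.stack([merge_order(hpos), merge_order(wpos)], dim=-1).repeat(t, 1)
    print(pos_ids[:4].tolist())  # [[0, 0], [0, 1], [1, 0], [1, 1]] -- one 2x2 merge window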
@@ -328,7 +360,7 @@ def get_window_index(self, grid_thw):

    def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor:
        hidden_states = self.patch_embed(hidden_states)
-         rotary_pos_emb = self.rot_pos_emb(grid_thw).to("cuda", non_blocking=True)
+         rotary_pos_emb = self.rot_pos_emb(grid_thw)
        window_index, cu_window_seqlens = self.get_window_index(grid_thw)
        cu_window_seqlens = torch.tensor(
            cu_window_seqlens,
@@ -344,14 +376,20 @@ def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch.
        rotary_pos_emb = rotary_pos_emb.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
        rotary_pos_emb = rotary_pos_emb[window_index, :, :]
        rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1)
-         emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+         emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1).to("cuda", dtype=self.get_dtype(), non_blocking=True)
+         position_embeddings = (emb.cos(), emb.sin())

        cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
            dim=0,
+             # Select dtype based on the following factors:
+             #  - FA2 requires that cu_seqlens_q must have dtype int32
+             #  - torch.onnx.export requires that cu_seqlens_q must have same
+             #    dtype as grid_thw
+             # See https://github.com/huggingface/transformers/pull/34852
+             # for more information
            dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
        )
        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
-         max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()

        for layer_num, blk in enumerate(self.blocks):
            if layer_num in self.fullatt_block_indexes:
@@ -361,8 +399,7 @@ def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch.
            hidden_states = blk(
                hidden_states,
                cu_seqlens=cu_seqlens_now,
-                 max_seqlen=max_seqlen,
-                 position_embeddings=emb,
+                 position_embeddings=position_embeddings,
            )

        hidden_states = self.merger(hidden_states)
@@ -371,6 +408,33 @@ def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch.

        return hidden_states

+     def load_model(self, weight_dir):
+
+         processor_config_path = os.path.join(weight_dir, "preprocessor_config.json")
+         with open(processor_config_path, "r") as f:
+             processor_config_dict = json.load(f)
+         self.processor = Qwen2VLImageProcessor(**processor_config_dict)
+
+         bin_weight_files = [file_ for file_ in os.listdir(weight_dir) if file_.endswith(".bin")]
+         if bin_weight_files:
+             weight_dict = {}
+             for file_ in bin_weight_files:
+                 f = torch.load(os.path.join(weight_dir, file_), "cpu")
+                 for k, v in f.items():
+                     if "visual" in k:
+                         weight_dict[k[len("visual.") :]] = v
+
+         else:
+             hf_weight_files = [file_ for file_ in os.listdir(weight_dir) if file_.endswith(".safetensors")]
+             weight_dict = {}
+             for file_ in hf_weight_files:
+                 f = safe_open(os.path.join(weight_dir, file_), "pt", "cpu")
+                 for k in f.keys():
+                     if "visual" in k:
+                         weight_dict[k[len("visual.") :]] = f.get_tensor(k)
+
+         self.load_state_dict(weight_dict)
+
    def encode(self, images: List[ImageItem]):
        img_tensors = []
        valid_ids = []
@@ -402,7 +466,7 @@ def encode(self, images: List[ImageItem]):
        imgs = torch.cat(img_tensors, dim=0)
        grid_thw = torch.cat(img_grids, dim=0)

-         pixel_values = imgs.to("cuda", dtype=self.data_type, non_blocking=True)
+         pixel_values = imgs.to("cuda", dtype=self.get_dtype(), non_blocking=True)
        image_grid_thw = grid_thw.to("cuda", non_blocking=True)

        all_img_embeds = self.forward(pixel_values, grid_thw=image_grid_thw)
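
For completeness, a small self-contained check of the rotate_half-style rotary application introduced by this diff; the tensor sizes are toy values, and the cos/sin construction mirrors emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1) in forward():

    import torch

    def rotate_half(x):
        x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
        return torch.cat((-x2, x1), dim=-1)

    seq_len, num_heads, head_dim = 5, 2, 8  # toy sizes
    q = torch.randn(seq_len, num_heads, head_dim, dtype=torch.bfloat16)
    k = torch.randn(seq_len, num_heads, head_dim, dtype=torch.bfloat16)

    # Per-token angles for half the head dim, duplicated to cover the full dim.
    freqs = torch.rand(seq_len, head_dim // 2)
    emb = torch.cat((freqs, freqs), dim=-1)
    cos, sin = emb.cos(), emb.sin()

    # Same math as apply_rotary_pos_emb_vision: broadcast over heads, compute in
    # float32, then cast back to the original dtype.
    cos_, sin_ = cos.unsqueeze(-2).float(), sin.unsqueeze(-2).float()
    q_embed = (q.float() * cos_ + rotate_half(q.float()) * sin_).to(q.dtype)
    k_embed = (k.float() * cos_ + rotate_half(k.float()) * sin_).to(k.dtype)
    assert q_embed.shape == q.shape and k_embed.shape == k.shape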