Add decoder of Qwen2.5-VL model.

ai-edge-bot · copybara-github · commit 78bbac23e1c3 · 2025-01-30T15:35:45.000-08:00
- The image encoder and the full Qwen2.5-VL model will be added in follow-up CLs
- The original decoder outputs last hidden states instead of logits, so the verification script applies lm_head to them

PiperOrigin-RevId: 721541592
diff --git a/ai_edge_torch/generative/examples/qwen_vl/__init__.py b/ai_edge_torch/generative/examples/qwen_vl/__init__.py
@@ -0,0 +1,14 @@
+# Copyright 2025 The AI Edge Torch Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
diff --git a/ai_edge_torch/generative/examples/qwen_vl/decoder.py b/ai_edge_torch/generative/examples/qwen_vl/decoder.py
@@ -0,0 +1,91 @@
+# Copyright 2025 The AI Edge Torch Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Example of building decoder for Qwen 2.5 VL models."""
+
+import ai_edge_torch.generative.layers.model_config as cfg
+from ai_edge_torch.generative.utilities import model_builder
+from torch import nn
+
+TENSOR_NAMES = model_builder.TENSOR_NAMES  # Reused as-is from model_builder.
+
+
+class Decoder(model_builder.DecoderOnlyModel):
+  """Qwen 2.5 VL decoder built from the Edge Generative API layers."""
+  # No overrides: all behavior is inherited from DecoderOnlyModel.
+
+
+def get_decoder_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
+  """Builds the decoder configuration of the Qwen 2.5 VL 3B model.
+
+  Args:
+    kv_cache_max_len (int): Maximum sequence length the KV cache can hold.
+      Defaults to 1024.
+
+  Returns:
+    A `cfg.ModelConfig` describing the Qwen 2.5 VL 3B decoder.
+  """
+  # One RMS norm config is shared by both per-block norms and the final norm.
+  rms_norm = cfg.NormalizationConfig(
+      type=cfg.NormalizationType.RMS_NORM,
+      epsilon=1e-06,
+  )
+  attention = cfg.AttentionConfig(
+      num_heads=16,
+      head_dim=128,
+      num_query_groups=2,
+      rotary_base=1000000,
+      rotary_percentage=1.0,
+      qkv_use_bias=True,
+  )
+  feed_forward = cfg.FeedForwardConfig(
+      type=cfg.FeedForwardType.GATED,
+      activation=cfg.ActivationConfig(cfg.ActivationType.SILU),
+      intermediate_size=11008,
+  )
+  transformer_block = cfg.TransformerBlockConfig(
+      attn_config=attention,
+      ff_config=feed_forward,
+      pre_attention_norm_config=rms_norm,
+      post_attention_norm_config=rms_norm,
+  )
+  return cfg.ModelConfig(
+      vocab_size=151936,
+      num_layers=36,
+      max_seq_len=32768,
+      embedding_dim=2048,
+      kv_cache_max_len=kv_cache_max_len,
+      block_configs=transformer_block,
+      final_norm_config=rms_norm,
+      enable_hlfb=True,
+  )
+
+
+def get_fake_decoder_config(**kwargs) -> cfg.ModelConfig:
+  """Returns the decoder config shrunk to 128 vocab, 2 layers, FFN size 64."""
+  fake = get_decoder_config(**kwargs)
+  fake.vocab_size = 128
+  fake.num_layers = 2
+  fake.block_config(0).ff_config.intermediate_size = 64  # The only block config.
+  return fake
+
+
+def build_decoder(checkpoint_path: str, **kwargs) -> nn.Module:
+  """Builds a Decoder from a checkpoint; kwargs go to get_decoder_config."""
+  decoder_config = get_decoder_config(**kwargs)
+  return model_builder.build_decoder_only_model(
+      checkpoint_path=checkpoint_path, config=decoder_config,
+      tensor_names=TENSOR_NAMES, model_class=Decoder
+  )
diff --git a/ai_edge_torch/generative/examples/qwen_vl/verify_decoder.py b/ai_edge_torch/generative/examples/qwen_vl/verify_decoder.py
@@ -0,0 +1,77 @@
+# Copyright 2025 The AI Edge Torch Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Verifies the reauthored decoder of Qwen 2.5 VL 3B models."""
+
+import logging
+import pathlib
+
+from absl import app
+from ai_edge_torch.generative.examples.qwen_vl import decoder
+from ai_edge_torch.generative.utilities import verifier
+import torch
+import transformers
+
+
+class DecoderWrapper(verifier.ModelWrapper):
+  """Wraps a Qwen 2.5 VL decoder and its `lm_head` for verification."""
+
+  def __init__(self, model: torch.nn.Module, lm_head: torch.nn.Module):
+    super().__init__(model)
+    self.lm_head = lm_head  # Applied to the decoder's last hidden states.
+
+  def forward(self, tokens: torch.Tensor) -> torch.Tensor:
+    # The decoder output exposes hidden states; `lm_head` is applied to them.
+    return self.lm_head(self.model.forward(tokens)["last_hidden_state"])
+
+
+def main(_):
+  """Verifies the reauthored decoder against the original with input IDs."""
+  checkpoint = "Qwen/Qwen2.5-VL-3B-Instruct"
+  logging.info("Loading the original model from: %s", checkpoint)
+  model_class = transformers.Qwen2_5_VLForConditionalGeneration
+  original_model = model_class.from_pretrained(checkpoint)
+
+  # Locate the cached dir: the reauthored model is built from the directory
+  # that holds the cached config file.
+  cached_config_file = transformers.utils.cached_file(
+      checkpoint, transformers.utils.CONFIG_NAME
+  )
+  reauthored_checkpoint = pathlib.Path(cached_config_file).parent
+  logging.info("Building the reauthored model from: %s", reauthored_checkpoint)
+  # build_decoder() is annotated to take a str path, so convert the Path.
+  reauthored_model = decoder.build_decoder(str(reauthored_checkpoint))
+
+  # Verify the reauthored model only with input IDs because the original decoder
+  # does not support generate() with prompts.
+  input_ids = [1, 2, 3, 4]
+  try:
+    verifier.verify_with_input_ids(
+        original_model=DecoderWrapper(
+            original_model.model, original_model.lm_head
+        ),
+        reauthored_model=verifier.ReauthoredModelWrapper(reauthored_model),
+        input_ids=input_ids,
+        atol=1e-04,
+    )
+  except AssertionError as e:
+    # Logged rather than re-raised, so the script still returns normally.
+    logging.error("*** FAILED *** verify with input IDs: %s", e)
+  else:
+    logging.info("*** PASSED *** verify with input IDs: %s", input_ids)
+
+
+if __name__ == "__main__":  # Run only when executed as a script.
+  app.run(main)