
Commit 2b8a748

Refactor Engine & ModelAgent interact (#4265)
* remove mllama
* refactor engine <=> model_agent interact part0
* fix chat
* fix long context vlm
* fix long context logits/router experts
* fix mp executor
* support dllm
* support dp
* support chunk prefill
* fix tp
* step stop
* fix pd
* add stepinputs
* fix stop
* fix stop
* fix sampling
* fix comment
* running to waiting
* fix sampling gather inputs
* fix SDAR
* fix dp, fix gather all ids
* fix all_ids
* remove comment
* optimize moe ep silu&mul
* Update config.yaml
* fix typo
* fix spec
* spec metrics

---------

Co-authored-by: zhulinJulia24 <[email protected]>
1 parent 0e335e0 commit 2b8a748

60 files changed (+2489, -2713 lines)

autotest/config.yaml

Lines changed: 1 addition & 3 deletions
@@ -117,7 +117,6 @@ pytorch_chat_model:
   - meta-llama/Llama-4-Scout-17B-16E-Instruct
   - meta-llama/Llama-3.2-1B-Instruct
   - meta-llama/Llama-3.2-3B-Instruct
-  - meta-llama/Llama-3.2-11B-Vision-Instruct
   - meta-llama/Meta-Llama-3-1-8B-Instruct
   - meta-llama/Meta-Llama-3-1-70B-Instruct
   - meta-llama/Meta-Llama-3-8B-Instruct
@@ -219,7 +218,6 @@ turbomind_vl_model:

 pytorch_vl_model:
   tp:
-    - meta-llama/Llama-3.2-11B-Vision-Instruct
     - internlm/Intern-S1
     - internlm/Intern-S1-mini
     - OpenGVLab/InternVL2_5-26B-MPO
@@ -244,7 +242,7 @@ pytorch_vl_model:
     - Qwen/Qwen2.5-VL-7B-Instruct
     - Qwen/Qwen2.5-VL-32B-Instruct
     - THUDM/cogvlm-chat-hf
-    - THUDM/cogvlm2-llama3-chinese-chat-19B
+    # - THUDM/cogvlm2-llama3-chinese-chat-19B # 'HFChatTemplate' object has no attribute 'eoa'
     - THUDM/glm-4v-9b
     - microsoft/Phi-3-vision-128k-instruct
     - microsoft/Phi-3.5-vision-instruct

docs/en/multi_modal/index.rst

Lines changed: 0 additions & 1 deletion
@@ -12,7 +12,6 @@ Vision-Language Models
    cogvlm.md
    minicpmv.md
    phi3.md
-   mllama.md
    qwen2_vl.md
    qwen2_5_vl.md
    molmo.md

docs/en/multi_modal/mllama.md

Lines changed: 0 additions & 67 deletions
This file was deleted.

docs/en/supported_models/supported_models.md

Lines changed: 1 addition & 1 deletion
@@ -65,7 +65,6 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
 | Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | Yes |
 | Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | Yes |
 | Llama3.2 | 1B, 3B | LLM | Yes | Yes | Yes | Yes | Yes |
-| Llama3.2-VL | 11B, 90B | MLLM | Yes | Yes | Yes | - | - |
 | Llama4 | Scout, Maverick | MLLM | Yes | Yes | Yes | - | - |
 | InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | Yes |
 | InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | Yes |
@@ -129,6 +128,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
 ```{note}
 * [1] Currently Mono-InternVL does not support FP16 due to numerical instability. Please use BF16 instead.
 * [2] PyTorch engine removes the support of original llava models after v0.6.4. Please use their corresponding transformers models instead, which can be found in https://huggingface.co/llava-hf
+Starting from version 0.11.1, PytorchEngine no longer provides support for mllama.
 ```

 ## PyTorchEngine on Other Platforms

docs/zh_cn/multi_modal/index.rst

Lines changed: 0 additions & 1 deletion
@@ -12,7 +12,6 @@
    cogvlm.md
    minicpmv.md
    phi3.md
-   mllama.md
    qwen2_vl.md
    qwen2_5_vl.md
    molmo.md

docs/zh_cn/multi_modal/mllama.md

Lines changed: 0 additions & 66 deletions
This file was deleted.

docs/zh_cn/supported_models/supported_models.md

Lines changed: 1 addition & 1 deletion
@@ -65,7 +65,6 @@
 | Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | Yes |
 | Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | Yes |
 | Llama3.2 | 1B, 3B | LLM | Yes | Yes | Yes | Yes | Yes |
-| Llama3.2-VL | 11B, 90B | MLLM | Yes | Yes | Yes | - | - |
 | Llama4 | Scout, Maverick | MLLM | Yes | Yes | Yes | - | - |
 | InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | Yes |
 | InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | Yes |
@@ -129,6 +128,7 @@
 ```{note}
 * [1] Currently, Mono-InternVL does not support FP16 due to numerical instability. Please use BF16 instead.
 * [2] Since v0.6.4, the PyTorch engine no longer supports the original llava model format. We recommend using the corresponding transformers-format models, which can be found at https://huggingface.co/llava-hf
+Since 0.11.1, PytorchEngine has removed support for mllama.
 ```

 ## PyTorchEngine on Other Platforms

lmdeploy/metrics/loggers.py

Lines changed: 4 additions & 2 deletions
@@ -93,10 +93,11 @@ def log_spec_msg(self):
                    f'Accepted: {self.num_accepted_tokens} tokens, '
                    f'Drafted: {self.num_draft_tokens} tokens, '
                    f'Per-position acceptance rate: {rates_str}')
-        print(log_msg, flush=True)
+        return log_msg

     def log(self):
         now = time.perf_counter()
+        spec_msg = self.log_spec_msg()

         # skip logging if no tokens were processed
         if self.total_prompt_tokens == 0 and self.total_generation_tokens == 0:
@@ -121,8 +122,9 @@ def log(self):
                    f'GPU KV cache usage: {scheduler_stats.gpu_cache_usage * 100 :.1f}%, '
                    f'Prefix cache hit rate: {scheduler_stats.prefix_cache_hit_rate * 100 :.1f}%')

+        if spec_msg is not None:
+            log_msg += ', ' + spec_msg
         print(log_msg, flush=True)
-        self.log_spec_msg()


 class PrometheusStatLogger(StatLoggerBase):
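
The loggers change reroutes the speculative-decoding summary: log_spec_msg() now returns its message instead of printing it, and log() appends that message to the single periodic throughput line. A minimal sketch of the pattern, with the class and counter names invented here for illustration (only log(), log_spec_msg(), and the return-then-append flow come from the diff):

```python
# Sketch of the "return the spec summary, append it to the main log line"
# pattern; counter names and the message format are assumptions, not LMDeploy's API.
import time


class IterationStatsSketch:

    def __init__(self):
        self.num_accepted_tokens = 0
        self.num_draft_tokens = 0
        self.total_prompt_tokens = 0
        self.total_generation_tokens = 0

    def log_spec_msg(self):
        """Build the speculative-decoding summary, or None when spec decode is idle."""
        if self.num_draft_tokens == 0:
            return None
        rate = self.num_accepted_tokens / self.num_draft_tokens
        return (f'Accepted: {self.num_accepted_tokens} tokens, '
                f'Drafted: {self.num_draft_tokens} tokens, '
                f'Acceptance rate: {rate:.2f}')

    def log(self):
        now = time.perf_counter()
        spec_msg = self.log_spec_msg()

        # skip logging if no tokens were processed
        if self.total_prompt_tokens == 0 and self.total_generation_tokens == 0:
            return

        log_msg = (f'[{now:.1f}s] '
                   f'Prompt: {self.total_prompt_tokens} tokens, '
                   f'Generation: {self.total_generation_tokens} tokens')
        if spec_msg is not None:
            # spec metrics ride along on the same periodic line
            log_msg += ', ' + spec_msg
        print(log_msg, flush=True)
```

With this shape, one line per logging interval is emitted whether or not speculative decoding is active, instead of a separate print for the spec metrics.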

lmdeploy/pytorch/backends/cuda/attention/default.py

Lines changed: 0 additions & 7 deletions
@@ -24,7 +24,6 @@ class TritonAttentionMetadata(AttentionMetadata):
         q_seqlens: Length of each query sequence [batch_size].
         kv_start_loc: Start location of each KV sequence [batch_size].
         kv_seqlens: Length of each KV sequence [batch_size].
-        fill_seqlens: Fill sequence lengths (for special cases like MLlama).
         quant_policy: Quantization policy (0=none, 4=int4, 8=int8/fp8).
         kv_flatten_size: Total size of flattened KV cache.
         tile_scheduler_metadata: Scheduler metadata for Flash MLA.
@@ -41,7 +40,6 @@ class TritonAttentionMetadata(AttentionMetadata):
     q_seqlens: torch.Tensor = None
     kv_start_loc: torch.Tensor = None
     kv_seqlens: torch.Tensor = None
-    fill_seqlens: torch.Tensor = None
     quant_policy: Literal[0, 4, 8] = 0
     kv_flatten_size: int = None
     # flash mla
@@ -135,11 +133,6 @@ def _get_fill_meta(
         fill_seqlens = attn_metadata.q_seqlens
         fill_max_q_seqlen = max_q_seqlen
         fill_q_start_loc = attn_metadata.q_start_loc
-        # For MLlama only
-        if attn_metadata.fill_seqlens is not None:
-            fill_seqlens = attn_metadata.fill_seqlens
-            fill_max_q_seqlen = key.numel() // (key.size(-1) * key.size(-2))
-            fill_q_start_loc = fill_seqlens.cumsum(0) - fill_seqlens
         return fill_seqlens, fill_max_q_seqlen, fill_q_start_loc

     def _fill_kv_cache_impl(
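
With the MLlama-only branch gone, the fill metadata always mirrors the query layout (q_seqlens and q_start_loc). The only non-trivial step in the deleted branch was recomputing start locations from lengths; the small illustration below restates that exclusive-prefix-sum trick on its own, with a hypothetical helper name:

```python
# Illustration of the start-location computation the deleted branch performed
# (fill_q_start_loc = fill_seqlens.cumsum(0) - fill_seqlens); the helper name
# is hypothetical, only the arithmetic comes from the removed code.
import torch


def exclusive_cumsum(seqlens: torch.Tensor) -> torch.Tensor:
    """Start offset of each sequence inside a packed token buffer."""
    return seqlens.cumsum(0) - seqlens


if __name__ == '__main__':
    print(exclusive_cumsum(torch.tensor([3, 5, 2])))  # tensor([0, 3, 8])
```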

lmdeploy/pytorch/backends/cuda/moe/default.py

Lines changed: 2 additions & 7 deletions
@@ -260,7 +260,7 @@ def experts(
     ):
         from dlblas.utils.utils import DisposibleTensor

-        from lmdeploy.pytorch.kernels.cuda.activation import silu_and_mul
+        from lmdeploy.pytorch.kernels.cuda.activation import silu_and_mul_moe_ep
         from lmdeploy.pytorch.third_party.deep_gemm import m_grouped_bf16_gemm_nt_masked
         num_groups, m, _ = hidden_states.shape
         n = gate_up_weight.size(1)
@@ -269,12 +269,7 @@ def experts(
         m_grouped_bf16_gemm_nt_masked(DisposibleTensor.maybe_unwrap(hidden_states), gate_up_weight, gateup_output,
                                       masked_m, expected_m)
         DisposibleTensor.maybe_dispose(hidden_states)
-        down_input = silu_and_mul(gateup_output.flatten(0, -2))
-        down_input = down_input.view(
-            gateup_output.shape[0],
-            gateup_output.shape[1],
-            gateup_output.shape[2] // 2,
-        )
+        down_input = silu_and_mul_moe_ep(gateup_output, masked_m)
         del gateup_output
         n = gate_down_weight.size(1)
         down_output = down_input.new_empty((num_groups, m, n))
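
This hunk replaces the unfused flatten / silu_and_mul / view sequence with a single silu_and_mul_moe_ep call that also receives masked_m, the per-expert count of valid tokens already used by the masked grouped GEMM. A rough eager-mode sketch of the semantics this implies; the real kernel added in this commit is a fused CUDA implementation, and the input layout (gate half followed by up half) is an assumption from the surrounding code:

```python
# Eager-mode reference for what a masked, per-expert SiLU-and-mul computes:
# activate the gate half, multiply by the up half, ignore rows past each
# expert's valid token count. Shapes and the "_ref" helper are illustrative.
import torch
import torch.nn.functional as F


def silu_and_mul_moe_ep_ref(gateup_output: torch.Tensor, masked_m: torch.Tensor) -> torch.Tensor:
    """gateup_output: [num_experts, max_tokens, 2 * inner_dim] (gate | up),
    masked_m: [num_experts] count of valid tokens routed to each expert."""
    num_experts, max_tokens, two_inner = gateup_output.shape
    gate, up = gateup_output.split(two_inner // 2, dim=-1)
    out = F.silu(gate) * up
    # Zero the padded rows; a fused kernel can simply skip them instead of
    # computing the activation everywhere and masking afterwards.
    valid = torch.arange(max_tokens, device=out.device)[None, :] < masked_m[:, None]
    return out * valid.unsqueeze(-1).to(out.dtype)
```

The presumable advantage over the previous flatten/activate/view sequence is that the fused kernel only visits the masked_m valid rows of each expert group rather than the full padded buffer.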
