diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 60fe5b887952..3eb179385637 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -366,7 +366,7 @@ th {
| `GemmaForCausalLM` | Gemma | `google/gemma-2b`, `google/gemma-1.1-2b-it`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Gemma2ForCausalLM` | Gemma 2 | `google/gemma-2-9b`, `google/gemma-2-27b`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Gemma3ForCausalLM` | Gemma 3 | `google/gemma-3-1b-it`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Gemma3nForCausalLM` | Gemma 3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ |
+| `Gemma3nForCausalLM` | Gemma 3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | ✅︎ | | ✅︎ |
| `GlmForCausalLM` | GLM-4 | `zai-org/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Glm4ForCausalLM` | GLM-4-0414 | `zai-org/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Glm4MoeForCausalLM` | GLM-4.5, GLM-4.6 | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ | ✅︎ |
@@ -671,7 +671,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `Ernie4_5_VLMoeForConditionalGeneration` | Ernie4.5-VL | T + I+/ V+ | `baidu/ERNIE-4.5-VL-28B-A3B-PT`, `baidu/ERNIE-4.5-VL-424B-A47B-PT` | | ✅︎ | ✅︎ |
| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ | ✅︎ |
| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I+ | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | ⚠️ |
-| `Gemma3nForConditionalGeneration` | Gemma 3n | T + I + A | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ |
+| `Gemma3nForConditionalGeneration` | Gemma 3n | T + I + A | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | ✅︎ | | ✅︎ |
| `GLM4VForCausalLM`^ | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + IE+ + VE+ | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + IE+ + VE+ | `zai-org/GLM-4.5V`, etc. | ✅︎ | ✅︎ | ✅︎ |
diff --git a/vllm/lora/layers/base_linear.py b/vllm/lora/layers/base_linear.py
index d2f017c19ccd..20ff51b0da61 100644
--- a/vllm/lora/layers/base_linear.py
+++ b/vllm/lora/layers/base_linear.py
@@ -157,7 +157,12 @@ def apply(
# In transformers backend, x and output have extra batch dimension like
# (1, seq_len, hidden_dim), while punica expects (seq_len, hidden_dim),
# therefore we need to flatten the batch dimensions.
- if x.ndim == 3 and output.ndim == 3:
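+        # Only flatten the trivial leading batch dim (size 1) added by the
+        # transformers backend; leave other 3-D tensors untouched.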
+        if (
+            x.ndim == 3
+            and x.shape[0] == 1
+            and output.ndim == 3
+            and output.shape[0] == 1
+        ):
output = output.flatten(0, 1)
x = x.flatten(0, 1)
diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py
index e4ea4256ebc2..a274e7a21f59 100644
--- a/vllm/model_executor/models/gemma3n.py
+++ b/vllm/model_executor/models/gemma3n.py
@@ -52,7 +52,7 @@
from vllm.sequence import IntermediateTensors
from vllm.v1.attention.backends.utils import KVSharingFastPrefillMetadata
-from .interfaces import SupportsQuant
+from .interfaces import SupportsLoRA, SupportsQuant
from .utils import (
AutoWeightsLoader,
extract_layer_index,
@@ -1081,7 +1081,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
return loaded_params
-class Gemma3nForCausalLM(nn.Module):
+class Gemma3nForCausalLM(nn.Module, SupportsLoRA):
packed_modules_mapping = {
"qkv_proj": [
"q_proj",
diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py
index 0e69fcfd8feb..e0dbbf9dfd5b 100644
--- a/vllm/model_executor/models/gemma3n_mm.py
+++ b/vllm/model_executor/models/gemma3n_mm.py
@@ -20,6 +20,7 @@
from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
+from vllm.config.lora import LoRAConfig
 from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs.data import PromptType
from vllm.logger import init_logger
from vllm.model_executor.layers.layernorm import RMSNorm
@@ -54,7 +55,12 @@
from vllm.sequence import IntermediateTensors
from vllm.utils.tensor_schema import TensorSchema, TensorShape
-from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsTranscription
+from .interfaces import (
+ MultiModalEmbeddings,
+ SupportsLoRA,
+ SupportsMultiModal,
+ SupportsTranscription,
+)
from .utils import (
AutoWeightsLoader,
WeightsMapper,
@@ -390,6 +396,7 @@ def __init__(
self,
multimodal_config: Union[Gemma3nAudioConfig, Gemma3nVisionConfig],
text_config: Gemma3nTextConfig,
+ lora_config: Optional[LoRAConfig] = None,
):
super().__init__()
@@ -399,9 +406,17 @@ def __init__(
self.vocab_size = multimodal_config.vocab_size
self.text_hidden_size = text_config.hidden_size
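+        # When LoRA is enabled, pad the vocabulary with the extra token slots
+        # reserved for adapters (lora_extra_vocab_size per LoRA slot).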
+ lora_vocab = (
+ (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1))
+ if lora_config
+ else 0
+ )
+ self.vocab_size = self.vocab_size + lora_vocab
+
self.embedding = VocabParallelEmbedding(
self.vocab_size,
self.multimodal_hidden_size,
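+            # Original vocab size, so the LoRA padding rows are appended after
+            # the pretrained embeddings.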
+ org_num_embeddings=multimodal_config.vocab_size,
)
self.hard_embedding_norm = RMSNorm(
@@ -445,7 +460,6 @@ def forward(
raise ValueError(
"You must specify exactly one of input_ids or inputs_embeds"
)
-
if inputs_embeds is not None:
emb_norm = self.soft_embedding_norm(inputs_embeds)
else:
@@ -453,6 +467,9 @@ def forward(
emb_norm = self.hard_embedding_norm(hard_emb)
emb_norm_proj, _ = self.embedding_projection(emb_norm)
+ if emb_norm_proj.ndim == 2:
+            # When LoRA is enabled, the wrapped projection flattens the
+            # single-element batch to (seq_len, hidden); restore the batch dim.
+ emb_norm_proj = emb_norm_proj.unsqueeze(0)
return self.embedding_post_projection_norm(emb_norm_proj)
@@ -462,7 +479,7 @@ def forward(
dummy_inputs=Gemma3nDummyInputsBuilder,
)
class Gemma3nForConditionalGeneration(
- nn.Module, SupportsMultiModal, SupportsTranscription
+ nn.Module, SupportsMultiModal, SupportsTranscription, SupportsLoRA
):
merge_by_field_config = True
supported_languages = ISO639_1_SUPPORTED_LANGS
@@ -502,14 +519,15 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
self.quant_config = quant_config
self.multimodal_config = multimodal_config
self.vocab_size = config.text_config.vocab_size
+ self.lora_config = vllm_config.lora_config
self.vision_tower = AutoModel.from_config(config=config.vision_config)
self.audio_tower = AutoModel.from_config(config=config.audio_config)
self.embed_vision = Gemma3nMultimodalEmbedder(
- config.vision_config, config.text_config
+ config.vision_config, config.text_config, self.lora_config
)
self.embed_audio = Gemma3nMultimodalEmbedder(
- config.audio_config, config.text_config
+ config.audio_config, config.text_config, self.lora_config
)
self.language_model: nn.Module = init_vllm_registered_model(
@@ -745,7 +763,7 @@ def get_mm_mapping(self) -> MultiModelKeys:
return MultiModelKeys.from_string_field(
language_model="language_model",
connector="multi_modal_projector",
- tower_model="vision_tower",
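+            # Classify the audio encoder as a tower module alongside the
+            # vision tower.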
+ tower_model=["vision_tower", "audio_tower"],
)
@classmethod