fast_llm/models/multimodal/conversion/llava.py (8 additions, 11 deletions)

@@ -167,7 +167,7 @@ class LlavaVisionAdapterConverter:
     @classmethod
     def import_config(cls, config: dict) -> dict:
         return {
-            "intermediate_size": config["vision_config"]["hidden_size"],
+            "intermediate_size": config["text_config"]["hidden_size"],
             "add_linear_biases": config["multimodal_projector_bias"],
             "gated": False,
             "activation": ActivationType.from_hf_name(config["projector_hidden_act"]),

@@ -183,8 +183,6 @@ def export_config(cls, config: MLPConfig) -> dict:
         return {
             "projector_hidden_act": config.activation.hf_name,
             "multimodal_projector_bias": config.add_linear_biases,
-            # Not in LlavaConfig, but needed for consistency check in LlavaBaseModelConverter.
-            "projector_intermediate_size": config.intermediate_size,
         }

Collaborator: Why remove this? It is essential to ensure compatibility.

Contributor Author: This is to ensure compatibility with what? As stated in the comment, this key is not in LlavaConfig, and it caused issues when trying to load Apriel-1.5: the import would set the default value for this param and then fail in the assertion here: https://github.com/ServiceNow/Fast-LLM/pull/399/files#diff-319643f77a4055995eb8f844aee095266ba3b15fa11f52e16acd89386058e51bL314

Collaborator: The projector intermediate size needs to match the LM hidden size, which is not guaranteed on the Fast-LLM side. The entry is not in the final output; it's there specifically for the assertion in https://github.com/ServiceNow/Fast-LLM/pull/399/files#diff-319643f77a4055995eb8f844aee095266ba3b15fa11f52e16acd89386058e51bL314. A failing assertion points to an actual error elsewhere.

What do you mean by "load Apriel-1.5"? Shouldn't that go through import?

Collaborator: Hmm, I guess this could be due to the bug you fixed above, where the intermediate size was set incorrectly on import?
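
For context, a minimal sketch of the consistency-check pattern this thread is about (simplified stand-ins, not Fast-LLM's actual converter classes): the adapter exporter emits a helper key that is not a real LlavaConfig field, and the base-model exporter pops it again and asserts it equals the text hidden size, so a mismatch surfaces as an assertion failure rather than a silently wrong config.

```python
# Simplified illustration of the pattern; names and structure are stand-ins.
def export_adapter_config(intermediate_size: int) -> dict:
    return {
        "projector_hidden_act": "gelu",
        # Helper entry: not part of LlavaConfig, consumed by the base-model exporter below.
        "projector_intermediate_size": intermediate_size,
    }

def export_base_config(adapter_config: dict, text_hidden_size: int) -> dict:
    out = {**adapter_config, "text_config": {"hidden_size": text_hidden_size}}
    # Consistency check: the projector width must equal the LM hidden size.
    assert out.pop("projector_intermediate_size") == out["text_config"]["hidden_size"]
    return out

# export_base_config(export_adapter_config(4096), 4096)  -> passes
# export_base_config(export_adapter_config(1024), 4096)  -> AssertionError
```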

"projector_intermediate_size": config.intermediate_size,
}

@classmethod
@@ -243,13 +241,13 @@ def export_config(cls, config: VisionEncoderConfig) -> dict:
     def get_converters(cls, config: VisionEncoderConfig) -> list[WeightConverter]:
         return [
             *cls.embeddings_converter_class.get_converters(
-                config.embeddings, "vision_encoder.embeddings", "model.vision_tower"
+                config.embeddings, "vision_encoder.embeddings", "vision_tower"
             ),
             *cls.encoder_converter_class.get_converters(
-                config.encoder, "vision_encoder.encoder", "model.vision_tower.transformer.layers"
+                config.encoder, "vision_encoder.encoder", "vision_tower.transformer.layers"
             ),
             *cls.vision_adapter_converter_class.get_converters(
-                config.adapter, "vision_encoder.adapter", "model.multi_modal_projector"
+                config.adapter, "vision_encoder.adapter", "multi_modal_projector"
             ),
         ]

Collaborator: Why these changes? The current names are required for LlavaForConditionalGeneration and confirmed to work. The model prefix is explicitly needed for LlavaForConditionalGeneration (https://github.com/huggingface/transformers/blob/main/src/transformers/models/llava/modeling_llava.py#L316), and the language model is a MistralModel, which takes no model prefix.

Contributor Author: Hmm, indeed, it's strange.
Without all these changes, we're not able to load https://huggingface.co/ServiceNow-AI/Apriel-1.5-15b-Thinker/tree/main in Fast-LLM. The weights in that model somehow match this different format with language_model.model...

Collaborator (@jlamypoirier, Dec 5, 2025): My understanding is that there are two equivalent ways to see the model. It can either be a LlavaForConditionalGeneration with a MistralModel text model, or a LlavaModel with a MistralForCausalLM. Main exports in the first format, but the dev branch seems to use the second one, though it still uses LlavaForConditionalGeneration as the architecture (maybe _checkpoint_conversion_mapping addresses the mismatch?).

I'd think the first option is more appropriate, but I could be wrong. Maybe we could just support both cases.
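
To make the two layouts concrete, here are the checkpoint key prefixes as they appear in this diff (illustrative only, not an exhaustive key list):

```python
# Layout the converter targeted before this PR (removed lines):
# LlavaForConditionalGeneration wrapping a bare text backbone, with lm_head at the top level.
keys_before = [
    "model.vision_tower...",
    "model.multi_modal_projector...",
    "model.language_model.layers...",
    "model.language_model.norm.weight",
    "lm_head.weight",
]

# Layout this PR switches to (added lines), matching the Apriel-1.5 checkpoint:
# the text model is nested as a full causal-LM, so its lm_head lives under language_model.
keys_after = [
    "vision_tower...",
    "multi_modal_projector...",
    "language_model.model.layers...",
    "language_model.model.norm.weight",
    "language_model.lm_head.weight",
]
```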

Contributor Author: I see. From what I understand, this _checkpoint_conversion_mapping is something they added for backward compatibility. So indeed, I think you're right that the first option is the right one, but our Apriel-1.5 checkpoint uses this older format.
How should we support both cases? Shall we create a new format called llava_legacy or something?

Collaborator: That would work.
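
If a separate llava_legacy format is added, one way to picture it: the legacy checkpoint keys can be rewritten onto the layout main exports with a small prefix map, in the same spirit as transformers' _checkpoint_conversion_mapping. A hedged sketch (the rules below are an assumption derived from the prefixes in this diff, not existing Fast-LLM or transformers code):

```python
import re

# Assumed prefix rewrites from the legacy layout (language_model.model..., no top-level
# "model." prefix) to the layout main exports; derived from the converters in this diff.
LEGACY_TO_MAIN = {
    r"^language_model\.lm_head\.": "lm_head.",
    r"^language_model\.model\.": "model.language_model.",
    r"^vision_tower\.": "model.vision_tower.",
    r"^multi_modal_projector\.": "model.multi_modal_projector.",
}

def remap_legacy_key(key: str) -> str:
    for pattern, replacement in LEGACY_TO_MAIN.items():
        if re.match(pattern, key):
            return re.sub(pattern, replacement, key, count=1)
    return key

# remap_legacy_key("language_model.model.layers.0.self_attn.q_proj.weight")
#   -> "model.language_model.layers.0.self_attn.q_proj.weight"
```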

@@ -266,11 +264,11 @@ def get_converters(
             *cls.normalization_converter_class.get_converters(
                 config.normalization,
                 f"{fast_llm_prefix}.final_norm",
-                f"model.language_model.norm",
+                f"language_model.model.norm",
             ),
             get_parameter_converter(
                 f"{fast_llm_prefix}.output_weights",
-                "lm_head.weight",
+                "language_model.lm_head.weight",
                 drop_on_import=exported_config["tie_word_embeddings"],
             ),
         ]

@@ -309,18 +307,17 @@ def export_config(cls, config: MultiModalBaseModelConfig) -> dict:
                 "vision_feature_layer": -1,
             },
         )
-        Assert.eq(out.pop("projector_intermediate_size"), out["text_config"]["hidden_size"])
         return out

     @classmethod
     def get_converters(cls, config: MultiModalBaseModelConfig, exported_config: dict) -> list[WeightConverter]:
         return [
             *cls.vision_model_converter_class.get_converters(config.vision_encoder),
             *cls.language_model_converter_class.embeddings_converter_class.get_converters(
-                config.embeddings, "embeddings", "model.language_model"
+                config.embeddings, "embeddings", "language_model.model"
             ),
             *cls.language_model_converter_class.decoder_converter_class.get_converters(
-                config.decoder, "decoder", "model.language_model.layers"
+                config.decoder, "decoder", "language_model.model.layers"
             ),
             *cls.language_model_converter_class.head_converter_class.get_converters(
                 config.head, {"tie_word_embeddings": False}, "head"

(second changed file; file name not shown in this excerpt)

@@ -59,7 +59,6 @@ def __init__(
         text_config=None,
         image_token_index=32000,
         projector_hidden_act="gelu",
-        projector_intermediate_size=4096,
         vision_feature_select_strategy="default",
         vision_feature_layer=-2,
         image_seq_length=576,

@@ -68,8 +67,6 @@
     ):
         self.image_token_index = image_token_index
         self.projector_hidden_act = projector_hidden_act
-        # projector_intermediate_size is an addition to the original Llava config
-        self.projector_intermediate_size = projector_intermediate_size
         self.image_seq_length = image_seq_length

         if vision_feature_select_strategy not in ["default", "full"]:

(third changed file; file name not shown in this excerpt)

@@ -22,12 +22,12 @@ def __init__(self, config: LlavaHybridConfig):
         num_feature_layers = 1 if isinstance(config.vision_feature_layer, int) else len(config.vision_feature_layer)
         self.linear_1 = nn.Linear(
             config.vision_config.hidden_size * num_feature_layers,
-            config.projector_intermediate_size,
+            config.text_config.hidden_size,
             bias=config.multimodal_projector_bias,
         )
         self.act = ACT2FN[config.projector_hidden_act]
         self.linear_2 = nn.Linear(
-            config.projector_intermediate_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+            config.text_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
         )

     def forward(self, image_features):