llava conversion fixes #399
base: main
@@ -167,7 +167,7 @@ class LlavaVisionAdapterConverter:
     @classmethod
     def import_config(cls, config: dict) -> dict:
         return {
-            "intermediate_size": config["vision_config"]["hidden_size"],
+            "intermediate_size": config["text_config"]["hidden_size"],
             "add_linear_biases": config["multimodal_projector_bias"],
             "gated": False,
             "activation": ActivationType.from_hf_name(config["projector_hidden_act"]),
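As context for this hunk (not part of the PR itself): in Hugging Face's Llava, the multimodal projector maps vision-tower features into the language model's hidden space, so its working width follows the text config rather than the vision config. A minimal sketch of that shape relationship, assuming the standard two-layer HF projector and the config keys used in this diff (illustrative only, not the upstream implementation verbatim):

```python
import torch.nn as nn


class LlavaProjectorSketch(nn.Module):
    """Sketch of the Llava multimodal projector: vision_hidden -> text_hidden -> text_hidden."""

    def __init__(self, config: dict):
        super().__init__()
        vision_hidden = config["vision_config"]["hidden_size"]
        text_hidden = config["text_config"]["hidden_size"]
        bias = config["multimodal_projector_bias"]
        # Both the intermediate and the output width are the text (LM) hidden size,
        # which is why import_config reads config["text_config"]["hidden_size"]
        # for "intermediate_size" after this fix.
        self.linear_1 = nn.Linear(vision_hidden, text_hidden, bias=bias)
        self.act = nn.GELU()  # stands in for config["projector_hidden_act"]
        self.linear_2 = nn.Linear(text_hidden, text_hidden, bias=bias)

    def forward(self, image_features):
        return self.linear_2(self.act(self.linear_1(image_features)))
```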
@@ -183,8 +183,6 @@ def export_config(cls, config: MLPConfig) -> dict:
         return {
             "projector_hidden_act": config.activation.hf_name,
             "multimodal_projector_bias": config.add_linear_biases,
-            # Not in LlavaConfig, but needed for consistency check in LlavaBaseModelConverter.
-            "projector_intermediate_size": config.intermediate_size,
         }

     @classmethod
@@ -243,13 +241,13 @@ def export_config(cls, config: VisionEncoderConfig) -> dict:
     def get_converters(cls, config: VisionEncoderConfig) -> list[WeightConverter]:
         return [
             *cls.embeddings_converter_class.get_converters(
-                config.embeddings, "vision_encoder.embeddings", "model.vision_tower"
+                config.embeddings, "vision_encoder.embeddings", "vision_tower"
Collaborator: Why these changes? The current names are required for …

Contributor (author): Hmm indeed, it's strange.

Contributor (author): It must have something to do with …

Collaborator: My understanding is that there are two equivalent ways to see the model. It can either be a … or a …. I'd think the first option is more appropriate, but I could be wrong. Maybe we could just support both cases.

Contributor (author): I see. From what I understand, this …

Collaborator: That would work.
             ),
             *cls.encoder_converter_class.get_converters(
-                config.encoder, "vision_encoder.encoder", "model.vision_tower.transformer.layers"
+                config.encoder, "vision_encoder.encoder", "vision_tower.transformer.layers"
             ),
             *cls.vision_adapter_converter_class.get_converters(
-                config.adapter, "vision_encoder.adapter", "model.multi_modal_projector"
+                config.adapter, "vision_encoder.adapter", "multi_modal_projector"
             ),
         ]
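Picking up the layout question from the inline thread above, here is a side-by-side of the two checkpoint key layouts implied by the old and new prefixes in this hunk and in the language-model hunks below. The angle-bracket tails are placeholders, not literal key names:

```python
# Layout assumed before this PR: everything nested under a top-level "model."
# of the conditional-generation wrapper, with a separate top-level lm_head.
layout_before = [
    "model.vision_tower.<embeddings / transformer.layers.*>",
    "model.multi_modal_projector.<projection layers>",
    "model.language_model.<layers.* / norm>",
    "lm_head.weight",
]

# Layout this PR switches to: vision tower and projector at the top level,
# with the language model wrapping its own inner "model." and its own lm_head.
layout_after = [
    "vision_tower.<embeddings / transformer.layers.*>",
    "multi_modal_projector.<projection layers>",
    "language_model.model.<layers.* / norm>",
    "language_model.lm_head.weight",
]
```

Supporting both layouts, as suggested in the thread, would mean detecting which prefix family a checkpoint uses before building the converters.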
@@ -266,11 +264,11 @@ def get_converters(
             *cls.normalization_converter_class.get_converters(
                 config.normalization,
                 f"{fast_llm_prefix}.final_norm",
-                f"model.language_model.norm",
+                f"language_model.model.norm",
             ),
             get_parameter_converter(
                 f"{fast_llm_prefix}.output_weights",
-                "lm_head.weight",
+                "language_model.lm_head.weight",
                 drop_on_import=exported_config["tie_word_embeddings"],
             ),
         ]
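A note on the drop_on_import flag visible in this hunk: when the exported config sets tie_word_embeddings, the output projection shares its weight with the input embeddings, so there is no independent head tensor to import. A rough, hypothetical illustration of that intent (get_parameter_converter's actual behaviour lives in Fast-LLM; this is not its real code):

```python
def import_head_weight(hf_state_dict: dict, tie_word_embeddings: bool) -> dict:
    """Hypothetical helper showing what drop_on_import means for the head weight."""
    imported = {}
    if not tie_word_embeddings:
        # Untied checkpoints carry a separate output projection to import.
        imported["output_weights"] = hf_state_dict["language_model.lm_head.weight"]
    # Tied checkpoints reuse the embedding matrix, so the key is dropped on import.
    return imported
```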
@@ -309,18 +307,17 @@ def export_config(cls, config: MultiModalBaseModelConfig) -> dict:
                 "vision_feature_layer": -1,
             },
         )
-        Assert.eq(out.pop("projector_intermediate_size"), out["text_config"]["hidden_size"])
         return out

     @classmethod
     def get_converters(cls, config: MultiModalBaseModelConfig, exported_config: dict) -> list[WeightConverter]:
         return [
             *cls.vision_model_converter_class.get_converters(config.vision_encoder),
             *cls.language_model_converter_class.embeddings_converter_class.get_converters(
-                config.embeddings, "embeddings", "model.language_model"
+                config.embeddings, "embeddings", "language_model.model"
             ),
             *cls.language_model_converter_class.decoder_converter_class.get_converters(
-                config.decoder, "decoder", "model.language_model.layers"
+                config.decoder, "decoder", "language_model.model.layers"
             ),
             *cls.language_model_converter_class.head_converter_class.get_converters(
                 config.head, {"tie_word_embeddings": False}, "head"
Collaborator: Why removing? This is essential to ensure compatibility.

Contributor (author): This is to ensure compatibility with what? As stated in the comment, this is not in LlavaConfig. And it caused issues when trying to load Apriel-1.5, where it would set the default value for this param and fail in the assertion here: https://github.com/ServiceNow/Fast-LLM/pull/399/files#diff-319643f77a4055995eb8f844aee095266ba3b15fa11f52e16acd89386058e51bL314

Collaborator: The projector intermediate size needs to match the LM hidden size, which is not guaranteed on the Fast-LLM side. The entry is not in the final output; it's there specifically for the assertion in https://github.com/ServiceNow/Fast-LLM/pull/399/files#diff-319643f77a4055995eb8f844aee095266ba3b15fa11f52e16acd89386058e51bL314. A failing assertion points to an actual error elsewhere. What do you mean by "load Apriel-1.5"? Shouldn't that go through import?

Collaborator: Hmm, I guess this could be due to the bug you fixed above, where the intermediate size was set incorrectly on import?
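To make the exchange above concrete, a small sketch of the failure mode being described, with placeholder sizes (not the real Apriel-1.5 values):

```python
# Hypothetical HF config where the vision and text widths differ.
hf_config = {
    "vision_config": {"hidden_size": 1024},
    "text_config": {"hidden_size": 4096},
}

# Before this PR, import_config took the projector intermediate size from the
# vision tower, so a later export no longer matched the text hidden size and
# the Assert.eq consistency check removed in the hunk above would fail.
old_intermediate = hf_config["vision_config"]["hidden_size"]
print(old_intermediate == hf_config["text_config"]["hidden_size"])  # False -> assertion failure

# With the import fix at the top of this PR, the size follows the text config,
# so the round trip stays consistent whether or not the check is kept.
new_intermediate = hf_config["text_config"]["hidden_size"]
print(new_intermediate == hf_config["text_config"]["hidden_size"])  # True
```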