convert : experimental support for --mmproj flag #13023

Changes from 1 commit (d5e03e6)
@@ -67,14 +67,20 @@ class Model:
     dir_model_card: Path
     remote_hf_model_id: str | None

+    # for vision encoders
+    mmproj: bool
+    ignore_vision: bool = False  # subclasses may overwrite this
+    mtmd_model: MultimodalModel | None = None
+
     # subclasses should define this!
     model_arch: gguf.MODEL_ARCH

     def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
                  use_temp_file: bool = False, eager: bool = False,
                  metadata_override: Path | None = None, model_name: str | None = None,
                  split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
-                 small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None):
+                 small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None,
+                 mmproj: bool = False):
         if type(self) is Model:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
@@ -109,6 +115,7 @@ def get_remote_tensors() -> Iterator[tuple[str, Tensor]]:
         self.metadata_override = metadata_override
         self.model_name = model_name
         self.dir_model_card = dir_model  # overridden in convert_lora_to_gguf.py
+        self.mmproj = mmproj

         # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
         if self.ftype == gguf.LlamaFileType.GUESSED:
@@ -125,6 +132,28 @@ def get_remote_tensors() -> Iterator[tuple[str, Tensor]]:
         self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
                                            split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard)

+        # vision encoder
+        if mmproj:
+            vision_hparams = self.hparams.get("vision_config")
+            if vision_hparams is None:
+                raise ValueError("Vision config not found in model config")
+            elif self.ignore_vision:
+                raise ValueError("Vision config found, but mmproj conversion for this model is not supported yet")
+            else:
+                self.mtmd_model = MultimodalModel(
+                    hparams=vision_hparams,
+                    ftype=self.ftype,
+                    fname_out=self.fname_out,
+                    endianess=self.endianess,
+                    use_temp_file=self.use_temp_file,
+                )
+
+    @classmethod
+    def add_prefix_to_filename(cls, path: Path, prefix: str) -> Path:
+        stem, suffix = path.stem, path.suffix
+        new_name = f"{prefix}{stem}{suffix}"
+        return path.with_name(new_name)
+
     @classmethod
     def __init_subclass__(cls):
         # can't use an abstract property, because overriding it without type errors
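The new add_prefix_to_filename helper is small enough to check in isolation. A runnable sketch of its behavior follows; the "mmproj-" prefix is an assumption about the eventual call site, which this diff does not show:

    from pathlib import Path

    # Standalone copy of the helper above, for illustration only.
    def add_prefix_to_filename(path: Path, prefix: str) -> Path:
        stem, suffix = path.stem, path.suffix
        return path.with_name(f"{prefix}{stem}{suffix}")

    # Hypothetical use: derive the mmproj output name from the text-model name.
    print(add_prefix_to_filename(Path("models/gemma-f16.gguf"), "mmproj-"))
    # -> models/mmproj-gemma-f16.gguf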
@@ -272,8 +301,13 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_key_length(head_dim)
             self.gguf_writer.add_value_length(head_dim)

-        self.gguf_writer.add_file_type(self.ftype)
-        logger.info(f"gguf: file type = {self.ftype}")
+        if not self.mmproj:
+            self.gguf_writer.add_file_type(self.ftype)
+            logger.info(f"gguf: file type = {self.ftype}")
+        else:
+            assert self.mtmd_model is not None
+            self.mtmd_model.set_gguf_parameters(n_embd_text=n_embd)
+            logger.info(f"mmproj: file type = {self.mtmd_model.ftype}")

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
@@ -311,6 +345,10 @@ def prepare_tensors(self):
                     break

             for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)):
+                # skip adding tensor if we're working with a vision model
+                if self.mmproj:
+                    continue
+
                 # TODO: why do we squeeze here?
                 # data = data_torch.squeeze().numpy()
                 data = data_torch.numpy()
@@ -455,12 +493,18 @@ def prepare_metadata(self, vocab_only: bool):
         self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)

     def write(self):
-        self.prepare_tensors()
-        self.prepare_metadata(vocab_only=False)
-        self.gguf_writer.write_header_to_file(path=self.fname_out)
-        self.gguf_writer.write_kv_data_to_file()
-        self.gguf_writer.write_tensors_to_file(progress=True)
-        self.gguf_writer.close()
+        if self.mtmd_model is not None:
+            self.prepare_tensors()
+            self.prepare_metadata(vocab_only=False)
+            logger.info("Writing vision model")
+            self.mtmd_model.write()
+        else:
+            self.prepare_tensors()
+            self.prepare_metadata(vocab_only=False)
+            self.gguf_writer.write_header_to_file(path=self.fname_out)
+            self.gguf_writer.write_kv_data_to_file()
+            self.gguf_writer.write_tensors_to_file(progress=True)
+            self.gguf_writer.close()

     def write_vocab(self):
         if len(self.gguf_writer.tensors) != 1:
@@ -485,7 +529,10 @@ def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]
     @staticmethod
     def load_hparams(dir_model: Path):
         with open(dir_model / "config.json", "r", encoding="utf-8") as f:
-            return json.load(f)
+            hparams = json.load(f)
+        if "text_config" in hparams:
+            hparams = {**hparams, **hparams["text_config"]}
+        return hparams

     @classmethod
     def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
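The text_config merge above flattens nested multimodal HF configs so that text-model hyperparameters become reachable at the top level, letting existing text-model code read them unchanged. A small runnable illustration with made-up values:

    # Made-up config, illustrating the flattening done by load_hparams above.
    hparams = {
        "model_type": "llava",
        "vision_config": {"hidden_size": 1024},
        "text_config": {"hidden_size": 4096, "num_hidden_layers": 32},
    }
    if "text_config" in hparams:
        hparams = {**hparams, **hparams["text_config"]}
    assert hparams["hidden_size"] == 4096                   # text value now at top level
    assert hparams["vision_config"]["hidden_size"] == 1024  # still nested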
@@ -1024,6 +1071,101 @@ def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab
         self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0])


+# for converting mmproj file
+class MultimodalModel:
+    hparams: dict
+    dir_model: Path
+    ftype: gguf.LlamaFileType
+    fname_out: Path
+    tensor_map: gguf.TensorNameMap
+    gguf_writer: gguf.GGUFWriter
+
+    def __init__(self, hparams: dict, ftype: gguf.LlamaFileType, fname_out: Path, endianess: gguf.GGUFEndian, use_temp_file: bool):
+        self.hparams = hparams
+        self.ftype = ftype
+        self.fname_out = fname_out
+        self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.CLIP_VISION, 128)
+        self.gguf_writer = gguf.GGUFWriter(path=None,
+                                           arch="clip",
+                                           endianess=endianess,
+                                           use_temp_file=use_temp_file)
+
+    def set_gguf_parameters(self, n_embd_text: int):
+        """Function to be called by Model.set_gguf_parameters()"""
+        self.gguf_writer.add_type(gguf.GGUFType.CLIP_VISION)
+        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.PROJECTION_DIM, n_embd_text)
+        self.gguf_writer.add_bool(gguf.Keys.ClipVision.HAS_VISION_ENCODER, True)
+
+        # vision config
+        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.IMAGE_SIZE, self.find_hparam(["image_size"]))
+        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.PATCH_SIZE, self.find_hparam(["patch_size"]))
+        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.EMBEDDING_LENGTH, self.find_hparam(["hidden_size"]))
+        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.FEED_FORWARD_LENGTH, self.find_hparam(["intermediate_size"]))
+        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.BLOCK_COUNT, self.find_hparam(["num_hidden_layers"]))
+        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.Attention.HEAD_COUNT, self.find_hparam(["num_attention_heads"]))

Comment on lines +1090 to +1095 (Collaborator, Author): Note that I didn't add …

+    def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any:
+        key = next((k for k in keys if k in self.hparams), None)
+        if key is not None:
+            return self.hparams[key]
+        if optional:
+            return None
+        raise KeyError(f"could not find any of: {keys}")
+
+    def get_quantization(self, mapped_name: str, data_torch: Tensor) -> gguf.GGMLQuantizationType:
+        is_1d = len(data_torch.shape) == 1
+        is_embd = "_embd" in mapped_name
+        can_quantize = not is_1d and not is_embd
+        data_qtype = gguf.GGMLQuantizationType.F32
+        if can_quantize:
+            if self.ftype == gguf.LlamaFileType.ALL_F32:
+                data_qtype = gguf.GGMLQuantizationType.F32
+            elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
+                data_qtype = gguf.GGMLQuantizationType.F16
+            elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
+                data_qtype = gguf.GGMLQuantizationType.BF16
+            elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
+                data_qtype = gguf.GGMLQuantizationType.Q8_0
+            else:
+                raise ValueError(f"Unsupported file type: {self.ftype}")
+        return data_qtype
+
+    def add_tensor(self, original_name: str, data_torch: Tensor) -> None:
+        """Function to be called inside Model.modify_tensors()"""
+        # name mapping
+        new_name = self.tensor_map.get_name(key=original_name, try_suffixes=(".weight", ".bias"))
+        if new_name is None:
+            raise ValueError(f"Can not map tensor {original_name!r}")
+
+        # process data
+        # old_dtype = data_torch.dtype
+        data_qtype = self.get_quantization(new_name, data_torch)
+        data = data_torch.numpy()
+        try:
+            data = gguf.quants.quantize(data, data_qtype)
+        except Exception as e:
+            logger.error(f"Error quantizing tensor '{new_name}': {e}, fallback to F16")
+            data_qtype = gguf.GGMLQuantizationType.F16
+            data = gguf.quants.quantize(data, data_qtype)
+
+        # reverse shape to make it similar to the internal ggml dimension order
+        # TODO: we don't print old_dtype because it's not correct, to be fixed later
+        old_dtype = ""
+        shape_str = f"{{{', '.join(str(n) for n in reversed(data_torch.shape))}}}"
+        logger.info(f"{f'%-32s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
+
+        # add tensor
+        self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype)
+
+    def write(self):
+        """Function to be called by Model.write()"""
+        self.gguf_writer.write_header_to_file(path=self.fname_out)
+        self.gguf_writer.write_kv_data_to_file()
+        self.gguf_writer.write_tensors_to_file(progress=True)
+        self.gguf_writer.close()
+
+
 @Model.register("GPTNeoXForCausalLM")
 class GPTNeoXModel(Model):
     model_arch = gguf.MODEL_ARCH.GPTNEOX
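Per the docstrings, MultimodalModel.add_tensor is meant to be called from inside a subclass's modify_tensors, while prepare_tensors skips the returned text-side tensors in mmproj mode. A hypothetical sketch of that calling pattern; the subclass and the "vision_tower." prefix are illustrative, not from this diff:

    # Hypothetical subclass sketch; assumes the surrounding
    # convert_hf_to_gguf.py definitions (Model, self.mtmd_model, ...).
    class MyVLMModel(Model):
        def modify_tensors(self, data_torch, name, bid):
            del bid  # unused in this sketch
            if self.mtmd_model is not None and name.startswith("vision_tower."):
                # routed to the mmproj GGUF, written later by self.mtmd_model.write()
                self.mtmd_model.add_tensor(name, data_torch)
                return []  # nothing for the text-model writer
            return [(self.map_tensor_name(name), data_torch)]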
@@ -1781,20 +1923,13 @@ def prepare_tensors(self):
 @Model.register("Llama4ForConditionalGeneration")
 class Llama4Model(LlamaModel):
     model_arch = gguf.MODEL_ARCH.LLAMA4
-    has_vision: bool = False
     undo_permute = False
+    ignore_vision = True
Unrelated to the conversation, but Llama 4 vision support also seems to be low-hanging fruit. They no longer use cross-attn like in Llama 3; here it's just simple embeddings passed from encoder to decoder, so it would also be a nice thing to try out.

(Noting here so I remember)
@compilade currently, the GGUFWriter for the mmproj file is wrapped inside MultimodalModel. The consequence is that MultimodalModel is now an attribute of Model.

Another way is to make MultimodalModel inherit Model, but this seems a bit complicated to think about. Not sure which way you prefer?
This means that when self.mmproj is true, self.gguf_writer is unused (but still created (!)), and another GGUFWriter is created somewhere in self.mtmd_model. It works because the output files are no longer opened/created as soon as the GGUFWriter is instantiated, since #7827 (but there are still some unnecessary metadata keys set and then ignored). There's probably some way to simplify this.

What seems to be needed (eventually, to make this cleaner) is some more general abstraction to convert submodels (unless I'm misunderstanding the problem).

A submodel is part of a model, and a model is one or more submodels. Not quite sure how that should interact with model architectures, though. Each submodel could have its own architecture and tensor mappings, but I don't know what the main model architecture would be (the first submodel? a meta-model? or maybe there doesn't need to be a main one).

Since model loading doesn't quite support sub-models yet (we'll need to figure out namespaces or other ideas from #13028), only one submodel can be exported at a time, but at least conceptually it might be simpler to adapt such an abstraction to actually include multiple submodels in a single GGUF file once we've figured that out.

I think I prefer the way you currently did it for now, because you're right that Model does a lot, and refactoring multimodal support will be simpler by duplicating some parts of Model in a smaller class like MultimodalModel until we figure out something cleaner. (I also don't know how MultimodalModel could cleanly subclass Model in this case.)
That's some interesting questions. What I'm thinking is:

- Each submodel is loaded into its own structure, like llama_model or clip_model, which only loads the tensors that it needs.
- In libllama, a model is distinguished by its model arch, so currently each submodel has one submodel.arch. But this can be tricky in the case of models for clip.cpp, which does not care about arch (the equivalent is the notion of "projector type").

So from my POV above, what I'm thinking is that a submodel is just a Model with a custom list of tensors and metadata. One idea could be (sketched below):

- a base Model that provides some basic functions like reading safetensors, the GGUFWriter, etc.
- a TextModel that inherits Model
- a MultimodalModel that inherits Model

Please note that the current PR and the mentioned issue are not quite related atm. The main problem is that mmproj is currently not supported by libllama, so it's currently not possible to bundle mmproj + LLM.

My current PR is made mostly for 2 purposes:

- allow getting the mmproj file directly from convert_hf_to_gguf
- replace the conversion scripts under examples/llava, since some of them are very hacky and it's better to just abandon them
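A minimal sketch of that proposed hierarchy; the class names follow the comment above, but the method lists and bodies are illustrative assumptions, not code from this PR:

    # Illustrative sketch of the proposed split, not actual PR code.
    from pathlib import Path

    class Model:
        """Shared plumbing: reads safetensors, owns a GGUFWriter, writes GGUF."""
        def __init__(self, dir_model: Path, fname_out: Path):
            self.dir_model = dir_model
            self.fname_out = fname_out

        def get_tensors(self):
            ...  # iterate over the safetensors shards

        def write(self):
            ...  # serialize metadata + tensors through the GGUFWriter

    class TextModel(Model):
        """What today's text-model subclasses (LlamaModel, GPTNeoXModel, ...) would extend."""

    class MultimodalModel(Model):
        """Converts only the vision encoder / projector tensors (the mmproj file)."""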
Ok so I ended up doing this, and it seems to be more generic (while being less hacky at the same time); please have a look at this commit: ddd7920

The main idea is to have VisionModel and TextModel both inherit the Model super class, and existing text models inherit TextModel (hence why you see many LOC changed in the commit, but most of them are just changing Model --> TextModel).

Btw, it would be nice if we can finalize this during the week, so I can go ahead and add SmolVLM support. The clip.cpp implementation should be very straightforward; the only thing blocking me is that the current mmproj conversion script is a nightmare to work with 😂 So it would be nice if we can finally use convert_hf_to_gguf to get the mmproj.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right, this does feel much better, especially with how quant types are overridden in the intended way.
LoraModelwill likely need adaptation, though. Not sure if it should be based onTextModelorModelstill. (Does it make sense to have LoRA adapters of mmproj?)There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I haven't seen anyone doing this, so I guess it doesn't make sense practically. In most (if not all) cases, people interested in doing LoRA for text model because it's easier to prepare the dataset.
And since
LoraModelusingModel.from_model_architecturewhich returns theTextModelsubclass by default, I think it will continue to work as-is. Can you think of any cases which need to be adapted?