89 changes: 57 additions & 32 deletions convert_hf_to_gguf.py
@@ -81,11 +81,25 @@ class ModelBase:
block_count: int
tensor_map: gguf.TensorNameMap

def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, *, is_big_endian: bool = False,
use_temp_file: bool = False, eager: bool = False,
metadata_override: Path | None = None, model_name: str | None = None,
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None):
def __init__(
self,
dir_model : Path,
ftype : gguf.LlamaFileType,
fname_out : Path,
hf_arch : str,
Collaborator comment: btw, if we update it here, we should also update convert_lora_to_gguf; that's why I think it's better not to add too many input arguments for this class. hf_arch can indeed be implied from hparams, so could we remove it from this list?
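For context, a minimal sketch of how hf_arch could be implied from the already-loaded hparams rather than passed as a constructor argument (assuming the usual HF config.json layout with an "architectures" list; hypothetical, not part of this PR):

# Hypothetical helper, not part of this PR: infer the HF architecture
# from config.json contents instead of threading a new argument through.
def _guess_hf_arch(hparams: dict) -> str | None:
    archs = hparams.get("architectures")
    return archs[0] if archs else None  # e.g. "Qwen2Model" or "Qwen2ForCausalLM"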

*,
is_big_endian : bool = False,
use_temp_file : bool = False,
eager : bool = False,
metadata_override : Path | None = None,
model_name : str | None = None,
split_max_tensors : int = 0,
split_max_size : int = 0,
dry_run : bool = False,
small_first_shard : bool = False,
hparams : dict[str, Any] | None = None,
remote_hf_model_id : str | None = None,
):
if type(self) is ModelBase or \
type(self) is TextModel or \
type(self) is VisionModel:
@@ -94,6 +108,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
self.dir_model = dir_model
self.ftype = ftype
self.fname_out = fname_out
self.hf_arch = hf_arch
self.is_big_endian = is_big_endian
self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
self.use_temp_file = use_temp_file
@@ -1073,6 +1088,32 @@ def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab
if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_EOS)) is not None:
self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0])

def _try_set_pooling_type(self) -> None:
# get pooling path
pooling_path = None
module_path = self.dir_model / "modules.json"
if module_path.is_file():
with open(module_path, encoding="utf-8") as f:
modules = json.load(f)
for mod in modules:
if mod["type"] == "sentence_transformers.models.Pooling":
pooling_path = mod["path"]
break

# get pooling type
if pooling_path is not None:
with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f:
pooling = json.load(f)
if pooling["pooling_mode_mean_tokens"]:
pooling_type = gguf.PoolingType.MEAN
elif pooling["pooling_mode_cls_token"]:
pooling_type = gguf.PoolingType.CLS
elif pooling["pooling_mode_lasttoken"]:
pooling_type = gguf.PoolingType.LAST
else:
raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported")
self.gguf_writer.add_pooling_type(pooling_type)


class VisionModel(ModelBase):
model_arch = gguf.MODEL_ARCH.CLIP_VISION
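The new _try_set_pooling_type helper assumes the standard sentence-transformers layout. An illustrative sketch of the files it reads, with example contents (not taken from this PR):

# Illustrative layout read by _try_set_pooling_type (example values only):
#   <model_dir>/modules.json
#     [{"idx": 1, "name": "1", "path": "1_Pooling",
#       "type": "sentence_transformers.models.Pooling"}, ...]
#   <model_dir>/1_Pooling/config.json
#     {"pooling_mode_cls_token": false,
#      "pooling_mode_mean_tokens": true,
#      "pooling_mode_lasttoken": false}
# With these values the helper would call gguf_writer.add_pooling_type(gguf.PoolingType.MEAN).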
@@ -2538,7 +2579,7 @@ def set_gguf_parameters(self):
self.gguf_writer.add_file_type(self.ftype)


@ModelBase.register("Qwen2ForCausalLM")
@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM")
class Qwen2Model(TextModel):
model_arch = gguf.MODEL_ARCH.QWEN2

@@ -2550,12 +2591,18 @@ def set_vocab(self):

def set_gguf_parameters(self):
super().set_gguf_parameters()
self._try_set_pooling_type()
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
if self.hparams["rope_scaling"].get("type") == "yarn":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if self.hf_arch == "Qwen2Model":
name = f"model.{name}" # map to Qwen2ForCausalLM tensors
yield from super().modify_tensors(data_torch, name, bid)


@ModelBase.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
class Qwen2VLModel(TextModel):
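For illustration (not part of the diff), the renaming that modify_tensors now performs for bare Qwen2Model (embedding-style) checkpoints, whose tensor names lack the model. prefix used by Qwen2ForCausalLM:

# Hypothetical example of the prefixing above:
#   Qwen2Model checkpoint tensor:   "layers.0.self_attn.q_proj.weight"
#   after name = f"model.{name}":   "model.layers.0.self_attn.q_proj.weight"
# which matches the Qwen2ForCausalLM names the existing tensor map already handles.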
@@ -3316,29 +3363,7 @@ def __init__(self, *args, **kwargs):
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_causal_attention(False)

# get pooling path
pooling_path = None
module_path = self.dir_model / "modules.json"
if module_path.is_file():
with open(module_path, encoding="utf-8") as f:
modules = json.load(f)
for mod in modules:
if mod["type"] == "sentence_transformers.models.Pooling":
pooling_path = mod["path"]
break

# get pooling type
if pooling_path is not None:
with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f:
pooling = json.load(f)
if pooling["pooling_mode_mean_tokens"]:
pooling_type = gguf.PoolingType.MEAN
elif pooling["pooling_mode_cls_token"]:
pooling_type = gguf.PoolingType.CLS
else:
raise NotImplementedError("Only MEAN and CLS pooling types supported")
self.gguf_writer.add_pooling_type(pooling_type)
self._try_set_pooling_type()

def set_vocab(self):
tokens, toktypes, tokpre = self.get_vocab_base()
@@ -3533,15 +3558,15 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
class NomicBertModel(BertModel):
model_arch = gguf.MODEL_ARCH.BERT

def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any):
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, hf_arch: str, **kwargs: Any):
hparams = kwargs.pop("hparams", None)
if hparams is None:
hparams = ModelBase.load_hparams(dir_model)

self.is_moe = bool(hparams.get("moe_every_n_layers"))
self.model_arch = gguf.MODEL_ARCH.NOMIC_BERT_MOE if self.is_moe else gguf.MODEL_ARCH.NOMIC_BERT

super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs)
super().__init__(dir_model, ftype, fname_out, hf_arch, hparams=hparams, **kwargs)

self._tokenizer_is_xlmroberta = self._is_tokenizer_xlmroberta()
if self._tokenizer_is_xlmroberta:
@@ -5957,7 +5982,7 @@ def main() -> None:
logger.error(f"Model {model_architecture} is not supported")
sys.exit(1)

model_instance = model_class(dir_model, output_type, fname_out,
model_instance = model_class(dir_model, output_type, fname_out, model_architecture,
is_big_endian=args.bigendian, use_temp_file=args.use_temp_file,
eager=args.no_lazy,
metadata_override=args.metadata, model_name=args.model_name,
2 changes: 2 additions & 0 deletions gguf-py/gguf/constants.py
@@ -2032,6 +2032,8 @@ class PoolingType(IntEnum):
NONE = 0
MEAN = 1
CLS = 2
LAST = 3
RANK = 4


class GGMLQuantizationType(IntEnum):
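A minimal usage sketch for the new enum members (assuming the existing gguf-py writer helper already used in the converter above; illustrative only):

# Sketch, not part of the diff: converters record the pooling mode as an
# integer KV via the existing writer helper; LAST is stored as 3.
gguf_writer.add_pooling_type(gguf.PoolingType.LAST)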
1 change: 1 addition & 0 deletions src/llama-model.cpp
@@ -773,6 +773,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
// fall through
case LLM_ARCH_QWEN2:
{
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;