diff --git a/src/transformers/integrations/__init__.py b/src/transformers/integrations/__init__.py
index e0149decde31..49dbc5e3ad90 100755
--- a/src/transformers/integrations/__init__.py
+++ b/src/transformers/integrations/__init__.py
@@ -57,7 +57,6 @@
     "fsdp": ["is_fsdp_managed_module"],
     "ggml": [
         "GGUF_CONFIG_MAPPING",
-        "GGUF_TENSOR_MAPPING",
         "GGUF_TOKENIZER_MAPPING",
         "_gguf_parse_value",
         "load_dequant_gguf_tensor",
@@ -161,7 +160,6 @@
     from .fsdp import is_fsdp_managed_module
     from .ggml import (
         GGUF_CONFIG_MAPPING,
-        GGUF_TENSOR_MAPPING,
         GGUF_TOKENIZER_MAPPING,
         _gguf_parse_value,
         load_dequant_gguf_tensor,
diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py
index 6bb939e64592..e88071b6a02e 100644
--- a/src/transformers/integrations/ggml.py
+++ b/src/transformers/integrations/ggml.py
@@ -33,254 +33,6 @@
 logger = logging.get_logger(__name__)
 
 
-GGUF_TENSOR_MAPPING = {
-    "llama": {
-        "token_embd": "model.embed_tokens",
-        "blk": "model.layers",
-        "ffn_up": "mlp.up_proj",
-        "ffn_down": "mlp.down_proj",
-        "ffn_gate": "mlp.gate_proj",
-        "ffn_norm": "post_attention_layernorm",
-        "attn_norm": "input_layernorm",
-        "attn_q": "self_attn.q_proj",
-        "attn_v": "self_attn.v_proj",
-        "attn_k": "self_attn.k_proj",
-        "attn_output": "self_attn.o_proj",
-        "output.weight": "lm_head.weight",
-        "output_norm": "model.norm",
-    },
-    "mistral": {
-        "token_embd": "model.embed_tokens",
-        "blk": "model.layers",
-        "ffn_up": "mlp.up_proj",
-        "ffn_down": "mlp.down_proj",
-        "ffn_gate": "mlp.gate_proj",
-        "ffn_norm": "post_attention_layernorm",
-        "attn_norm": "input_layernorm",
-        "attn_q": "self_attn.q_proj",
-        "attn_v": "self_attn.v_proj",
-        "attn_k": "self_attn.k_proj",
-        "attn_output": "self_attn.o_proj",
-        "output.weight": "lm_head.weight",
-        "output_norm": "model.norm",
-    },
-    "qwen2": {
-        "token_embd": "model.embed_tokens",
-        "blk": "model.layers",
-        "ffn_up": "mlp.up_proj",
-        "ffn_down": "mlp.down_proj",
-        "ffn_gate": "mlp.gate_proj",
-        "ffn_norm": "post_attention_layernorm",
-        "attn_norm": "input_layernorm",
-        "attn_q": "self_attn.q_proj",
-        "attn_v": "self_attn.v_proj",
-        "attn_k": "self_attn.k_proj",
-        "attn_output": "self_attn.o_proj",
-        "output.weight": "lm_head.weight",
-        "output_norm": "model.norm",
-    },
-    "qwen2moe": {
-        "token_embd": "model.embed_tokens",
-        "blk": "model.layers",
-        "ffn_up_exps": "mlp.experts",
-        "ffn_up_shexp": "mlp.shared_expert.up_proj",
-        "ffn_down_exps": "mlp.experts",
-        "ffn_down_shexp": "mlp.shared_expert.down_proj",
-        "ffn_norm": "post_attention_layernorm",
-        "ffn_gate_inp.weight": "mlp.gate.weight",
-        "ffn_gate_exps": "mlp.experts",
-        "ffn_gate_shexp": "mlp.shared_expert.gate_proj",
-        "ffn_gate_inp_shexp": "mlp.shared_expert_gate",
-        "attn_norm": "input_layernorm",
-        "attn_q": "self_attn.q_proj",
-        "attn_v": "self_attn.v_proj",
-        "attn_k": "self_attn.k_proj",
-        "attn_output": "self_attn.o_proj",
-        "output.weight": "lm_head.weight",
-        "output_norm": "model.norm",
-    },
-    "phi3": {
-        "token_embd": "model.embed_tokens",
-        "blk": "model.layers",
-        "ffn_up": "mlp.gate_up_proj",
-        "ffn_down": "mlp.down_proj",
-        "ffn_gate": "mlp.gate_up_proj",
-        "ffn_norm": "post_attention_layernorm",
-        "attn_norm": "input_layernorm",
-        "attn_qkv": "self_attn.qkv_proj",
-        "attn_output": "self_attn.o_proj",
-        "output.weight": "lm_head.weight",
-        "output_norm": "model.norm",
-    },
-    "bloom": {
-        "token_embd.weight": "transformer.word_embeddings.weight",
-        "token_embd_norm": "transformer.word_embeddings_layernorm",
-        "blk": "transformer.h",
-        "ffn_up": "mlp.dense_h_to_4h",
-        "ffn_down": "mlp.dense_4h_to_h",
-        "ffn_norm": "post_attention_layernorm",
-        "attn_norm": "input_layernorm",
-        "attn_qkv": "self_attention.query_key_value",
-        "attn_output": "self_attention.dense",
-        "output.weight": "lm_head.weight",
-        "output_norm": "transformer.ln_f",
-    },
-    "falcon7b": {
-        "token_embd": "word_embeddings",
-        "blk": "h",
-        "ffn_up": "mlp.dense_h_to_4h",
-        "ffn_down": "mlp.dense_4h_to_h",
-        "attn_norm": "input_layernorm",
-        "attn_qkv": "self_attention.query_key_value",
-        "attn_output": "self_attention.dense",
-        ".output.": ".lm_head.",
-        "output_norm": "ln_f",
-    },
-    "falcon40b": {
-        "token_embd": "word_embeddings",
-        "blk": "h",
-        "ffn_up": "mlp.dense_h_to_4h",
-        "ffn_down": "mlp.dense_4h_to_h",
-        ".attn_norm.": ".ln_mlp.",
-        "attn_norm_2": "ln_attn",
-        "attn_qkv": "self_attention.query_key_value",
-        "attn_output": "self_attention.dense",
-        ".output.": ".lm_head.",
-        "output_norm": "ln_f",
-    },
-    "t5": {
-        "token_embd": "shared",
-        "dec.blk.{bid}.attn_q": "decoder.block.{bid}.layer.0.SelfAttention.q",
-        "dec.blk.{bid}.attn_k": "decoder.block.{bid}.layer.0.SelfAttention.k",
-        "dec.blk.{bid}.attn_v": "decoder.block.{bid}.layer.0.SelfAttention.v",
-        "dec.blk.{bid}.attn_o": "decoder.block.{bid}.layer.0.SelfAttention.o",
-        "dec.blk.{bid}.attn_rel_b": "decoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias",
-        "dec.blk.{bid}.attn_norm": "decoder.block.{bid}.layer.0.layer_norm",
-        "dec.blk.{bid}.cross_attn_q": "decoder.block.{bid}.layer.1.EncDecAttention.q",
-        "dec.blk.{bid}.cross_attn_k": "decoder.block.{bid}.layer.1.EncDecAttention.k",
-        "dec.blk.{bid}.cross_attn_v": "decoder.block.{bid}.layer.1.EncDecAttention.v",
-        "dec.blk.{bid}.cross_attn_o": "decoder.block.{bid}.layer.1.EncDecAttention.o",
-        "dec.blk.{bid}.cross_attn_norm": "decoder.block.{bid}.layer.1.layer_norm",
-        "dec.blk.{bid}.ffn_gate": "decoder.block.{bid}.layer.2.DenseReluDense.wi_0",
-        "dec.blk.{bid}.ffn_up": "decoder.block.{bid}.layer.2.DenseReluDense.wi_1",
-        "dec.blk.{bid}.ffn_down": "decoder.block.{bid}.layer.2.DenseReluDense.wo",
-        "dec.blk.{bid}.ffn_norm": "decoder.block.{bid}.layer.2.layer_norm",
-        "dec.output_norm": "decoder.final_layer_norm",
-        "enc.blk.{bid}.attn_q": "encoder.block.{bid}.layer.0.SelfAttention.q",
-        "enc.blk.{bid}.attn_k": "encoder.block.{bid}.layer.0.SelfAttention.k",
-        "enc.blk.{bid}.attn_v": "encoder.block.{bid}.layer.0.SelfAttention.v",
-        "enc.blk.{bid}.attn_o": "encoder.block.{bid}.layer.0.SelfAttention.o",
-        "enc.blk.{bid}.attn_rel_b": "encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias",
-        "enc.blk.{bid}.attn_norm": "encoder.block.{bid}.layer.0.layer_norm",
-        "enc.blk.{bid}.ffn_gate": "encoder.block.{bid}.layer.1.DenseReluDense.wi_0",
-        "enc.blk.{bid}.ffn_up": "encoder.block.{bid}.layer.1.DenseReluDense.wi_1",
-        "enc.blk.{bid}.ffn_down": "encoder.block.{bid}.layer.1.DenseReluDense.wo",
-        "enc.blk.{bid}.ffn_norm": "encoder.block.{bid}.layer.1.layer_norm",
-        "enc.output_norm": "encoder.final_layer_norm",
-        "output.weight": "lm_head.weight",
-    },
-    "t5encoder": {
-        "token_embd": "shared",
-        "enc.blk.{bid}.attn_q": "encoder.block.{bid}.layer.0.SelfAttention.q",
-        "enc.blk.{bid}.attn_k": "encoder.block.{bid}.layer.0.SelfAttention.k",
-        "enc.blk.{bid}.attn_v": "encoder.block.{bid}.layer.0.SelfAttention.v",
-        "enc.blk.{bid}.attn_o": "encoder.block.{bid}.layer.0.SelfAttention.o",
-        "enc.blk.{bid}.attn_rel_b": "encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias",
-        "enc.blk.{bid}.attn_norm": "encoder.block.{bid}.layer.0.layer_norm",
-        "enc.blk.{bid}.ffn_gate": "encoder.block.{bid}.layer.1.DenseReluDense.wi_0",
-        "enc.blk.{bid}.ffn_up": "encoder.block.{bid}.layer.1.DenseReluDense.wi_1",
-        "enc.blk.{bid}.ffn_down": "encoder.block.{bid}.layer.1.DenseReluDense.wo",
-        "enc.blk.{bid}.ffn_norm": "encoder.block.{bid}.layer.1.layer_norm",
-        "enc.output_norm": "encoder.final_layer_norm",
-    },
-    "stablelm": {
-        "token_embd": "model.embed_tokens",
-        "blk": "model.layers",
-        "ffn_up": "mlp.up_proj",
-        "ffn_down": "mlp.down_proj",
-        "ffn_gate": "mlp.gate_proj",
-        "ffn_norm": "post_attention_layernorm",
-        "attn_norm": "input_layernorm",
-        "attn_q": "self_attn.q_proj",
-        "attn_v": "self_attn.v_proj",
-        "attn_k": "self_attn.k_proj",
-        "attn_output": "self_attn.o_proj",
-        "output.weight": "lm_head.weight",
-        "output_norm": "model.norm",
-    },
-    "gpt2": {
-        "token_embd": "transformer.wte",
-        "blk": "transformer.h",
-        "position_embd": "transformer.wpe",
-        "output_norm": "transformer.ln_f",
-        "attn_norm": "ln_1",
-        "attn_qkv": "attn.c_attn",
-        "attn_output.weight": "attn.c_proj.weight",
-        "attn_output.bias": "attn.c_proj.bias",
-        "ffn_norm": "ln_2",
-        "ffn_up": "mlp.c_fc",
-        "ffn_down": "mlp.c_proj",
-    },
-    "starcoder2": {
-        "token_embd": "model.embed_tokens",
-        "blk": "model.layers",
-        "ffn_up": "mlp.c_fc",
-        "ffn_down": "mlp.c_proj",
-        "ffn_norm": "post_attention_layernorm",
-        "attn_norm": "input_layernorm",
-        "attn_q": "self_attn.q_proj",
-        "attn_v": "self_attn.v_proj",
-        "attn_k": "self_attn.k_proj",
-        "attn_output": "self_attn.o_proj",
-        "output.weight": "lm_head.weight",
-        "output_norm": "model.norm",
-    },
-    "mamba": {
-        "token_embd": "backbone.embeddings",
-        "blk": "backbone.layers",
-        "ssm_a": "mixer.A_log",
-        "ssm_conv1d": "mixer.conv1d",
-        "ssm_in": "mixer.in_proj",
-        "ssm_out": "mixer.out_proj",
-        "ssm_x": "mixer.x_proj",
-        "ssm_dt": "mixer.dt_proj",
-        "attn_norm": "norm",
-        "output_norm": "backbone.norm_f",
-        "output.weight": "lm_head.weight",
-    },
-    "nemotron": {
-        "token_embd": "model.embed_tokens",
-        "blk": "model.layers",
-        "ffn_up": "mlp.up_proj",
-        "ffn_down": "mlp.down_proj",
-        "ffn_norm": "post_attention_layernorm",
-        "attn_norm": "input_layernorm",
-        "attn_q": "self_attn.q_proj",
-        "attn_v": "self_attn.v_proj",
-        "attn_k": "self_attn.k_proj",
-        "attn_output": "self_attn.o_proj",
-        "output.weight": "lm_head.weight",
-        "output_norm": "model.norm",
-    },
-    "gemma2": {
-        "token_embd": "model.embed_tokens",
-        "blk": "model.layers",
-        "ffn_up": "mlp.up_proj",
-        "ffn_down": "mlp.down_proj",
-        "ffn_gate": "mlp.gate_proj",
-        "ffn_norm": "pre_feedforward_layernorm",
-        "post_attention_norm": "post_attention_layernorm",
-        "post_ffw_norm": "post_feedforward_layernorm",
-        "attn_norm": "input_layernorm",
-        "attn_q": "self_attn.q_proj",
-        "attn_v": "self_attn.v_proj",
-        "attn_k": "self_attn.k_proj",
-        "attn_output": "self_attn.o_proj",
-        "output_norm": "model.norm",
-    },
-}
-
-
 GGUF_CONFIG_MAPPING = {
     "general": {
         "architecture": "model_type",
diff --git a/src/transformers/modeling_gguf_pytorch_utils.py b/src/transformers/modeling_gguf_pytorch_utils.py
index 5565fb152bc3..9b20c1b61226 100644
--- a/src/transformers/modeling_gguf_pytorch_utils.py
+++ b/src/transformers/modeling_gguf_pytorch_utils.py
@@ -22,7 +22,6 @@
 
 from .integrations import (
     GGUF_CONFIG_MAPPING,
-    GGUF_TENSOR_MAPPING,
     GGUF_TOKENIZER_MAPPING,
     _gguf_parse_value,
 )
@@ -47,12 +46,11 @@
         "general": {"file_type": "file_type", "quantization_version": "quantization_version"},
     },
     "config": GGUF_CONFIG_MAPPING,
-    "tensors": GGUF_TENSOR_MAPPING,
     "tokenizer": {"tokenizer": GGUF_TOKENIZER_MAPPING["tokenizer"]},
GGUF_TOKENIZER_MAPPING["tokenizer"]}, "tokenizer_config": {"tokenizer": GGUF_TOKENIZER_MAPPING["tokenizer_config"]}, } -GGUF_SUPPORTED_ARCHITECTURES = list(GGUF_TO_TRANSFORMERS_MAPPING["tensors"].keys()) +GGUF_SUPPORTED_ARCHITECTURES = list(GGUF_TO_TRANSFORMERS_MAPPING["config"].keys()) class GGUFTensor(NamedTuple): @@ -121,21 +119,10 @@ def _split_moe_expert_tensor( ): # Original merge implementation # https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L1994-L2022 - exp_name = "" - if "ffn_gate_exps" in name: - exp_name = "gate_proj" - elif "ffn_down_exps" in name: - exp_name = "down_proj" - elif "ffn_up_exps" in name: - exp_name = "up_proj" - else: - raise ValueError(f"Cannot map expert tensor {name} in Qwen2Moe architecture.") - for tensor_name in tensor_key_mapping: - if tensor_name in name: - name = name.replace(tensor_name, tensor_key_mapping[tensor_name]) + name = tensor_key_mapping[name] w_counter = self.config.get("num_experts", 60) for i in range(0, w_counter): - temp_name = name.replace(".weight", f".{i}.{exp_name}.weight") + temp_name = name.replace("mlp.experts.", f"mlp.experts.{i}.") exp_weight = weights[i] parsed_parameters["tensors"][temp_name] = torch.from_numpy(np.copy(exp_weight)) @@ -223,10 +210,6 @@ def __init__(self, config=None): super().__init__(config=config) def process(self, weights, name, **kwargs): - if "ssm_d" in name and "bias" not in name and "weight" not in name: - # ssm_d has conflicts with ssm_dt in name checking - # we have to explicitly check that name is exactly ssm_d - name = name.replace("ssm_d", "mixer.D") if "ssm_conv1d.weight" in name: # for compatibility tensor ssm_conv1d must be (5120, 1, 4]) dim, # quantized one is (5120, 4) @@ -267,7 +250,84 @@ def read_field(reader, field): return [_gguf_parse_value(value.parts[_data_index], value.types) for _data_index in value.data] -def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): +# modified from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/model_loader/loader.py#L1115-L1147 +def get_gguf_hf_weights_map( + hf_model, + model_type: Optional[str] = None, + num_layers: Optional[int] = None, + qual_name: str = "", +): + """ + GGUF uses this naming convention for their tensors from HF checkpoint: + `blk.N.BB.weight` and `blk.N.BB.bias` + where N signifies the block number of a layer, and BB signifies the + attention/mlp layer components. + See "Standardized tensor names" in + https://github.com/ggerganov/ggml/blob/master/docs/gguf.md for details. + """ + if is_gguf_available() and is_torch_available(): + from gguf import MODEL_ARCH_NAMES, get_tensor_name_map + else: + logger.error( + "Loading a GGUF checkpoint in PyTorch, requires both PyTorch and GGUF>=0.10.0 to be installed. Please see " + "https://pytorch.org/ and https://github.com/ggerganov/llama.cpp/tree/master/gguf-py for installation instructions." + ) + raise ImportError("Please install torch and gguf>=0.10.0 to load a GGUF checkpoint in PyTorch.") + + model_type = hf_model.config.model_type if model_type is None else model_type + num_layers = hf_model.config.num_hidden_layers if num_layers is None else num_layers + # hack: ggufs have a different name for cohere + if model_type == "cohere": + model_type = "command-r" + if model_type == "qwen2_moe": + model_type = "qwen2moe" + arch = None + for key, value in MODEL_ARCH_NAMES.items(): + if value == model_type: + arch = key + break + if arch is None: + raise NotImplementedError( + f"Unknown gguf model_type: {model_type} in gguf-py. 
" + "This might because you're using an outdated version of gguf-py package, " + "you can install `gguf` package from source refer to " + "https://github.com/ggerganov/llama.cpp/tree/master/gguf-py#development" + ) + name_map = get_tensor_name_map(arch, num_layers) + + # Use a dummy conversion to get the mapping, because + # hf => gguf and gguf => hf mappings are reversed + gguf_to_hf_name_map = {} + state_dict = hf_model.state_dict() + for hf_name in state_dict.keys(): + # An exception for qwen2moe model, where the expert layers are packed + if model_type == "qwen2moe" and "mlp.experts." in hf_name: + hf_name = re.sub(r"mlp.experts.\d+.", "mlp.experts.", hf_name) + + name, suffix = hf_name, "" + if hf_name.endswith(".weight") or hf_name.endswith(".bias"): + name, suffix = hf_name.rsplit(".", 1) + suffix = "." + suffix + + gguf_name = name_map.get_name(name) + if gguf_name is None: + continue + + gguf_to_hf_name_map[gguf_name + suffix] = qual_name + hf_name + + # Some model like Bloom converted from BloomModel instead of BloomForCausalLM + # Therefore, we need to check submodule as well to get a correct mapping + if named_children := hf_model.named_children(): + for name, child in named_children: + sub_map = get_gguf_hf_weights_map(child, model_type, num_layers, qual_name=f"{qual_name}{name}.") + # Ignore the keys that are already in the main map to avoid overwriting + sub_map = {k: v for k, v in sub_map.items() if k not in gguf_to_hf_name_map} + gguf_to_hf_name_map.update(sub_map) + + return gguf_to_hf_name_map + + +def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False, model_to_load=None): """ Load a GGUF file and return a dictionary of parsed parameters containing tensors, the parsed tokenizer and config attributes. @@ -323,20 +383,8 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): parsed_parameters["config"]["use_qkv_bias"] = qkv_bias parsed_parameters["config"]["use_parallel_residual"] = not use_parallel_residual - model_size = "" - # extract the number of params from file name as architectures can differ ; - # eg. 
for falcon : `...falcon-7b-...` - if "falcon" in architecture: - gguf_file_name = gguf_checkpoint_path.split("/")[-1].lower() - m = re.search(r"-\d+b-", gguf_file_name) # regex to catch `-7b-` - if m is None: - raise ValueError( - f"From file name, cannot determine the number of parameters for {architecture} architecture" - ) - model_size = m.group().strip("-") # only keeps `7b` - - if architecture + model_size not in GGUF_SUPPORTED_ARCHITECTURES: - raise ValueError(f"Architecture {architecture + model_size} not supported") + if architecture not in GGUF_SUPPORTED_ARCHITECTURES: + raise ValueError(f"GGUF model with architecture {architecture} is not supported yet.") # Handle tie_word_embeddings, if lm_head.weight is not present in tensors, # tie_word_embeddings is true otherwise false @@ -388,7 +436,9 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): ) if return_tensors: - tensor_key_mapping = GGUF_TO_TRANSFORMERS_MAPPING["tensors"][architecture + model_size] + parsed_parameters["tensors"] = {} + + tensor_key_mapping = get_gguf_hf_weights_map(model_to_load) config = parsed_parameters.get("config", {}) ProcessorClass = TENSOR_PROCESSORS.get(architecture, TensorProcessor) @@ -407,16 +457,12 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): weights = result.weights name = result.name - bid = result.metadata.get("bid") - if name is None: + if name not in tensor_key_mapping: continue - for tensor_name in tensor_key_mapping: - if tensor_name.format(bid=bid) in name: - name = name.replace(tensor_name.format(bid=bid), tensor_key_mapping[tensor_name].format(bid=bid)) + name = tensor_key_mapping[name] - # Use copy to avoid errors with numpy and pytorch parsed_parameters["tensors"][name] = torch.from_numpy(np.copy(weights)) if len(reader_keys) > 0: diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index ead3f1a03717..33ddc2fbcc43 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -3917,7 +3917,10 @@ def from_pretrained( gguf_path = cached_file(pretrained_model_name_or_path, gguf_file, **cached_file_kwargs) - state_dict = load_gguf_checkpoint(gguf_path, return_tensors=True)["tensors"] + # we need a dummy model to help rename state_dict + with torch.device("meta"): + dummy_model = cls(config) + state_dict = load_gguf_checkpoint(gguf_path, return_tensors=True, model_to_load=dummy_model)["tensors"] resolved_archive_file = None is_sharded = False
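
Usage sketch (illustrative, not part of the patch): the snippet below shows one way the new code path can be exercised end to end, and how get_gguf_hf_weights_map can be called directly to inspect the GGUF -> HF tensor-name mapping it produces. It assumes a transformers build that includes this diff plus torch and gguf>=0.10.0; the repo id and GGUF file name are placeholders to be replaced with a real GGUF checkpoint.

import torch
from transformers import AutoModelForCausalLM
from transformers.modeling_gguf_pytorch_utils import get_gguf_hf_weights_map

repo_id = "<org>/<model>-GGUF"     # placeholder: a Hub repo that contains a GGUF file
gguf_file = "<model>.Q4_K_M.gguf"  # placeholder: the GGUF file name inside that repo

# High-level path: from_pretrained builds a dummy meta-device model internally and uses
# get_gguf_hf_weights_map (via load_gguf_checkpoint) to rename the GGUF tensors.
model = AutoModelForCausalLM.from_pretrained(repo_id, gguf_file=gguf_file)

# Direct inspection: rebuild the dummy model on the meta device, the same way
# modeling_utils.from_pretrained does, and print a few GGUF -> HF name pairs.
with torch.device("meta"):
    dummy_model = AutoModelForCausalLM.from_config(model.config)
for gguf_name, hf_name in list(get_gguf_hf_weights_map(dummy_model).items())[:5]:
    print(gguf_name, "->", hf_name)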