Commit 3f2839e

Add OPT model support

- Add OPT architecture support in C++ code
- Implement OPT-specific graph builder with separate Q/K/V projections
- Add OPT model conversion support in Python
- Add OPT tensor mappings and constants in gguf-py
- Support some OPT model sizes
- Tested with OPT-125M and OPT-13B models
1 parent 9012eb9 commit 3f2839e

6 files changed: +261, -5 lines changed

convert_hf_to_gguf.py
Lines changed: 30 additions & 4 deletions

@@ -803,6 +803,12 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "d5f1dd6f980fec569fb218a81a7658ac45fc56b38c5a0adeb1c232fbe04ef5ec":
             # ref: https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base
             res = "seed-coder"
+        if chkhsh == "7f2212c1b7fec62b4b75447509a4ecc8acd82813ce90d715dd99c1460a52d978":
+            # ref: https://huggingface.co/facebook/opt-13b
+            res = "gpt-2"
+        if chkhsh == "2c934e5e1c8275b75011b9942836389a87eaa1a63116104e52424515e7649c46":
+            # ref: https://huggingface.co/SousChef/OPT-13B-Erebus (OPT-13B-Erebus model)
+            res = "gpt-2"
 
         if res is None:
             logger.warning("\n")
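For context, the chkhsh values matched above are fingerprints of a checkpoint's tokenizer: get_vocab_base_pre encodes a fixed probe string and hashes the resulting token IDs, so any checkpoint whose tokenizer splits text the same way resolves to the same pre-tokenizer. The two new entries route facebook/opt-13b and the Erebus finetune to the existing "gpt-2" handling. A minimal sketch of the fingerprinting idea (the probe string and helper name here are illustrative, not the converter's exact internals):

```python
from hashlib import sha256
from transformers import AutoTokenizer

def tokenizer_fingerprint(model_dir: str, probe_text: str) -> str:
    # Encode a fixed probe string and hash the token IDs; two checkpoints whose
    # tokenizers behave identically on the probe produce the same digest.
    tok = AutoTokenizer.from_pretrained(model_dir)
    ids = tok.encode(probe_text)
    return sha256(str(ids).encode()).hexdigest()

# Illustrative usage: an OPT checkpoint should hash to one of the values added above.
print(tokenizer_fingerprint("facebook/opt-13b", "Hello world! 123"))
```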
@@ -3902,7 +3908,7 @@ def set_vocab(self):
     def set_gguf_parameters(self):
         hparams = self.hparams
         block_count = hparams["num_hidden_layers"]
-
+
         self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(hparams["hidden_size"])
         self.gguf_writer.add_block_count(block_count)
@@ -4014,7 +4020,7 @@ def set_gguf_parameters(self):
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
-
+
         if name.startswith("language_model."):
             name = name.replace("language_model.", "")
 
@@ -4520,7 +4526,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             data_torch = LlamaModel.permute(data_torch, n_head, n_head)
         if name.endswith("k_proj.weight"):
             data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
-
+
         return [(self.map_tensor_name(name), data_torch)]
 
 
@@ -5231,7 +5237,7 @@ def set_gguf_parameters(self):
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
-
+
         # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight",
         # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored
         # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder
@@ -6124,6 +6130,26 @@ def __torch_function__(cls, func, types, args=(), kwargs=None):
         return cls._wrap_fn(func)(*args, **kwargs)
 
 
+@ModelBase.register("OPTForCausalLM")
+class OPTModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.OPT
+
+    def set_vocab(self):
+        # OPT typically uses GPT2 tokenizer
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+
+        # OPT-specific parameters that are not handled by the base class
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # OPT model uses standard tensor mapping - let the mapping handle the conversion
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(
         description="Convert a huggingface model to a GGML compatible file")
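The @ModelBase.register("OPTForCausalLM") decorator is what lets the converter pick OPTModel when a checkpoint's config.json lists that architecture. A simplified, self-contained sketch of the dispatch pattern (the registry name and lookup below are illustrative, not the converter's actual attributes):

```python
from typing import Callable

# Illustrative stand-in for the registry that ModelBase maintains internally.
MODEL_REGISTRY: dict[str, type] = {}

def register(*architectures: str) -> Callable[[type], type]:
    # Class decorator: map each Hugging Face architecture string to a converter class.
    def wrapper(cls: type) -> type:
        for arch in architectures:
            MODEL_REGISTRY[arch] = cls
        return cls
    return wrapper

@register("OPTForCausalLM")
class OPTModelSketch:
    """Placeholder standing in for the real OPTModel above."""

# During conversion, the architecture string read from config.json selects the class.
arch_from_config = "OPTForCausalLM"
print(MODEL_REGISTRY[arch_from_config].__name__)  # -> OPTModelSketch
```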

gguf-py/gguf/constants.py
Lines changed: 16 additions & 0 deletions

@@ -340,6 +340,7 @@ class MODEL_ARCH(IntEnum):
     WAVTOKENIZER_DEC = auto()
     PLM = auto()
     BAILINGMOE = auto()
+    OPT = auto()
 
 
 class VISION_PROJECTOR_TYPE(IntEnum):
@@ -620,6 +621,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec",
     MODEL_ARCH.PLM: "plm",
     MODEL_ARCH.BAILINGMOE: "bailingmoe",
+    MODEL_ARCH.OPT: "opt",
 }
 
 VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
@@ -2040,6 +2042,20 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN_SHEXP,
         MODEL_TENSOR.FFN_UP_SHEXP,
     ],
+    MODEL_ARCH.OPT: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.POS_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     # TODO
 }
 
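With the constants above, downstream tooling can resolve the OPT architecture name and the tensor kinds it is expected to carry directly from gguf-py. A small check along these lines should work against this revision (a hedged sketch; run where the gguf package from gguf-py is importable):

```python
from gguf.constants import MODEL_ARCH, MODEL_ARCH_NAMES, MODEL_TENSORS, TENSOR_NAMES

# Architecture string written into the GGUF metadata for OPT models.
print(MODEL_ARCH_NAMES[MODEL_ARCH.OPT])  # "opt"

# Tensor kinds the OPT graph expects, with their GGUF name templates.
for tensor in MODEL_TENSORS[MODEL_ARCH.OPT]:
    print(tensor.name, "->", TENSOR_NAMES[tensor])  # e.g. ATTN_Q -> blk.{bid}.attn_q
```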

gguf-py/gguf/tensor_mapping.py
Lines changed: 12 additions & 1 deletion

@@ -31,6 +31,7 @@ class TensorNameMap:
             "model.embeddings", # rwkv7
             "model.word_embeddings", # bailingmoe
             "language_model.model.embed_tokens", # llama4
+            "model.decoder.embed_tokens", # opt
         ),
 
         # Token type embeddings
@@ -56,6 +57,7 @@ class TensorNameMap:
             "transformer.wpe", # gpt2
             "embeddings.position_embeddings", # bert
             "wpe", # gpt2
+            "model.decoder.embed_positions", # opt
         ),
 
         # Output
@@ -68,7 +70,7 @@ class TensorNameMap:
             "output_layer", # chatglm
             "head", # rwkv
             "head.out", # wavtokenizer
-            "lm_head", # llama4
+            "lm_head", # llama4 opt
         ),
 
         # Output norm
@@ -92,6 +94,7 @@ class TensorNameMap:
             "model.ln_out", # rwkv7
             "backbone.final_layer_norm", # wavtokenizer
             "model.norm", # llama4
+            "model.decoder.final_layer_norm", # opt
         ),
 
         # Rope frequencies
@@ -134,6 +137,7 @@ class TensorNameMap:
             "rwkv.blocks.{bid}.ln1", # rwkv6
             "model.layers.{bid}.ln1", # rwkv7
             "model.layers.{bid}.input_layernorm", # llama4
+            "model.decoder.layers.{bid}.self_attn_layer_norm", # opt
         ),
 
         # Attention norm 2
@@ -174,6 +178,7 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.multi_head_attention.query", # Grok
             "transformer.h.{bid}.attn.attention.q_proj", # exaone
             "model.layers.{bid}.self_attn.q_proj", # llama4
+            "model.decoder.layers.{bid}.self_attn.q_proj", # opt
         ),
 
         # Attention key
@@ -189,6 +194,7 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.multi_head_attention.key", # Grok
             "transformer.h.{bid}.attn.attention.k_proj", # exaone
             "model.layers.{bid}.self_attn.k_proj", # llama4
+            "model.decoder.layers.{bid}.self_attn.k_proj", # opt
         ),
 
         # Attention value
@@ -203,6 +209,7 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.multi_head_attention.value", # Grok
             "transformer.h.{bid}.attn.attention.v_proj", # exaone
             "model.layers.{bid}.self_attn.v_proj", # llama4
+            "model.decoder.layers.{bid}.self_attn.v_proj", # opt
         ),
 
         # Attention output
@@ -230,6 +237,7 @@ class TensorNameMap:
             "transformer.layers.{bid}.attn.out_proj", # openelm
             "transformer.h.{bid}.attn.attention.out_proj", # exaone
             "model.layers.{bid}.self_attn.o_proj", # llama4
+            "model.decoder.layers.{bid}.self_attn.out_proj", # opt
         ),
 
         # Attention output norm
@@ -269,6 +277,7 @@ class TensorNameMap:
             "encoder.layers.{bid}.post_attention_layernorm", # chatglm
             "transformer.layers.{bid}.ffn_norm", # openelm
             "model.layers.{bid}.post_attention_layernorm", # llama4
+            "model.decoder.layers.{bid}.final_layer_norm", # opt
         ),
 
         # Post feed-forward norm
@@ -330,6 +339,7 @@ class TensorNameMap:
             "encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm
             "transformer.h.{bid}.mlp.c_fc_1", # exaone
             "model.layers.{bid}.feed_forward.up_proj", # llama4
+            "model.decoder.layers.{bid}.fc1", # opt
         ),
 
         MODEL_TENSOR.FFN_UP_EXP: (
@@ -411,6 +421,7 @@ class TensorNameMap:
             "encoder.layers.{bid}.mlp.dense_4h_to_h", # chatglm
             "model.layers.h.{bid}.mlp.c_proj", # exaone
             "model.layers.{bid}.feed_forward.down_proj", # llama4
+            "model.decoder.layers.{bid}.fc2", # opt
         ),
 
         MODEL_TENSOR.FFN_DOWN_EXP: (
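With these mappings, the converter's generic map_tensor_name path resolves OPT's Hugging Face tensor names to GGUF names, which is why OPTModel.modify_tensors needs no special cases. A quick sanity check along these lines should work (a sketch assuming gguf-py's get_tensor_name_map and TensorNameMap.get_name helpers, which the converter uses under the hood):

```python
import gguf

# Build the name map for OPT; the block count only controls how many per-layer
# ("blk.N.*") entries are instantiated.
tmap = gguf.get_tensor_name_map(gguf.MODEL_ARCH.OPT, 12)

hf_name = "model.decoder.layers.0.self_attn.q_proj.weight"
base = tmap.get_name(hf_name, try_suffixes=(".weight", ".bias"))
print(base)  # expected: "blk.0.attn_q"; the converter re-appends the ".weight" suffix
```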

src/llama-arch.cpp
Lines changed: 18 additions & 0 deletions

@@ -72,6 +72,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
     { LLM_ARCH_PLM, "plm" },
     { LLM_ARCH_BAILINGMOE, "bailingmoe" },
+    { LLM_ARCH_OPT, "opt" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
@@ -1530,6 +1531,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" },
         },
     },
+    {
+        LLM_ARCH_OPT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_POS_EMBD, "position_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_BAILINGMOE,
         {

src/llama-arch.h
Lines changed: 1 addition & 0 deletions

@@ -76,6 +76,7 @@ enum llm_arch {
     LLM_ARCH_WAVTOKENIZER_DEC,
     LLM_ARCH_PLM,
     LLM_ARCH_BAILINGMOE,
+    LLM_ARCH_OPT,
     LLM_ARCH_UNKNOWN,
 };
 
