
Commit 6f3a3ba

Special handling of Seed Coder FIM tokens (#585)
* Special handling of Seed Coder FIM tokens
* vocab: Add Seed Coder pretokenizer
* Formatting fix
* Update llama.h
1 parent 22f6791 commit 6f3a3ba

File tree: 5 files changed, +27 −0 lines changed

convert_hf_to_gguf.py

Lines changed: 14 additions & 0 deletions
@@ -636,6 +636,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5":
             # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3
             res = "deepseek-v3"
+        if chkhsh == "d5f1dd6f980fec569fb218a81a7658ac45fc56b38c5a0adeb1c232fbe04ef5ec":
+            # ref: https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base
+            res = "seed-coder"
 
         if res is None:
             logger.warning("\n")
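For context, the chkhsh strings compared above are fingerprints of a model's pre-tokenizer behaviour: the converter encodes a fixed probe text and hashes the resulting token IDs. The sketch below shows the idea; the helper name and probe text are illustrative, not the ones used by convert_hf_to_gguf.py.

# Sketch: fingerprint a Hugging Face tokenizer by hashing the token IDs it
# produces for a fixed probe string. Helper name and probe text are
# illustrative; convert_hf_to_gguf.py uses its own built-in probe text.
from hashlib import sha256
from transformers import AutoTokenizer

def vocab_fingerprint(model_id: str, probe_text: str) -> str:
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    token_ids = tokenizer.encode(probe_text)
    # hash the ID sequence, not the raw text, so any pre-tokenizer change shows up
    return sha256(str(token_ids).encode()).hexdigest()

# With the converter's real probe text, a Seed-Coder checkout should reproduce
# the hash checked above ("d5f1dd6f98...").
# print(vocab_fingerprint("ByteDance-Seed/Seed-Coder-8B-Base", probe_text))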
@@ -1520,6 +1523,17 @@ def set_vocab(self):
         special_vocab._set_special_token("eot", 32010)
         special_vocab.add_to_gguf(self.gguf_writer)
 
+        # Apply to Seed-Coder only (and ignore otherwise)
+        if self.hparams.get("vocab_size", 32000) == 155136:
+            special_vocab = gguf.SpecialVocab(
+                self.dir_model, load_merges=False,
+                special_token_types = ['prefix', 'suffix', 'middle', 'eot']
+            )
+            special_vocab._set_special_token("prefix", 124)
+            special_vocab._set_special_token("suffix", 125)
+            special_vocab._set_special_token("middle", 126)
+            special_vocab.add_to_gguf(self.gguf_writer)
+
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
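The three IDs registered above (124, 125, 126) are Seed-Coder's fill-in-the-middle control tokens. As a rough illustration of how such tokens are used at inference time, the sketch below assembles a generic prefix-suffix-middle (PSM) prompt; the tokenize helper is hypothetical and Seed-Coder's exact FIM template may differ.

# Sketch of a prefix-suffix-middle (PSM) prompt built from the special token
# IDs exported above. tokenize() stands in for any encoder returning IDs.
FIM_PREFIX, FIM_SUFFIX, FIM_MIDDLE = 124, 125, 126

def build_fim_prompt(tokenize, code_before: str, code_after: str) -> list[int]:
    # the model is asked to generate the missing span between the two snippets
    return (
        [FIM_PREFIX] + tokenize(code_before)
        + [FIM_SUFFIX] + tokenize(code_after)
        + [FIM_MIDDLE]
    )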

convert_hf_to_gguf_update.py

Lines changed: 1 addition & 0 deletions
@@ -95,6 +95,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
     {"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
     {"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
+    {"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
 ]
 
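Listing the repo here lets convert_hf_to_gguf_update.py re-download the tokenizer and regenerate the chkhsh branch shown in the first diff. A rough sketch of that loop is below; the probe text and output format are illustrative, not the script's.

# Sketch of the regeneration loop driven by the models list above; the probe
# text is a stand-in (the real script uses a long fixed chktxt), so the hash
# printed here will not match the committed one.
from hashlib import sha256
from transformers import AutoTokenizer

models = [
    {"name": "seed-coder", "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base"},
]
probe_text = "Hello world \n 123 'll don't"  # stand-in probe text

for model in models:
    model_id = model["repo"].removeprefix("https://huggingface.co/")
    tok = AutoTokenizer.from_pretrained(model_id)
    chkhsh = sha256(str(tok.encode(probe_text)).encode()).hexdigest()
    print(f'if chkhsh == "{chkhsh}":')
    print(f'    # ref: {model["repo"]}')
    print(f'    res = "{model["name"]}"')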

include/llama.h

Lines changed: 1 addition & 0 deletions
@@ -110,6 +110,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_LLAMA4      = 33,
         LLAMA_VOCAB_PRE_TYPE_FALCON_3    = 34,
         LLAMA_VOCAB_PRE_TYPE_FALCON_E    = 35,
+        LLAMA_VOCAB_PRE_TYPE_SEED_CODER  = 36,
     };
 
     // note: these values should be synchronized with ggml_rope

src/llama-vocab.cpp

Lines changed: 7 additions & 0 deletions
@@ -477,6 +477,13 @@ struct llm_tokenizer_bpe {
                     "'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_SEED_CODER:
+                regex_exprs = {
+                    // original regex from tokenizer.json
+                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\r\n]+|\\s*[\r\n]+|\\s+(?!\\S)|\\s+"
+                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
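To see what this pre-tokenizer split does before BPE merging, the sketch below runs the original pattern from Seed-Coder's tokenizer.json (the commented line in the diff) over a small code snippet. It assumes the third-party regex package, which understands \p{L}/\p{N}; llama.cpp applies the same split in C++ with the case-expanded pattern shown above.

# Sketch: apply Seed-Coder's pre-tokenizer regex in Python. Requires the
# `regex` package (pip install regex); the pattern is copied from the
# "original regex from tokenizer.json" comment in the diff.
import regex

SEED_CODER_PRETOK = (
    r"(?i:'s|'t|'re|'ve|'m|'ll|'d)"
    r"|[^\r\n\p{L}\p{N}]?\p{L}+"
    r"|\p{N}{1}"
    r"| ?[^\s\p{L}\p{N}\r\n]+"
    r"|\s*[\r\n]+"
    r"|\s+(?!\S)"
    r"|\s+"
)

sample = "def add(a, b):\n    return a + 42\n"
pieces = regex.findall(SEED_CODER_PRETOK, sample)
print(pieces)
# note: digits are split one per piece ("42" yields "4" and "2"), and
# punctuation runs and newlines come out as separate pieces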

src/llama.cpp

Lines changed: 4 additions & 0 deletions
@@ -6302,6 +6302,10 @@ static void llm_load_vocab(
                 tokenizer_pre == "bailingmoe") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
                 vocab.tokenizer_clean_spaces = false;
+            } else if (
+                tokenizer_pre == "seed-coder") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
+                vocab.tokenizer_clean_spaces = false;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
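The "seed-coder" string compared here is the pre-tokenizer name the converter stores in the GGUF metadata; it must resolve to the enum value added in include/llama.h, otherwise loading aborts with the runtime_error above. A minimal Python mirror of that dispatch, with only the entry confirmed by this commit filled in:

# Illustrative mirror of the name-to-enum dispatch in llm_load_vocab; only the
# seed-coder entry is taken from this commit, the rest of the table is omitted.
LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 36  # matches include/llama.h above

PRE_TYPE_BY_NAME = {
    "seed-coder": LLAMA_VOCAB_PRE_TYPE_SEED_CODER,
}

def resolve_pre_type(tokenizer_pre: str) -> int:
    if tokenizer_pre not in PRE_TYPE_BY_NAME:
        raise RuntimeError(f"unknown pre-tokenizer type: '{tokenizer_pre}'")
    return PRE_TYPE_BY_NAME[tokenizer_pre]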
