Skip to content

Commit 94bdcea

Browse files
committed
Enhance handling of merges data in HFTokenizerConverter for improved compatibility
1 parent 5f8f774 commit 94bdcea

File tree

1 file changed

+8
-1
lines changed

1 file changed

+8
-1
lines changed

onnxruntime_extensions/_hf_cvt.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,15 @@ def convert_json_vocab(hf_tokenizer):
5454
# get vocab object from json file
5555
vocab = tokenizer_json.get("model", {}).get("vocab", {})
5656
sorted_merges = tokenizer_json.get("model", {}).get("merges", [])
57-
sorted_merges = [v_.replace("\n", "<0x0A>") for v_ in sorted_merges]
57+
5858
attrs = {"vocab": json.dumps(vocab, separators=(",", ":"))}
59+
60+
# merges data can be a list of strings or a list of lists of strings
61+
if (all(isinstance(v_,(list,tuple))) for v_ in sorted_merges) :
62+
sorted_merges = [ " ".join(v if v != "\n" else "<0x0A>" for v in v_ ) for v_ in sorted_merges]
63+
else :
64+
sorted_merges = [v_.replace("\n", "<0x0A>") for v_ in sorted_merges]
65+
5966
attrs["merges"] = "\n".join(sorted_merges)
6067
if hf_tokenizer.added_tokens_encoder:
6168
token_map = [f"{_k}={_v}" for _k,

0 commit comments

Comments
 (0)