File tree Expand file tree Collapse file tree 1 file changed +8
-1
lines changed Expand file tree Collapse file tree 1 file changed +8
-1
lines changed Original file line number Diff line number Diff line change @@ -54,8 +54,15 @@ def convert_json_vocab(hf_tokenizer):
5454 # get vocab object from json file
5555 vocab = tokenizer_json .get ("model" , {}).get ("vocab" , {})
5656 sorted_merges = tokenizer_json .get ("model" , {}).get ("merges" , [])
57- sorted_merges = [ v_ . replace ( " \n " , "<0x0A>" ) for v_ in sorted_merges ]
57+
5858 attrs = {"vocab" : json .dumps (vocab , separators = ("," , ":" ))}
59+
60+ # merges data can be a list of strings or a list of lists of strings
61+ if (all (isinstance (v_ ,(list ,tuple ))) for v_ in sorted_merges ) :
62+ sorted_merges = [ " " .join (v if v != "\n " else "<0x0A>" for v in v_ ) for v_ in sorted_merges ]
63+ else :
64+ sorted_merges = [v_ .replace ("\n " , "<0x0A>" ) for v_ in sorted_merges ]
65+
5966 attrs ["merges" ] = "\n " .join (sorted_merges )
6067 if hf_tokenizer .added_tokens_encoder :
6168 token_map = [f"{ _k } ={ _v } " for _k ,
You can’t perform that action at this time.
0 commit comments