Skip to content

Commit cff4035

Browse files
committed
fix: bug in vocab
1 parent 077a550 commit cff4035

File tree

1 file changed

+2
-2
lines changed

1 file changed

+2
-2
lines changed

model2vec/distill/tokenizer.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -134,8 +134,8 @@ def _process_unigram(tokenizer_json: dict[str, Any], pre_tokenized_tokens: list[
134 134
"""Process the Unigram tokenizer JSON."""
135 135
current_probas = dict(tokenizer_json["model"]["vocab"])
136 136
avg_proba = sum(current_probas.values()) / len(current_probas)
137-
new_probas = {word: current_probas.get(word, avg_proba) for word in pre_tokenized_tokens}
138-
tokenizer_json["model"]["vocab"] = sorted(new_probas.items(), key=lambda x: x[1], reverse=True)
137+
new_probas = [[word, current_probas.get(word, avg_proba)] for word in pre_tokenized_tokens]
138+
tokenizer_json["model"]["vocab"] = new_probas
139 139

140 140
tokens, _ = zip(*tokenizer_json["model"]["vocab"])
141 141
tokenizer_json["model"]["unk_id"] = list(tokens).index(unk_token)

0 commit comments

Comments
 (0)