-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbpe_tokenizer_2.PY
More file actions
57 lines (41 loc) · 1.45 KB
/
bpe_tokenizer_2.PY
File metadata and controls
57 lines (41 loc) · 1.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
from collections import Counter
def byte_pair_encoding(corpus, num_merges):
    """Learn and apply byte-pair-encoding merges over ``corpus``.

    The corpus is split into its individual elements (characters, when a
    string is given). Each round counts every adjacent token pair, fuses the
    most frequent pair into one new token, and rewrites the token sequence
    with that merge applied left to right. Whitespace gets no special
    treatment, so a merge may span a word boundary.

    Args:
        corpus: Iterable of string symbols, typically a plain ``str``.
        num_merges: Maximum number of merge rounds to perform.

    Returns:
        A ``(vocab, tokens)`` tuple: ``vocab`` is the set of base symbols
        plus every merged token created, and ``tokens`` is the corpus
        re-expressed as a list of tokens after the final merge.
    """
    tokens = list(corpus)
    vocab = set(tokens)

    for _ in range(num_merges):
        # Adjacent pairs, counted in order of first appearance.
        pair_counts = Counter(zip(tokens, tokens[1:]))
        if not pair_counts:
            # Fewer than two tokens remain, so nothing is left to merge.
            break

        # Ties are resolved by whatever Counter.most_common returns first.
        most_frequent = pair_counts.most_common(1)[0][0]  # (left, right)
        new_token = "".join(most_frequent)
        vocab.add(new_token)

        # Greedy left-to-right rewrite: a matched pair consumes two tokens,
        # so overlapping occurrences (e.g. "aaa") merge only once.
        merged = []
        last = len(tokens) - 1
        i = 0
        while i <= last:
            if i < last and (tokens[i], tokens[i + 1]) == most_frequent:
                merged.append(new_token)
                i += 2
            else:
                merged.append(tokens[i])
                i += 1
        tokens = merged

    return vocab, tokens
# Demo: run up to five merge rounds over a small sample corpus and show the
# resulting vocabulary and token sequence.
corpus = "set new new renew reset renew"
vocab, encoded = byte_pair_encoding(corpus, num_merges=5)
# NOTE: vocab is a set, so the order of its printed elements may vary between runs.
print("Final Vocabulary:", vocab)
print("Encoded Corpus:", encoded)
# _________________________________________________
from collections import Counter
def bpe(c, k):
    """Compact byte-pair encoding over the symbols of ``c``.

    Each round counts adjacent token pairs, concatenates the most frequent
    pair into a new token, and rewrites the sequence with that merge applied
    greedily from left to right.

    Args:
        c: Iterable of symbols, typically a ``str``. It is materialised once,
            so one-shot iterators are handled correctly.
        k: Maximum number of merge rounds.

    Returns:
        ``(vocab, tokens)``: the set of base symbols plus every merged token,
        and the final token list.
    """
    tokens = list(c)
    # Build the vocabulary from the materialised list, not from ``c`` again:
    # a one-shot iterator would already be exhausted by ``list(c)``.
    vocab = set(tokens)

    for _ in range(k):
        if not (pair_counts := Counter(zip(tokens, tokens[1:]))):
            # Fewer than two tokens remain, so nothing is left to merge.
            break

        left, right = pair_counts.most_common(1)[0][0]
        merged_token = left + right
        vocab.add(merged_token)

        rewritten = []
        i = 0
        while i < len(tokens):
            # A matched pair consumes two tokens, so overlaps merge only once.
            if i < len(tokens) - 1 and (tokens[i], tokens[i + 1]) == (left, right):
                rewritten.append(merged_token)
                i += 2
            else:
                rewritten.append(tokens[i])
                i += 1
        tokens = rewritten

    return vocab, tokens
# Run the compact variant on the same sample corpus with up to five merge
# rounds and print the returned (vocabulary, tokens) tuple.
print(bpe("set new new renew reset renew",5))