forked from acl-org/reviewer-paper-matching
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsentencepiece_abstracts.py
More file actions
31 lines (25 loc) · 997 Bytes
/
sentencepiece_abstracts.py
File metadata and controls
31 lines (25 loc) · 997 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import argparse
import sentencepiece as spm
# Command-line interface. Every option is needed further down the script
# (training input, model prefix, vocabulary size, output path), so each is
# marked required: a missing one is reported by argparse immediately instead
# of surfacing later as a TypeError or as the literal text "None" inside the
# trainer's flag string.
parser = argparse.ArgumentParser()
parser.add_argument("--infile", required=True,
                    help="name of input file (tokenized, 1 text per line)")
parser.add_argument("--outfile", required=True,
                    help="name of output file processed by sentencepiece")
parser.add_argument("--model-name", required=True,
                    help="sentencepiece model name")
# type=int rejects non-numeric sizes at parse time; the value is still
# rendered as plain digits when formatted into the trainer flags.
parser.add_argument("--vocab-size", required=True, type=int,
                    help="sentencepiece vocabulary size")
args = parser.parse_args()
# Train a SentencePiece model on the input file. The trainer takes its
# options as one space-separated flag string; the model is written under the
# prefix given by --model-name.
train_flag_template = (
    '--input={0} --model_prefix={1} --vocab_size={2} --character_coverage=0.9995 '
    '--hard_vocab_limit=false'
)
train_flags = train_flag_template.format(args.infile, args.model_name, args.vocab_size)
spm.SentencePieceTrainer.Train(train_flags)

# Load the freshly trained model (<model-name>.model) into a processor that
# the rest of the script uses for encoding.
model_path = args.model_name + '.model'
sp = spm.SentencePieceProcessor()
sp.Load(model_path)
# Encode every input line as space-separated SentencePiece pieces.
# Each line is stripped of surrounding whitespace and lower-cased before
# encoding. The whole input is read and encoded *before* the output file is
# opened, so the script keeps working when --outfile is the same path as
# --infile. Files are opened as UTF-8 explicitly rather than with the
# locale's default encoding, and `with` guarantees both handles are closed
# even if encoding or writing raises.
with open(args.infile, 'r', encoding='utf-8') as fin:
    output = [" ".join(sp.EncodeAsPieces(line.strip().lower())) for line in fin]

# Write one encoded text per line, in input order.
with open(args.outfile, "w", encoding='utf-8') as fout:
    fout.writelines(encoded + "\n" for encoded in output)