Skip to content

Commit 69f1e08

Browse files
StefanwuuHaotian WualbertzNeoLegendsmichelwi
authored
Add ApplySentencepieceToTextJob (#599)
* add ApplySentencepieceToTextJob * apply black * address reviewer's requests * fix wrong format * Update text/label/sentencepiece/apply.py Co-authored-by: Albert Zeyer <[email protected]> * address change requested by reviewers * Update text/label/sentencepiece/apply.py Co-authored-by: Moritz Gunz <[email protected]> * Update text/label/sentencepiece/apply.py Co-authored-by: Albert Zeyer <[email protected]> * small fix * Update text/label/sentencepiece/apply.py Co-authored-by: Albert Zeyer <[email protected]> * Update text/label/sentencepiece/apply.py Co-authored-by: Albert Zeyer <[email protected]> * Update text/label/sentencepiece/apply.py Co-authored-by: Albert Zeyer <[email protected]> * Update text/label/sentencepiece/apply.py Co-authored-by: Albert Zeyer <[email protected]> * undo change causing problem with gzip * reformat * Update text/label/sentencepiece/apply.py Co-authored-by: Albert Zeyer <[email protected]> * Update text/label/sentencepiece/apply.py Co-authored-by: Albert Zeyer <[email protected]> * Update text/label/sentencepiece/apply.py Co-authored-by: Albert Zeyer <[email protected]> * reformat * Update text/label/sentencepiece/apply.py Co-authored-by: Albert Zeyer <[email protected]> * Update text/label/sentencepiece/apply.py Co-authored-by: Albert Zeyer <[email protected]> * reformat * remove tmp files * Update text/label/sentencepiece/apply.py Co-authored-by: Albert Zeyer <[email protected]> * official way to enable unk suggested by Albert * Update text/label/sentencepiece/apply.py Co-authored-by: michelwi <[email protected]> * refine suggestion from Willi * Update text/label/sentencepiece/apply.py Co-authored-by: Albert Zeyer <[email protected]> * Update text/label/sentencepiece/apply.py Co-authored-by: Albert Zeyer <[email protected]> * address reviewer's request * Update text/label/sentencepiece/apply.py Co-authored-by: Eugen Beck <[email protected]> * address reviewer's request --------- Co-authored-by: Haotian Wu <[email protected]> 
Co-authored-by: Albert Zeyer <[email protected]> Co-authored-by: Moritz Gunz <[email protected]> Co-authored-by: michelwi <[email protected]> Co-authored-by: Eugen Beck <[email protected]> Co-authored-by: Haotian Wu <[email protected]>
1 parent 4795d0a commit 69f1e08

File tree

1 file changed

+63
-0
lines changed

1 file changed

+63
-0
lines changed

text/label/sentencepiece/apply.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
import logging
2+
import shutil
3+
import subprocess as sp
4+
import tempfile
5+
import os
6+
7+
from sisyphus import Job, Task, tk, gs
8+
from typing import Any, Dict, Optional
9+
10+
import i6_core.util as util
11+
12+
try:
    import sentencepiece
except ImportError:
    # A missing package is tolerated at import time and only reported as a
    # warning. The warning is emitted unless the Sisyphus global settings
    # define WARNING_NO_SENTENCEPIECE with a value other than True.
    if getattr(gs, "WARNING_NO_SENTENCEPIECE", True) is True:
        logging.warning(
            "The package 'sentencepiece' is not installed in the manager python env. Please make sure it is installed "
            "in the python environment running the Sisyphus worker. To suppress this warning set "
            "'WARNING_NO_SENTENCEPIECE=False' in the settings.py"
        )
21+
22+
23+
class ApplySentencepieceToTextJob(Job):
    """
    Apply a sentencepiece model to a text file, basically a wrapper for spm.encode.

    Every line of the input text is encoded into sentencepiece pieces, and the pieces
    are written space-separated as one line of the output text.
    """

    def __init__(
        self,
        *,
        text_file: tk.Path,
        sentencepiece_model: tk.Path,
        enable_unk: bool = True,
        gzip_output: bool = True,
    ):
        """
        :param text_file: words text file to convert to sentencepiece
        :param sentencepiece_model: path to the trained sentencepiece model
        :param enable_unk: whether to enable the "unk" encode option, which maps OOV symbols to the
            unknown symbol set in training; if False they are kept as is
        :param gzip_output: if True the output file is named ``*.txt.gz``, otherwise ``*.txt``
            (the actual compression is presumably done by ``util.uopen`` based on the suffix)
        """
        self.text_file = text_file
        self.sentencepiece_model = sentencepiece_model
        self.enable_unk = enable_unk

        self.out_sentencepiece_text = self.output_path(
            "words_to_sentencepiece.txt.gz" if gzip_output else "words_to_sentencepiece.txt"
        )

        # Resource requirements of the "run" task; setting this to None turns it into a mini task.
        self.rqmt: Optional[Dict[str, Any]] = {"cpu": 1, "mem": 2.0, "time": 2.0}

    def tasks(self):
        """
        Yield the single "run" task, scheduled as a mini task when no requirements are set.
        """
        yield Task("run", rqmt=self.rqmt, mini_task=self.rqmt is None)

    def run(self):
        """
        Encode the input text line by line and write the pieces to the output text.

        :raises ImportError: if the ``sentencepiece`` package is not installed in the worker environment
        """
        # The module-level import only logs a warning when the package is missing, which would
        # leave the name undefined here and fail with an opaque NameError. Importing again
        # locally surfaces the real ImportError instead.
        import sentencepiece

        spm = sentencepiece.SentencePieceProcessor(model_file=self.sentencepiece_model.get_path())
        if self.enable_unk:
            spm.set_encode_extra_options("unk")

        with util.uopen(self.text_file, "rt") as fin, util.uopen(self.out_sentencepiece_text, "wt") as fout:
            for line in fin:
                # Strip only the line terminator so that any other whitespace reaches the encoder untouched.
                pieces = spm.encode(line.rstrip("\n"), out_type=str)
                fout.write(" ".join(pieces) + "\n")

0 commit comments

Comments
 (0)