-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathsentencizer.py
More file actions
29 lines (24 loc) · 881 Bytes
/
sentencizer.py
File metadata and controls
29 lines (24 loc) · 881 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import spacy
from spacy.language import Language
from spacy.symbols import ORTH
import pysbd
import nltk
# NOTE(review): runs at import time and may hit the network on first use to
# fetch the Punkt sentence-tokenizer models required by sent_tokenize below.
# Consider nltk.download('punkt', quiet=True) or moving this into a setup step.
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
@Language.component('set_custom_boundaries')
def set_custom_boundaries(doc):
    """spaCy pipeline component: suppress sentence breaks after 'lit.' abbreviations.

    Must run before the parser (see `nlp_returner`) so the parser respects
    the pre-set boundaries.

    Parameters
    ----------
    doc : spacy.tokens.Doc
        The document being processed.

    Returns
    -------
    spacy.tokens.Doc
        The same doc, with `is_sent_start` cleared on tokens that
        immediately follow a 'lit.' abbreviation.
    """
    # doc[:-1] so token.i + 1 is always a valid index.
    for token in doc[:-1]:
        if token.text in ('lit.', 'Lit.', 'lit', 'Lit'):
            # BUG FIX: the boundary to suppress is on the token FOLLOWING the
            # abbreviation. The original wrote doc[token.i] (the abbreviation
            # itself), which left the unwanted break after 'lit.' intact.
            doc[token.i + 1].is_sent_start = False
    return doc
def nlp_returner(args):
    """Build a spaCy pipeline configured for 'lit.' abbreviation handling.

    Loads the model named by ``args.spacy_model``, inserts the
    ``set_custom_boundaries`` component ahead of the parser, and registers
    tokenizer special cases so 'lit.' / 'Lit.' stay single tokens instead of
    being split at the period.

    Parameters
    ----------
    args : argparse.Namespace-like
        Must expose a ``spacy_model`` attribute naming an installed model.

    Returns
    -------
    spacy.language.Language
        The configured pipeline.
    """
    pipeline = spacy.load(args.spacy_model)
    pipeline.add_pipe('set_custom_boundaries', before="parser")
    # Keep each abbreviation form as one token (period included).
    for form in ('lit.', 'Lit.'):
        pipeline.tokenizer.add_special_case(form, [{ORTH: form}])
    return pipeline
def pysbd_sentencizer(sentence: str, language: str = "en"):
    """Split *sentence* into sentences using pySBD.

    Parameters
    ----------
    sentence : str
        The text to segment.
    language : str, optional
        ISO 639-1 language code understood by pysbd (default ``"en"``).

    Returns
    -------
    list[str]
        The segmented sentences.
    """
    # BUG FIX: the language argument was accepted but ignored — the
    # Segmenter was hard-coded to "en". Forward it (default unchanged).
    seg = pysbd.Segmenter(language=language, clean=False)
    return seg.segment(sentence)
def nltk_sentencizer(sentence: str, language: str = "en"):
    """Split *sentence* into sentences using NLTK's Punkt tokenizer.

    Parameters
    ----------
    sentence : str
        The text to segment.
    language : str, optional
        Language of the text. The ISO code ``"en"`` (the default) is mapped
        to Punkt's model name ``"english"``; any other value is passed
        through unchanged (Punkt expects full names such as ``"german"``).

    Returns
    -------
    list[str]
        The segmented sentences.
    """
    # BUG FIX: the language argument was previously ignored — every call
    # tokenized as English. Map "en" to Punkt's "english" so the existing
    # default keeps working, then forward the name.
    punkt_language = {"en": "english"}.get(language, language)
    return sent_tokenize(sentence, language=punkt_language)