-
Notifications
You must be signed in to change notification settings - Fork 36
Expand file tree
/
Copy pathdecaps_text.py
More file actions
56 lines (42 loc) · 1.35 KB
/
decaps_text.py
File metadata and controls
56 lines (42 loc) · 1.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import logging
from . import nlp
class decaps_text(object):
    """
    Normalizes capitalization patterns. Words with only a single capital
    will be converted into lower case; words with two or more capitals
    (e.g. acronyms) and words that are already lower case are left as-is.
    """

    def __init__(self):
        """ Initialize the parser. """
        self.logger = logging.getLogger(__name__)

    def diffn(self, s1, s2):
        """
        Returns the number of different characters between two strings.

        Characters are compared position by position; if the strings have
        different lengths, the surplus tail of the longer one is ignored.

        Args:
            s1: a string
            s2: a string

        Returns:
            The count of positions at which s1 and s2 differ, an int
        """
        return sum(1 for a, b in zip(s1, s2) if a != b)

    def modify_word(self, org):
        """
        Changes a word to lower case if it contains exactly one capital letter.

        Args:
            org: a string

        Returns:
            lower: the lowercase of org if org has exactly one character that
                changes under lowercasing; otherwise org unchanged
        """
        lower = org.lower()

        # Count capitals character by character instead of comparing org with
        # lower position by position: lowercasing can change the length of a
        # string (e.g. U+0130 becomes two code points), which would misalign
        # every following character and inflate the count.
        n_capitals = sum(1 for ch in org if ch != ch.lower())

        if n_capitals > 1:
            return org

        if org != lower:
            # Pass the arguments to the logger so the message is only
            # formatted when INFO logging is actually enabled.
            self.logger.info("Decapitalizing word %s to %s", org, lower)

        return lower

    def __call__(self, text):
        """
        Runs the parser.

        The text is tokenized with the package-level `nlp` callable
        (presumably a spaCy pipeline -- confirm in the package __init__),
        each token is passed through modify_word, and the tokens are joined
        back together with their original trailing whitespace.

        Args:
            text: a string document

        Returns:
            doc2: a string document
        """
        pieces = []
        for token in nlp(text, disable=["parser", "tagger"]):
            pieces.append(self.modify_word(token.text))
            # Re-attach the token's trailing whitespace so the spacing of the
            # input document is preserved.
            pieces.append(token.whitespace_)

        doc2 = "".join(pieces)
        return doc2