Skip to content

Commit 3b9e5d6

Browse files
committed
PY3 some work towards text_de
1 parent 72a5a3e commit 3b9e5d6

File tree

3 files changed

+27
-19
lines changed

3 files changed

+27
-19
lines changed

pattern/text/__init__.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -399,10 +399,11 @@ def _read(path, encoding="utf-8", comment=";;;"):
399399
# From file or buffer.
400400
f = path
401401
for i, line in enumerate(f):
402-
line = line.strip(codecs.BOM_UTF8) if i == 0 and isinstance(
403-
line, str) else line
402+
line = (line.strip(codecs.BOM_UTF8)
403+
if i == 0 and isinstance(line, bytes)
404+
else line)
404405
line = line.strip()
405-
line = decode_utf8(line, encoding)
406+
line = line.decode(encoding)
406407
if not line or (comment and line.startswith(comment)):
407408
continue
408409
yield line
@@ -424,6 +425,7 @@ def load(self):
424425
# Arnold NNP x
425426
dict.update(self, (x.split(" ")[:2] for x in _read(self._path)))
426427

428+
427429
#--- FREQUENCY -----------------------------------------------------------
428430

429431

@@ -859,7 +861,7 @@ def __init__(self, lexicon={}, frequency={}, model=None, morphology=None, contex
859861
The given default tags are used for unknown words.
860862
Unknown words that start with a capital letter are tagged NNP (except for German).
861863
Unknown words that contain only digits and punctuation are tagged CD.
862-
Optionally, morphological and contextual rules (or a language model) can be used
864+
Optionally, morphological and contextual rules (or a language model) can be used
863865
to improve the tags of unknown words.
864866
The given language can be used to discern between
865867
Germanic and Romance languages for phrase chunking.
@@ -1727,7 +1729,7 @@ def commandline(parse=Parser().parse):
17271729
# The output can be either slash-formatted string or XML.
17281730
if "xml" in arguments:
17291731
s = Tree(s, s.tags).xml
1730-
print(encode_utf8(s))
1732+
print(s)
17311733

17321734
#### VERBS ###############################################################
17331735

pattern/text/tree.py

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ def unique(iterable):
8888

8989

9090
def zip(*args, **kwargs):
91-
""" Returns a list of tuples, where the i-th tuple contains the i-th element
91+
""" Returns a list of tuples, where the i-th tuple contains the i-th element
9292
from each of the argument sequences or iterables (or default if too short).
9393
"""
9494
args = [list(iterable) for iterable in args]
@@ -810,13 +810,13 @@ def append(self, word, lemma=None, type=None, chunk=None, role=None, relation=No
810810
def parse_token(self, token, tags=[WORD, POS, CHUNK, PNP, REL, ANCHOR, LEMMA]):
811811
""" Returns the arguments for Sentence.append() from a tagged token representation.
812812
The order in which token tags appear can be specified.
813-
The default order is (separated by slashes):
814-
- word,
815-
- part-of-speech,
816-
- (IOB-)chunk,
817-
- (IOB-)preposition,
818-
- chunk(-relation)(-role),
819-
- anchor,
813+
The default order is (separated by slashes):
814+
- word,
815+
- part-of-speech,
816+
- (IOB-)chunk,
817+
- (IOB-)preposition,
818+
- chunk(-relation)(-role),
819+
- anchor,
820820
- lemma.
821821
Examples:
822822
The/DT/B-NP/O/NP-SBJ-1/O/the
@@ -1079,7 +1079,7 @@ def get(self, index, tag=LEMMA):
10791079

10801080
def loop(self, *tags):
10811081
""" Iterates over the tags in the entire Sentence,
1082-
For example, Sentence.loop(POS, LEMMA) yields tuples of the part-of-speech tags and lemmata.
1082+
For example, Sentence.loop(POS, LEMMA) yields tuples of the part-of-speech tags and lemmata.
10831083
Possible tags: WORD, LEMMA, POS, CHUNK, PNP, RELATION, ROLE, ANCHOR or a custom word tag.
10841084
Any order or combination of tags can be supplied.
10851085
"""
@@ -1339,7 +1339,12 @@ def xml(self):
13391339
xml.append("<%s>" % XML_TEXT)
13401340
xml.extend([sentence.xml for sentence in self])
13411341
xml.append("</%s>" % XML_TEXT)
1342-
return "\n".join(xml)
1342+
xml_ = "\n".join(xml)
1343+
try:
1344+
xml_.encode("utf-8")
1345+
except AttributeError: # TODO remove this hack
1346+
pass
1347+
return xml_
13431348

13441349
@classmethod
13451350
def from_xml(cls, xml):

test/test_de.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,7 @@ def test_parse(self):
213213
# 3) Assert the accuracy of the German tagger.
214214
i, n = 0, 0
215215
for sentence in open(os.path.join(PATH, "corpora", "tagged-de-tiger.txt")).readlines():
216-
sentence = sentence.decode("utf-8").strip()
216+
sentence = sentence.strip()
217217
s1 = [w.split("/") for w in sentence.split(" ")]
218218
s1 = [de.stts2penntreebank(w, pos) for w, pos in s1]
219219
s2 = [[w for w, pos in s1]]
@@ -239,13 +239,14 @@ def test_command_line(self):
239239

240240
# Assert parsed output from the command-line (example from the
241241
# documentation).
242-
p = ["python", "-m", "pattern.de", "-s", "Der grosse Hund.", "-OTCRL"]
243-
p = subprocess.Popen(p, stdout=subprocess.PIPE)
242+
command = ["python", "-m", "pattern.de", "-s", "Der grosse Hund.", "-OTCRL"]
243+
p = subprocess.Popen(command, stdout=subprocess.PIPE)
244244
p.wait()
245245
v = p.stdout.read()
246246
v = v.strip()
247247
self.assertEqual(
248-
v, "Der/DT/B-NP/O/O/der grosse/JJ/I-NP/O/O/gross Hund/NN/I-NP/O/O/hund ././O/O/O/.")
248+
v,
249+
b"Der/DT/B-NP/O/O/der grosse/JJ/I-NP/O/O/gross Hund/NN/I-NP/O/O/hund ././O/O/O/.")
249250
print("python -m pattern.de")
250251

251252
#-------------------------------------------------------------------------

0 commit comments

Comments
 (0)