Skip to content

Commit 7dabc3d

Browse files
committed
PY3 text passing tests
1 parent b428018 commit 7dabc3d

File tree

14 files changed

+97
-72
lines changed

14 files changed

+97
-72
lines changed

.travis.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ script:
2424
# TODO perhaps split build into tests and examples?
2525
# For now we only run the passing python 3 tests are run on the 3.4 build
2626
- if [ "$TRAVIS_PYTHON_VERSION" == "3.4" ]; then
27-
nosetests test/test_graph.py test/test_metrics.py test_de.py; else
27+
nosetests test/test_graph.py test/test_metrics.py test_de.py test/test_en.py test/test_es.py test/test_fr.py test/test_it.py test/test_nl.py test/test_text.py; else
2828
nosetests --exclude=test_05vector_07slp --with-coverage --cover-package=pattern;
2929
fi
3030

pattern/text/__init__.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -402,8 +402,10 @@ def _read(path, encoding="utf-8", comment=";;;"):
402402
line = (line.strip(codecs.BOM_UTF8)
403403
if i == 0 and isinstance(line, bytes)
404404
else line)
405+
405406
line = line.strip()
406-
line = line.decode(encoding)
407+
line = line.decode(encoding) if isinstance(line, bytes) else line
408+
407409
if not line or (comment and line.startswith(comment)):
408410
continue
409411
yield line
@@ -2155,9 +2157,11 @@ def tenses(self, verb, parse=True):
21552157
for id1, id2 in self._default.items():
21562158
if id2 in a:
21572159
a.add(id1)
2158-
a = (TENSES[id][:-2] for id in a)
2159-
a = Tenses(sorted(a))
2160-
return a
2160+
t = (TENSES[id][:-2] for id in a)
2161+
# TODO fix this hack
2162+
t = Tenses(sorted(t, key=lambda x: (x[0] or '', x[1] or 0, x[2] or '',
2163+
x[3] or '', x[4] or '')))
2164+
return t
21612165

21622166
def find_lemma(self, verb):
21632167
# Must be overridden in a subclass.
@@ -2291,14 +2295,14 @@ def load(self, path=None):
22912295
self._language = xml.attrib.get("language", self._language)
22922296
# Average scores of all word senses per part-of-speech tag.
22932297
for w in words:
2294-
words[w] = dict((pos, map(avg, zip(*psi)))
2298+
words[w] = dict((pos, [avg(x) for x in zip(*psi)])
22952299
for pos, psi in words[w].items())
22962300
# Average scores of all part-of-speech tags.
22972301
for w, pos in words.items():
2298-
words[w][None] = map(avg, zip(*pos.values()))
2302+
words[w][None] = [avg(x) for x in zip(*pos.values())]
22992303
# Average scores of all synonyms per synset.
23002304
for id, psi in synsets.items():
2301-
synsets[id] = map(avg, zip(*psi))
2305+
synsets[id] = [avg(x) for x in zip(*psi)]
23022306
dict.update(self, words)
23032307
dict.update(self.labeler, labels)
23042308
dict.update(self._synsets, synsets)
@@ -2630,7 +2634,7 @@ def suggest(self, w):
26302634
def _module(language):
26312635
""" Returns the given language module (e.g., "en" => pattern.en).
26322636
"""
2633-
return _modules.setdefault(language, __import__(language, globals(), {}, [], -1))
2637+
return _modules.setdefault(language, __import__(language, globals(), {}, [], 1))
26342638

26352639

26362640
def _multilingual(function, *args, **kwargs):

pattern/text/en/inflect.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@
4848
# Based on the Ruby Linguistics module by Michael Granger:
4949
# http://www.deveiate.org/projects/Linguistics/wiki/English
5050

51-
RE_ARTICLE = map(lambda x: (re.compile(x[0]), x[1]), (
51+
RE_ARTICLE = [(re.compile(x[0]), x[1]) for x in (
5252
# exceptions: an hour, an honor
5353
("euler|hour(?!i)|heir|honest|hono", "an"),
5454
# Abbreviations:
@@ -67,7 +67,7 @@
6767
# y like "i": an yclept, a year
6868
(r"y(b[lor]|cl[ea]|fere|gg|p[ios]|rou|tt)", "an"),
6969
(r"", "a") # guess "a"
70-
))
70+
)]
7171

7272

7373
def definite_article(word):
@@ -85,14 +85,16 @@ def indefinite_article(word):
8585
if rule.search(word) is not None:
8686
return article
8787

88-
DEFINITE, INDEFINITE = \
89-
"definite", "indefinite"
88+
DEFINITE, INDEFINITE = "definite", "indefinite"
9089

9190

9291
def article(word, function=INDEFINITE):
9392
"""Returns the indefinite (a or an) or definite (the) article for the given
9493
word."""
95-
return function == DEFINITE and definite_article(word) or indefinite_article(word)
94+
if function == DEFINITE:
95+
return definite_article(word)
96+
else:
97+
return indefinite_article(word)
9698

9799
_article = article
98100

pattern/text/en/modality.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,11 @@
55
# License: BSD (see LICENSE.txt for details).
66
# http://www.clips.ua.ac.be/pages/pattern
77

8+
try:
9+
basestring
10+
except NameError: # Python 3
11+
basestring = str
12+
813

914
### LIST FUNCTIONS #######################################################
1015

pattern/text/en/wordnet/__init__.py

Lines changed: 27 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
# Note that pywordnet has been included in nltk upstream
3232
# TODO ensure these are fixed upstream (so we can use that?
3333

34+
import codecs # TODO use this exclusively for opening?
3435
import os
3536
import sys
3637
import glob
@@ -53,9 +54,9 @@
5354

5455
try:
5556
basestring
56-
except NameError:
57+
except NameError: # python 3
5758
basestring = str
58-
59+
unicode = str
5960

6061
VERSION = ""
6162
s = open(os.path.join(MODULE, CORPUS, "dict", "index.noun")).read(2048)
@@ -215,22 +216,25 @@ def antonym(self):
215216
def meronyms(self):
216217
""" Yields a list of synsets that are semantic members/parts of this synset, for example:
217218
synsets("house")[0].meronyms() =>
218-
[Synset("library"),
219-
Synset("loft"),
219+
[Synset("library"),
220+
Synset("loft"),
220221
Synset("porch")
221222
]
222223
"""
223-
p = self._synset.getPointers(wn.MEMBER_HOLONYM)
224-
p += self._synset.getPointers(wn.PART_HOLONYM)
225-
return [Synset(p.getTarget()) for p in p]
224+
p1 = self._synset.getPointers(wn.MEMBER_HOLONYM)
225+
p2 = self._synset.getPointers(wn.PART_HOLONYM)
226+
return ([Synset(p.getTarget()) for p in p1] +
227+
[Synset(p.getTarget()) for p in p2])
228+
226229

227230
def holonyms(self):
228231
""" Yields a list of synsets of which this synset is a member/part, for example:
229232
synsets("tree")[0].holonyms() => Synset("forest").
230233
"""
231-
p = self._synset.getPointers(wn.MEMBER_MERONYM)
232-
p += self._synset.getPointers(wn.PART_MERONYM)
233-
return [Synset(p.getTarget()) for p in p]
234+
p1 = self._synset.getPointers(wn.MEMBER_MERONYM)
235+
p2 = self._synset.getPointers(wn.PART_MERONYM)
236+
return ([Synset(p.getTarget()) for p in p1] +
237+
[Synset(p.getTarget()) for p in p2])
234238

235239
def hyponyms(self, recursive=False, depth=None):
236240
""" Yields a list of semantically more specific synsets, for example:
@@ -277,7 +281,11 @@ def hypernym(self):
277281
synsets("train")[0].hypernym => Synset("public transport").
278282
"""
279283
p = self._synset.getPointers(wn.HYPERNYM)
280-
return len(p) > 0 and Synset(p[0].getTarget()) or None
284+
try:
285+
first = next(p)
286+
return Synset(first.getTarget())
287+
except StopIteration:
288+
return None
281289

282290
def similar(self):
283291
""" Returns a list of similar synsets for adjectives and adverbs, for example:
@@ -386,14 +394,18 @@ def map32(id, pos=NOUN):
386394
"""
387395
global _map32_cache
388396
if not _map32_cache:
389-
_map32_cache = open(
390-
os.path.join(MODULE, "dict", "index.32")).readlines()
397+
_map32_cache = codecs.open(os.path.join(MODULE, "dict", "index.32"))\
398+
.readlines()
391399
_map32_cache = (x for x in _map32_cache if x[0] != ";") # comments
392-
_map32_cache = dict(x.strip().split(" ") for x in _map32_cache)
400+
_map32_cache = (x.strip().split(b" ", 1) for x in _map32_cache)
401+
_map32_cache = dict(x for x in _map32_cache if len(x) == 2)
402+
393403
k = pos in _map32_pos2 and pos or _map32_pos1.get(pos, "x")
394404
k += str(id).lstrip("0")
395-
k = _map32_cache.get(k, None)
405+
k = _map32_cache.get(k.encode("utf-8"), None)
406+
396407
if k is not None:
408+
k = k.decode("utf-8")
397409
return int(k[1:]), _map32_pos2[k[0]]
398410
return None
399411

pattern/text/en/wordnet/pywordnet/wordnet.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -394,15 +394,15 @@ def __init__(self, pos, offset, line):
394394
self.lexname = Lexname.lexnames and Lexname.lexnames[
395395
int(tokens[1])] or []
396396
(self._senseTuples, remainder) = _partition(
397-
tokens[4:], 2, string.atoi(tokens[3], 16))
397+
tokens[4:], 2, int(tokens[3], 16))
398398
(self._pointerTuples, remainder) = _partition(
399399
remainder[1:], 4, int(remainder[0]))
400400
if pos == VERB:
401401
(vfTuples, remainder) = _partition(
402402
remainder[1:], 3, int(remainder[0]))
403403

404404
def extractVerbFrames(index, vfTuples):
405-
return tuple(map(lambda t: string.atoi(t[1]), filter(lambda t, i=index: string.atoi(t[2], 16) in (0, i), vfTuples)))
405+
return tuple(map(lambda t: int(t[1]), filter(lambda t, i=index: int(t[2], 16) in (0, i), vfTuples)))
406406
senseVerbFrames = []
407407
for index in range(1, len(self._senseTuples) + 1):
408408
senseVerbFrames.append(extractVerbFrames(index, vfTuples))
@@ -752,7 +752,7 @@ def __init__(self, sourceOffset, pointerTuple):
752752
self.targetOffset = int(offset)
753753
self.pos = _normalizePOS(pos)
754754
"""part of speech -- one of NOUN, VERB, ADJECTIVE, ADVERB"""
755-
indices = string.atoi(indices, 16)
755+
indices = int(indices, 16)
756756
self.sourceIndex = indices >> 8
757757
self.targetIndex = indices & 255
758758

pattern/text/fr/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ def load(self, path=None):
172172
_Sentiment.load(self, path)
173173
# Map "précaire" to "precaire" (without diacritics, +1% accuracy).
174174
if not path:
175-
for w, pos in dict.items(self):
175+
for w, pos in list(dict.items(self)):
176176
w0 = w
177177
if not w.endswith((u"à", u"è", u"é", u"ê", u"ï")):
178178
w = w.replace(u"à", "a")

pattern/text/tree.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
# "the cat eats its snackerel with vigor" => eat with vigor?
2929
# OR => vigorous snackerel?
3030

31-
# The Text and Sentece classes are containers:
31+
# The Text and Sentence classes are containers:
3232
# no parsing functionality should be added to it.
3333

3434
from itertools import chain
@@ -39,8 +39,9 @@
3939

4040
try:
4141
unicode
42-
except NameError:
42+
except NameError: # Python 3
4343
unicode = str
44+
basestring = str
4445

4546
try:
4647
from config import SLASH
@@ -1187,7 +1188,7 @@ def __unicode__(self):
11871188
return self.string
11881189

11891190
def __repr__(self):
1190-
return "Sentence(%s)" % repr(" ".join(["/".join(word.tags) for word in self.words]).encode("utf-8"))
1191+
return "Sentence(\"%s\")" % " ".join(["/".join(word.tags) for word in self.words])
11911192

11921193
def __eq__(self, other):
11931194
if not isinstance(other, Sentence):
@@ -1198,7 +1199,8 @@ def __eq__(self, other):
11981199
def xml(self):
11991200
""" Yields the sentence as an XML-formatted string (plain bytestring, UTF-8 encoded).
12001201
"""
1201-
return parse_xml(self, tab="\t", id=self.id or "")
1202+
xml = parse_xml(self, tab="\t", id=self.id or "")
1203+
return xml.decode("utf-8") if isinstance(xml, bytes) else xml
12021204

12031205
@classmethod
12041206
def from_xml(cls, xml):

test/test_en.py

Lines changed: 17 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -566,7 +566,7 @@ def test_parse(self):
566566
i, n = 0, 0
567567
for corpus, a in (("tagged-en-wsj.txt", (0.968, 0.945)), ("tagged-en-oanc.txt", (0.929, 0.932))):
568568
for sentence in open(os.path.join(PATH, "corpora", corpus)).readlines():
569-
sentence = sentence.decode("utf-8").strip()
569+
sentence = sentence.strip()
570570
s1 = [w.split("/") for w in sentence.split(" ")]
571571
s2 = [[w for w, pos in s1]]
572572
s2 = en.parse(s2, tokenize=False)
@@ -635,13 +635,13 @@ def test_command_line(self):
635635

636636
# Assert parsed output from the command-line (example from the
637637
# documentation).
638-
p = ["python", "-m", "pattern.en", "-s", "Nice cat.", "-OTCRL"]
639-
p = subprocess.Popen(p, stdout=subprocess.PIPE)
638+
command = ["python", "-m", "pattern.en", "-s", "Nice cat.", "-OTCRL"]
639+
p = subprocess.Popen(command, stdout=subprocess.PIPE)
640640
p.wait()
641641
v = p.stdout.read()
642642
v = v.strip()
643643
self.assertEqual(
644-
v, "Nice/JJ/B-NP/O/O/nice cat/NN/I-NP/O/O/cat ././O/O/O/.")
644+
v, b"Nice/JJ/B-NP/O/O/nice cat/NN/I-NP/O/O/cat ././O/O/O/.")
645645
print("python -m pattern.en")
646646

647647
#-------------------------------------------------------------------------
@@ -678,18 +678,19 @@ def test_text(self):
678678
def test_sentence(self):
679679
# Assert Sentence.
680680
v = self.text[0]
681-
self.assertTrue(v.start == 0)
682-
self.assertTrue(v.stop == 8)
683-
self.assertTrue(v.string == "I 'm eating pizza with a fork .")
684-
self.assertTrue(v.subjects == [self.text[0].chunks[0]])
685-
self.assertTrue(v.verbs == [self.text[0].chunks[1]])
686-
self.assertTrue(v.objects == [self.text[0].chunks[2]])
687-
self.assertTrue(
688-
v.nouns == [self.text[0].words[3], self.text[0].words[6]])
681+
self.assertEqual(v.start, 0)
682+
self.assertEqual(v.stop, 8)
683+
self.assertEqual(v.string, "I 'm eating pizza with a fork .")
684+
# TODO may be possible to not list each of these?
685+
self.assertEqual(list(v.subjects), [self.text[0].chunks[0]])
686+
self.assertEqual(list(v.verbs), [self.text[0].chunks[1]])
687+
self.assertEqual(list(v.objects), [self.text[0].chunks[2]])
688+
self.assertEqual(
689+
v.nouns, [self.text[0].words[3], self.text[0].words[6]])
689690
# Sentence.string must be unicode.
690-
self.assertTrue(isinstance(v.string, unicode) == True)
691-
self.assertTrue(isinstance(unicode(v), unicode) == True)
692-
self.assertTrue(isinstance(str(v), str) == True)
691+
self.assertEqual(isinstance(v.string, unicode), True)
692+
self.assertEqual(isinstance(unicode(v), unicode), True)
693+
self.assertEqual(isinstance(str(v), str), True)
693694
print("pattern.en.Sentence")
694695

695696
def test_sentence_constituents(self):
@@ -739,7 +740,7 @@ def test_chunk(self):
739740
# Assert chunk traversal.
740741
self.assertEqual(v.nearest("VP"), self.text[0].chunks[1])
741742
self.assertEqual(v.previous(), self.text[0].chunks[1])
742-
self.assertEqual(next(v), self.text[0].chunks[3])
743+
self.assertEqual(v.next(), self.text[0].chunks[3])
743744
print("pattern.en.Chunk")
744745

745746
def test_chunk_conjunctions(self):
@@ -805,12 +806,6 @@ def test_find(self):
805806
self.assertEqual(v, 11)
806807
print("pattern.text.tree.find()")
807808

808-
def test_zip(self):
809-
# Assert list of zipped tuples, using default to balance uneven lists.
810-
v = text.tree.zip([1, 2, 3], [4, 5, 6, 7], default=0)
811-
self.assertEqual(v, [(1, 4), (2, 5), (3, 6), (0, 7)])
812-
print("pattern.text.tree.zip()")
813-
814809
def test_unzip(self):
815810
v = text.tree.unzip(1, [(1, 4), (2, 5), (3, 6)])
816811
self.assertEqual(v, [4, 5, 6])

test/test_es.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -224,14 +224,14 @@ def test_parse(self):
224224
# "el gato negro" is a noun phrase, "en la alfombra" is a prepositional noun phrase.
225225
v = es.parser.parse(u"El gato negro se sentó en la alfombra.")
226226
self.assertEqual(v, # XXX - shouldn't "se" be part of the verb phrase?
227-
u"El/DT/B-NP/O gato/NN/I-NP/O negro/JJ/I-NP/O " +
227+
(u"El/DT/B-NP/O gato/NN/I-NP/O negro/JJ/I-NP/O " +
228228
u"se/PRP/B-NP/O sentó/VB/B-VP/O " +
229229
u"en/IN/B-PP/B-PNP la/DT/B-NP/I-PNP alfombra/NN/I-NP/I-PNP ././O/O"
230-
)
230+
))
231231
# Assert the accuracy of the Spanish tagger.
232232
i, n = 0, 0
233233
for sentence in open(os.path.join(PATH, "corpora", "tagged-es-wikicorpus.txt")).readlines():
234-
sentence = sentence.decode("utf-8").strip()
234+
sentence = sentence.strip()
235235
s1 = [w.split("/") for w in sentence.split(" ")]
236236
s2 = [[w for w, pos in s1]]
237237
s2 = es.parse(s2, tokenize=False, tagset=es.PAROLE)
@@ -263,7 +263,7 @@ def test_command_line(self):
263263
v = p.stdout.read()
264264
v = v.strip()
265265
self.assertEqual(
266-
v, "El/DT/B-NP/O/O/el gato/NN/I-NP/O/O/gato negro/JJ/I-NP/O/O/negro ././O/O/O/.")
266+
v, b"El/DT/B-NP/O/O/el gato/NN/I-NP/O/O/gato negro/JJ/I-NP/O/O/negro ././O/O/O/.")
267267
print("python -m pattern.es")
268268

269269
#-------------------------------------------------------------------------

0 commit comments

Comments
 (0)