
Commit ddfd919

Merge pull request #16 from hayd/de
text passing py3 tests
2 parents: 72a5a3e + 8d42094

File tree

19 files changed: +189 −132 lines

.travis.yml

Lines changed: 1 addition & 1 deletion

@@ -24,7 +24,7 @@ script:
   # TODO perhaps split build into tests and examples?
   # For now we only run the passing python 3 tests are run on the 3.4 build
   - if [ "$TRAVIS_PYTHON_VERSION" == "3.4" ]; then
-      nosetests test/test_graph.py test/test_metrics.py; else
+      nosetests --ignore-files=test_examples\|test_db\|test_vector\|test_web; else
       nosetests --exclude=test_05vector_07slp --with-coverage --cover-package=pattern;
     fi
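For context, nose's --ignore-files flag takes regular expressions that are matched against test file names. A rough sketch of the selection this produces; the file list is made up, and the exact matching rule is an assumption about nose's behavior:

    import re
    # Patterns from the .travis.yml line above; \| in the shell becomes | here.
    ignore = re.compile(r"test_examples|test_db|test_vector|test_web")
    files = ["test_graph.py", "test_db.py", "test_metrics.py", "test_web.py"]
    print([f for f in files if not ignore.search(f)])
    # ['test_graph.py', 'test_metrics.py']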

pattern/db/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -2134,7 +2134,7 @@ def csv_header_encode(field, type=STRING):
     # csv_header_encode("age", INTEGER) => "age (INTEGER)".
     t = re.sub(r"^varchar\(.*?\)", "string", (type or ""))
     t = t and " (%s)" % t or ""
-    s = "%s%s" % (encode_utf8(field or ""), t.upper())
+    s = "%s%s" % (field or "", t.upper())
     return s
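Dropping encode_utf8() matters on Python 3, where interpolating a bytes value into a str picks up its repr instead of its text. A minimal sketch of the failure mode:

    field = "age"
    print("%s (INTEGER)" % field.encode("utf-8"))  # Python 3: b'age' (INTEGER)
    print("%s (INTEGER)" % field)                  # age (INTEGER)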

pattern/text/__init__.py

Lines changed: 18 additions & 12 deletions

@@ -399,10 +399,13 @@ def _read(path, encoding="utf-8", comment=";;;"):
             # From file or buffer.
             f = path
         for i, line in enumerate(f):
-            line = line.strip(codecs.BOM_UTF8) if i == 0 and isinstance(
-                line, str) else line
+            line = (line.strip(codecs.BOM_UTF8)
+                    if i == 0 and isinstance(line, bytes)
+                    else line)
+
             line = line.strip()
-            line = decode_utf8(line, encoding)
+            line = line.decode(encoding) if isinstance(line, bytes) else line
+
             if not line or (comment and line.startswith(comment)):
                 continue
             yield line
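The isinstance check moves from str to bytes because a UTF-8 BOM can only appear on undecoded input; on Python 3 a str line has already been decoded. A small sketch of the same strip-then-decode dance:

    import codecs
    line = codecs.BOM_UTF8 + b"word NNP\n"  # first line of a binary-mode file
    line = line.strip(codecs.BOM_UTF8) if isinstance(line, bytes) else line
    line = line.decode("utf-8") if isinstance(line, bytes) else line
    print(line.strip())  # word NNP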
@@ -424,6 +427,7 @@ def load(self):
         # Arnold NNP x
         dict.update(self, (x.split(" ")[:2] for x in _read(self._path)))

+
 #--- FREQUENCY -----------------------------------------------------------


@@ -859,7 +863,7 @@ def __init__(self, lexicon={}, frequency={}, model=None, morphology=None, contex
         The given default tags are used for unknown words.
         Unknown words that start with a capital letter are tagged NNP (except for German).
         Unknown words that contain only digits and punctuation are tagged CD.
-        Optionally, morphological and contextual rules (or a language model) can be used
+        Optionally, morphological and contextual rules (or a language model) can be used
         to improve the tags of unknown words.
         The given language can be used to discern between
         Germanic and Romance languages for phrase chunking.
@@ -1727,7 +1731,7 @@ def commandline(parse=Parser().parse):
     # The output can be either slash-formatted string or XML.
     if "xml" in arguments:
         s = Tree(s, s.tags).xml
-    print(encode_utf8(s))
+    print(s)

 #### VERBS ###############################################################
@@ -2153,9 +2157,11 @@ def tenses(self, verb, parse=True):
         for id1, id2 in self._default.items():
             if id2 in a:
                 a.add(id1)
-        a = (TENSES[id][:-2] for id in a)
-        a = Tenses(sorted(a))
-        return a
+        t = (TENSES[id][:-2] for id in a)
+        # TODO fix this hack
+        t = Tenses(sorted(t, key=lambda x: (x[0] or '', x[1] or 0, x[2] or '',
+                                            x[3] or '', x[4] or '')))
+        return t

     def find_lemma(self, verb):
         # Must be overridden in a subclass.
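The explicit sort key works around Python 3's refusal to order None against str or int, which the old plain sorted() call relied on. A minimal sketch with made-up tense tuples:

    # Stand-ins for TENSES entries; some fields are None.
    data = [("present", None, "plural"), (None, 3, "singular")]
    # sorted(data) raises TypeError on Python 3: '<' between NoneType and str.
    print(sorted(data, key=lambda x: (x[0] or "", x[1] or 0, x[2] or "")))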
@@ -2289,14 +2295,14 @@ def load(self, path=None):
         self._language = xml.attrib.get("language", self._language)
         # Average scores of all word senses per part-of-speech tag.
         for w in words:
-            words[w] = dict((pos, map(avg, zip(*psi)))
+            words[w] = dict((pos, [avg(x) for x in zip(*psi)])
                             for pos, psi in words[w].items())
         # Average scores of all part-of-speech tags.
         for w, pos in words.items():
-            words[w][None] = map(avg, zip(*pos.values()))
+            words[w][None] = [avg(x) for x in zip(*pos.values())]
         # Average scores of all synonyms per synset.
         for id, psi in synsets.items():
-            synsets[id] = map(avg, zip(*psi))
+            synsets[id] = [avg(x) for x in zip(*psi)]
         dict.update(self, words)
         dict.update(self.labeler, labels)
         dict.update(self._synsets, synsets)
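On Python 3, map() returns a lazy, single-use iterator, so storing its result and reading it more than once silently comes back empty the second time; the list comprehensions materialize the averaged scores up front. Sketch:

    scores = map(lambda x: sum(x) / len(x), [(1.0, 3.0), (2.0, 4.0)])
    print(list(scores))  # [2.0, 3.0]
    print(list(scores))  # [] -- the iterator is already exhausted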
@@ -2628,7 +2634,7 @@ def suggest(self, w):
 def _module(language):
     """ Returns the given language module (e.g., "en" => pattern.en).
     """
-    return _modules.setdefault(language, __import__(language, globals(), {}, [], -1))
+    return _modules.setdefault(language, __import__(language, globals(), {}, [], 1))


 def _multilingual(function, *args, **kwargs):
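level=-1, Python 2's "try relative, then absolute" lookup, no longer exists in Python 3; level=1 asks for an explicit relative import from the enclosing package. Assuming _module() is called from inside pattern.text, the modern spelling would be roughly:

    import importlib
    # module = importlib.import_module("." + language, package="pattern.text")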

pattern/text/en/inflect.py

Lines changed: 7 additions & 5 deletions

@@ -48,7 +48,7 @@
 # Based on the Ruby Linguistics module by Michael Granger:
 # http://www.deveiate.org/projects/Linguistics/wiki/English

-RE_ARTICLE = map(lambda x: (re.compile(x[0]), x[1]), (
+RE_ARTICLE = [(re.compile(x[0]), x[1]) for x in (
     # exceptions: an hour, an honor
     ("euler|hour(?!i)|heir|honest|hono", "an"),
     # Abbreviations:
@@ -67,7 +67,7 @@
     # y like "i": an yclept, a year
     (r"y(b[lor]|cl[ea]|fere|gg|p[ios]|rou|tt)", "an"),
     (r"", "a") # guess "a"
-))
+)]


 def definite_article(word):
@@ -85,14 +85,16 @@ def indefinite_article(word):
         if rule.search(word) is not None:
             return article

-DEFINITE, INDEFINITE = \
-    "definite", "indefinite"
+DEFINITE, INDEFINITE = "definite", "indefinite"


 def article(word, function=INDEFINITE):
     """Returns the indefinite (a or an) or definite (the) article for the given
     word."""
-    return function == DEFINITE and definite_article(word) or indefinite_article(word)
+    if function == DEFINITE:
+        return definite_article(word)
+    else:
+        return indefinite_article(word)

 _article = article
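Rebuilding RE_ARTICLE as a real list is more than style: on Python 3 a module-level map() object would be drained by the first indefinite_article() call and look empty forever after. Sketch:

    import re
    rules = map(lambda x: (re.compile(x[0]), x[1]), (("hour", "an"), ("", "a")))
    print([a for _, a in rules])  # ['an', 'a']
    print([a for _, a in rules])  # [] -- a second scan finds nothing

The and/or ternary idiom in article() is likewise replaced with a plain if/else, which stays correct even if the first branch were ever to return a falsy value.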

pattern/text/en/modality.py

Lines changed: 5 additions & 0 deletions

@@ -5,6 +5,11 @@
 # License: BSD (see LICENSE.txt for details).
 # http://www.clips.ua.ac.be/pages/pattern

+try:
+    basestring
+except NameError:  # Python 3
+    basestring = str
+

 ### LIST FUNCTIONS #######################################################
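This is the standard 2/3 compatibility shim: Python 3 dropped basestring, so the module binds the name to str when the lookup fails. In use (a sketch):

    try:
        basestring
    except NameError:  # Python 3
        basestring = str

    # isinstance checks written for Python 2 now work on both versions.
    print(isinstance("would", basestring))  # True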

pattern/text/en/wordnet/__init__.py

Lines changed: 27 additions & 15 deletions

@@ -31,6 +31,7 @@
 # Note that pywordnet has been included in nltk upstream
 # TODO ensure these are fixed upstream (so we can use that?

+import codecs  # TODO use this exclusively for opening?
 import os
 import sys
 import glob
@@ -53,9 +54,9 @@

 try:
     basestring
-except NameError:
+except NameError:  # python 3
     basestring = str
-
+    unicode = str

 VERSION = ""
 s = open(os.path.join(MODULE, CORPUS, "dict", "index.noun")).read(2048)
@@ -215,22 +216,25 @@ def antonym(self):
     def meronyms(self):
         """ Yields a list of synsets that are semantic members/parts of this synset, for example:
             synsets("house")[0].meronyms() =>
-            [Synset("library"),
-             Synset("loft"),
+            [Synset("library"),
+             Synset("loft"),
              Synset("porch")
             ]
         """
-        p = self._synset.getPointers(wn.MEMBER_HOLONYM)
-        p += self._synset.getPointers(wn.PART_HOLONYM)
-        return [Synset(p.getTarget()) for p in p]
+        p1 = self._synset.getPointers(wn.MEMBER_HOLONYM)
+        p2 = self._synset.getPointers(wn.PART_HOLONYM)
+        return ([Synset(p.getTarget()) for p in p1] +
+                [Synset(p.getTarget()) for p in p2])
+

     def holonyms(self):
         """ Yields a list of synsets of which this synset is a member/part, for example:
             synsets("tree")[0].holonyms() => Synset("forest").
         """
-        p = self._synset.getPointers(wn.MEMBER_MERONYM)
-        p += self._synset.getPointers(wn.PART_MERONYM)
-        return [Synset(p.getTarget()) for p in p]
+        p1 = self._synset.getPointers(wn.MEMBER_MERONYM)
+        p2 = self._synset.getPointers(wn.PART_MERONYM)
+        return ([Synset(p.getTarget()) for p in p1] +
+                [Synset(p.getTarget()) for p in p2])

     def hyponyms(self, recursive=False, depth=None):
         """ Yields a list of semantically more specific synsets, for example:
@@ -277,7 +281,11 @@ def hypernym(self):
             synsets("train")[0].hypernym => Synset("public transport").
         """
         p = self._synset.getPointers(wn.HYPERNYM)
-        return len(p) > 0 and Synset(p[0].getTarget()) or None
+        try:
+            first = p[0] if isinstance(p, tuple) else next(p)
+            return Synset(first.getTarget())
+        except StopIteration:
+            return None

     def similar(self):
         """ Returns a list of similar synsets for adjectives and adverbs, for example:
@@ -386,14 +394,18 @@ def map32(id, pos=NOUN):
     """
     global _map32_cache
     if not _map32_cache:
-        _map32_cache = open(
-            os.path.join(MODULE, "dict", "index.32")).readlines()
+        _map32_cache = codecs.open(os.path.join(MODULE, "dict", "index.32"))\
+            .readlines()
         _map32_cache = (x for x in _map32_cache if x[0] != ";")  # comments
-        _map32_cache = dict(x.strip().split(" ") for x in _map32_cache)
+        _map32_cache = (x.strip().split(b" ", 1) for x in _map32_cache)
+        _map32_cache = dict(x for x in _map32_cache if len(x) == 2)
+
     k = pos in _map32_pos2 and pos or _map32_pos1.get(pos, "x")
     k += str(id).lstrip("0")
-    k = _map32_cache.get(k, None)
+    k = _map32_cache.get(k.encode("utf-8"), None)
+
     if k is not None:
+        k = k.decode("utf-8")
         return int(k[1:]), _map32_pos2[k[0]]
     return None
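With the index file read as bytes, the cache keys and values stay bytes throughout and the lookup converts only at the boundary. The general shape (the index entries below are made up):

    cache = {b"n1740": b"n123456"}  # bytes-keyed, as the new map32 cache is
    key = "n1740"
    hit = cache.get(key.encode("utf-8"))
    if hit is not None:
        print(hit.decode("utf-8"))  # n123456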

pattern/text/en/wordnet/pywordnet/wordnet.py

Lines changed: 3 additions & 3 deletions

@@ -394,15 +394,15 @@ def __init__(self, pos, offset, line):
         self.lexname = Lexname.lexnames and Lexname.lexnames[
             int(tokens[1])] or []
         (self._senseTuples, remainder) = _partition(
-            tokens[4:], 2, string.atoi(tokens[3], 16))
+            tokens[4:], 2, int(tokens[3], 16))
         (self._pointerTuples, remainder) = _partition(
             remainder[1:], 4, int(remainder[0]))
         if pos == VERB:
             (vfTuples, remainder) = _partition(
                 remainder[1:], 3, int(remainder[0]))

             def extractVerbFrames(index, vfTuples):
-                return tuple(map(lambda t: string.atoi(t[1]), filter(lambda t, i=index: string.atoi(t[2], 16) in (0, i), vfTuples)))
+                return tuple(map(lambda t: int(t[1]), filter(lambda t, i=index: int(t[2], 16) in (0, i), vfTuples)))
             senseVerbFrames = []
             for index in range(1, len(self._senseTuples) + 1):
                 senseVerbFrames.append(extractVerbFrames(index, vfTuples))
@@ -752,7 +752,7 @@ def __init__(self, sourceOffset, pointerTuple):
         self.targetOffset = int(offset)
         self.pos = _normalizePOS(pos)
         """part of speech -- one of NOUN, VERB, ADJECTIVE, ADVERB"""
-        indices = string.atoi(indices, 16)
+        indices = int(indices, 16)
         self.sourceIndex = indices >> 8
         self.targetIndex = indices & 255
758758

pattern/text/fr/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -172,7 +172,7 @@ def load(self, path=None):
         _Sentiment.load(self, path)
         # Map "précaire" to "precaire" (without diacritics, +1% accuracy).
         if not path:
-            for w, pos in dict.items(self):
+            for w, pos in list(dict.items(self)):
                 w0 = w
                 if not w.endswith((u"à", u"è", u"é", u"ê", u"ï")):
                     w = w.replace(u"à", "a")

pattern/text/search.py

Lines changed: 12 additions & 12 deletions

@@ -153,15 +153,15 @@ def combinations(iterable, n):

 def product(*args, **kwargs):
     """ Yields all permutations with replacement:
-        list(product("cat", repeat=2)) =>
-        [("c", "c"),
-         ("c", "a"),
-         ("c", "t"),
-         ("a", "c"),
-         ("a", "a"),
-         ("a", "t"),
-         ("t", "c"),
-         ("t", "a"),
+        list(product("cat", repeat=2)) =>
+        [("c", "c"),
+         ("c", "a"),
+         ("c", "t"),
+         ("a", "c"),
+         ("a", "a"),
+         ("a", "t"),
+         ("t", "c"),
+         ("t", "a"),
          ("t", "t")]
     """
     p = [[]]
@@ -196,7 +196,7 @@ def variations(iterable, optional=lambda x: False):
         v = tuple(iterable[i] for i in range(len(v)) if not v[i])
         a.add(v)
     # Longest-first.
-    return sorted(a, cmp=lambda x, y: len(y) - len(x))
+    return sorted(a, key=len, reverse=True)

 #### TAXONOMY ############################################################

@@ -626,7 +626,7 @@ def match(self, word):
         Some part-of-speech-tags can also contain wildcards: NN*, VB*, JJ*, RB*, PR*.
         If the given word contains spaces (e.g., proper noun),
         the entire chunk will also be compared.
-        For example: Constraint(words=["Mac OS X*"])
+        For example: Constraint(words=["Mac OS X*"])
         matches the word "Mac" if the word occurs in a Chunk("Mac OS X 10.5").
         """
         # If the constraint has a custom function it must return True.
@@ -918,7 +918,7 @@ def match(self, sentence, start=0, _v=None, _u=None):
             _u[id(sequence)] = False
         # Return the leftmost-longest.
         if len(a) > 0:
-            return sorted(a)[0][-1]
+            return sorted(a, key=lambda x: x[:2])[0][-1]

     def _variations(self):
         v = variations(
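Two sorting idioms change here: the cmp= argument is gone in Python 3, so longest-first becomes key=len with reverse=True, and the leftmost-longest pick sorts on an explicit two-field key so that full-tuple comparison never reaches elements Python 3 cannot order. The docstring hunks above appear to differ only in trailing whitespace. A sketch of both sorts; the candidate tuple layout is an assumption about what a holds:

    seqs = [("a",), ("a", "b", "c"), ("a", "b")]
    print(sorted(seqs, key=len, reverse=True))
    # [('a', 'b', 'c'), ('a', 'b'), ('a',)]

    # Candidates as (start, length, match) tuples; comparing only x[:2]
    # keeps the unorderable match object out of the comparison.
    a = [(3, 2, "m1"), (0, 4, "m2")]
    print(sorted(a, key=lambda x: x[:2])[0][-1])  # m2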
