PY3 text passing tests

hayd · hayd · commit 7dabc3d272b9 · 2014-12-10T14:57:25.000-08:00
diff --git a/.travis.yml b/.travis.yml
@@ -24,7 +24,7 @@ script:
     # TODO perhaps split build into tests and examples?
     # For now we only run the passing python 3 tests are run on the 3.4 build
     - if [ "$TRAVIS_PYTHON_VERSION" == "3.4" ]; then
-      nosetests test/test_graph.py test/test_metrics.py test_de.py; else
+      nosetests test/test_graph.py test/test_metrics.py test_de.py test/test_en.py test/test_es.py test/test_fr.py test/test_it.py test/test_nl.py test/test_text.py; else
       nosetests --exclude=test_05vector_07slp --with-coverage --cover-package=pattern;
       fi
 
diff --git a/pattern/text/__init__.py b/pattern/text/__init__.py
@@ -402,8 +402,10 @@ def _read(path, encoding="utf-8", comment=";;;"):
             line = (line.strip(codecs.BOM_UTF8)
                     if i == 0 and isinstance(line, bytes)
                     else line)
+
             line = line.strip()
-            line = line.decode(encoding)
+            line = line.decode(encoding) if isinstance(line, bytes) else line
+
             if not line or (comment and line.startswith(comment)):
                 continue
             yield line
@@ -2155,9 +2157,11 @@ def tenses(self, verb, parse=True):
                 for id1, id2 in self._default.items():
                     if id2 in a:
                         a.add(id1)
-        a = (TENSES[id][:-2] for id in a)
-        a = Tenses(sorted(a))
-        return a
+        t = (TENSES[id][:-2] for id in a)
+        # TODO fix this hack
+        t = Tenses(sorted(t, key=lambda x: (x[0] or '', x[1] or 0, x[2] or '',
+                                            x[3] or '', x[4] or '')))
+        return t
 
     def find_lemma(self, verb):
         # Must be overridden in a subclass.
@@ -2291,14 +2295,14 @@ def load(self, path=None):
         self._language = xml.attrib.get("language", self._language)
         # Average scores of all word senses per part-of-speech tag.
         for w in words:
-            words[w] = dict((pos, map(avg, zip(*psi)))
+            words[w] = dict((pos, [avg(x) for x in zip(*psi)])
                             for pos, psi in words[w].items())
         # Average scores of all part-of-speech tags.
         for w, pos in words.items():
-            words[w][None] = map(avg, zip(*pos.values()))
+            words[w][None] = [avg(x) for x in zip(*pos.values())]
         # Average scores of all synonyms per synset.
         for id, psi in synsets.items():
-            synsets[id] = map(avg, zip(*psi))
+            synsets[id] = [avg(x) for x in zip(*psi)]
         dict.update(self, words)
         dict.update(self.labeler, labels)
         dict.update(self._synsets, synsets)
@@ -2630,7 +2634,7 @@ def suggest(self, w):
 def _module(language):
     """ Returns the given language module (e.g., "en" => pattern.en).
     """
-    return _modules.setdefault(language, __import__(language, globals(), {}, [], -1))
+    return _modules.setdefault(language, __import__(language, globals(), {}, [], 1))
 
 
 def _multilingual(function, *args, **kwargs):
diff --git a/pattern/text/en/inflect.py b/pattern/text/en/inflect.py
@@ -48,7 +48,7 @@
 # Based on the Ruby Linguistics module by Michael Granger:
 # http://www.deveiate.org/projects/Linguistics/wiki/English
 
-RE_ARTICLE = map(lambda x: (re.compile(x[0]), x[1]), (
+RE_ARTICLE = [(re.compile(x[0]), x[1]) for x in (
     # exceptions: an hour, an honor
     ("euler|hour(?!i)|heir|honest|hono", "an"),
     # Abbreviations:
@@ -67,7 +67,7 @@
     # y like "i": an yclept, a year
     (r"y(b[lor]|cl[ea]|fere|gg|p[ios]|rou|tt)", "an"),
     (r"", "a")  # guess "a"
-))
+)]
 
 
 def definite_article(word):
@@ -85,14 +85,16 @@ def indefinite_article(word):
         if rule.search(word) is not None:
             return article
 
-DEFINITE, INDEFINITE = \
-    "definite", "indefinite"
+DEFINITE, INDEFINITE = "definite", "indefinite"
 
 
 def article(word, function=INDEFINITE):
     """Returns the indefinite (a or an) or definite (the) article for the given
     word."""
-    return function == DEFINITE and definite_article(word) or indefinite_article(word)
+    if function == DEFINITE:
+        return definite_article(word)
+    else:
+        return indefinite_article(word)
 
 _article = article
 
diff --git a/pattern/text/en/modality.py b/pattern/text/en/modality.py
@@ -5,6 +5,11 @@
 # License: BSD (see LICENSE.txt for details).
 # http://www.clips.ua.ac.be/pages/pattern
 
+try:
+    basestring
+except NameError:  # Python 3
+    basestring = str
+
 
 ### LIST FUNCTIONS #######################################################
 
diff --git a/pattern/text/en/wordnet/__init__.py b/pattern/text/en/wordnet/__init__.py
@@ -31,6 +31,7 @@
 # Note that pywordnet has been included in nltk upstream
 # TODO ensure these are fixed upstream (so we can use that?
 
+import codecs  # TODO use this exclusively for opening?
 import os
 import sys
 import glob
@@ -53,9 +54,9 @@
 
 try:
     basestring
-except NameError:
+except NameError: # python 3
     basestring = str
-
+    unicode = str
 
 VERSION = ""
 s = open(os.path.join(MODULE, CORPUS, "dict", "index.noun")).read(2048)
@@ -215,22 +216,25 @@ def antonym(self):
     def meronyms(self):
         """ Yields a list of synsets that are semantic members/parts of this synset, for example:
             synsets("house")[0].meronyms() =>
-            [Synset("library"), 
-             Synset("loft"), 
+            [Synset("library"),
+             Synset("loft"),
              Synset("porch")
             ]
         """
-        p = self._synset.getPointers(wn.MEMBER_HOLONYM)
-        p += self._synset.getPointers(wn.PART_HOLONYM)
-        return [Synset(p.getTarget()) for p in p]
+        p1 = self._synset.getPointers(wn.MEMBER_HOLONYM)
+        p2 = self._synset.getPointers(wn.PART_HOLONYM)
+        return ([Synset(p.getTarget()) for p in p1] +
+                [Synset(p.getTarget()) for p in p2])
+
 
     def holonyms(self):
         """ Yields a list of synsets of which this synset is a member/part, for example:
             synsets("tree")[0].holonyms() => Synset("forest").
         """
-        p = self._synset.getPointers(wn.MEMBER_MERONYM)
-        p += self._synset.getPointers(wn.PART_MERONYM)
-        return [Synset(p.getTarget()) for p in p]
+        p1 = self._synset.getPointers(wn.MEMBER_MERONYM)
+        p2 = self._synset.getPointers(wn.PART_MERONYM)
+        return ([Synset(p.getTarget()) for p in p1] +
+                [Synset(p.getTarget()) for p in p2])
 
     def hyponyms(self, recursive=False, depth=None):
         """ Yields a list of semantically more specific synsets, for example:
@@ -277,7 +281,11 @@ def hypernym(self):
             synsets("train")[0].hypernym => Synset("public transport").
         """
         p = self._synset.getPointers(wn.HYPERNYM)
-        return len(p) > 0 and Synset(p[0].getTarget()) or None
+        try:
+            first = next(p)
+            return Synset(first.getTarget())
+        except StopIteration:
+            return None
 
     def similar(self):
         """ Returns a list of similar synsets for adjectives and adverbs, for example:
@@ -386,14 +394,18 @@ def map32(id, pos=NOUN):
     """
     global _map32_cache
     if not _map32_cache:
-        _map32_cache = open(
-            os.path.join(MODULE, "dict", "index.32")).readlines()
+        _map32_cache = codecs.open(os.path.join(MODULE, "dict", "index.32"))\
+                             .readlines()
         _map32_cache = (x for x in _map32_cache if x[0] != ";")  # comments
-        _map32_cache = dict(x.strip().split(" ") for x in _map32_cache)
+        _map32_cache = (x.strip().split(b" ", 1) for x in _map32_cache)
+        _map32_cache = dict(x for x in _map32_cache if len(x) == 2)
+
     k = pos in _map32_pos2 and pos or _map32_pos1.get(pos, "x")
     k += str(id).lstrip("0")
-    k = _map32_cache.get(k, None)
+    k = _map32_cache.get(k.encode("utf-8"), None)
+
     if k is not None:
+        k = k.decode("utf-8")
         return int(k[1:]), _map32_pos2[k[0]]
     return None
 
diff --git a/pattern/text/en/wordnet/pywordnet/wordnet.py b/pattern/text/en/wordnet/pywordnet/wordnet.py
@@ -394,15 +394,15 @@ def __init__(self, pos, offset, line):
         self.lexname = Lexname.lexnames and Lexname.lexnames[
             int(tokens[1])] or []
         (self._senseTuples, remainder) = _partition(
-            tokens[4:], 2, string.atoi(tokens[3], 16))
+            tokens[4:], 2, int(tokens[3], 16))
         (self._pointerTuples, remainder) = _partition(
             remainder[1:], 4, int(remainder[0]))
         if pos == VERB:
             (vfTuples, remainder) = _partition(
                 remainder[1:], 3, int(remainder[0]))
 
             def extractVerbFrames(index, vfTuples):
-                return tuple(map(lambda t: string.atoi(t[1]), filter(lambda t, i=index: string.atoi(t[2], 16) in (0, i), vfTuples)))
+                return tuple(map(lambda t: int(t[1]), filter(lambda t, i=index: int(t[2], 16) in (0, i), vfTuples)))
             senseVerbFrames = []
             for index in range(1, len(self._senseTuples) + 1):
                 senseVerbFrames.append(extractVerbFrames(index, vfTuples))
@@ -752,7 +752,7 @@ def __init__(self, sourceOffset, pointerTuple):
         self.targetOffset = int(offset)
         self.pos = _normalizePOS(pos)
         """part of speech -- one of NOUN, VERB, ADJECTIVE, ADVERB"""
-        indices = string.atoi(indices, 16)
+        indices = int(indices, 16)
         self.sourceIndex = indices >> 8
         self.targetIndex = indices & 255
 
diff --git a/pattern/text/fr/__init__.py b/pattern/text/fr/__init__.py
@@ -172,7 +172,7 @@ def load(self, path=None):
         _Sentiment.load(self, path)
         # Map "précaire" to "precaire" (without diacritics, +1% accuracy).
         if not path:
-            for w, pos in dict.items(self):
+            for w, pos in list(dict.items(self)):
                 w0 = w
                 if not w.endswith((u"à", u"è", u"é", u"ê", u"ï")):
                     w = w.replace(u"à", "a")
diff --git a/pattern/text/tree.py b/pattern/text/tree.py
@@ -28,7 +28,7 @@
 #                "the cat eats its snackerel with vigor" => eat with vigor?
 #                                                     OR => vigorous snackerel?
 
-# The Text and Sentece classes are containers:
+# The Text and Sentence classes are containers:
 # no parsing functionality should be added to it.
 
 from itertools import chain
@@ -39,8 +39,9 @@
 
 try:
     unicode
-except NameError:
+except NameError: # Python 3
     unicode = str
+    basestring = str
 
 try:
     from config import SLASH
@@ -1187,7 +1188,7 @@ def __unicode__(self):
         return self.string
 
     def __repr__(self):
-        return "Sentence(%s)" % repr(" ".join(["/".join(word.tags) for word in self.words]).encode("utf-8"))
+        return "Sentence(\"%s\")" % " ".join(["/".join(word.tags) for word in self.words])
 
     def __eq__(self, other):
         if not isinstance(other, Sentence):
@@ -1198,7 +1199,8 @@ def __eq__(self, other):
     def xml(self):
         """ Yields the sentence as an XML-formatted string (plain bytestring, UTF-8 encoded).
         """
-        return parse_xml(self, tab="\t", id=self.id or "")
+        xml = parse_xml(self, tab="\t", id=self.id or "")
+        return xml.decode("utf-8") if isinstance(xml, bytes) else xml
 
     @classmethod
     def from_xml(cls, xml):
diff --git a/test/test_en.py b/test/test_en.py
@@ -566,7 +566,7 @@ def test_parse(self):
         i, n = 0, 0
         for corpus, a in (("tagged-en-wsj.txt", (0.968, 0.945)), ("tagged-en-oanc.txt", (0.929, 0.932))):
             for sentence in open(os.path.join(PATH, "corpora", corpus)).readlines():
-                sentence = sentence.decode("utf-8").strip()
+                sentence = sentence.strip()
                 s1 = [w.split("/") for w in sentence.split(" ")]
                 s2 = [[w for w, pos in s1]]
                 s2 = en.parse(s2, tokenize=False)
@@ -635,13 +635,13 @@ def test_command_line(self):
 
         # Assert parsed output from the command-line (example from the
         # documentation).
-        p = ["python", "-m", "pattern.en", "-s", "Nice cat.", "-OTCRL"]
-        p = subprocess.Popen(p, stdout=subprocess.PIPE)
+        command = ["python", "-m", "pattern.en", "-s", "Nice cat.", "-OTCRL"]
+        p = subprocess.Popen(command, stdout=subprocess.PIPE)
         p.wait()
         v = p.stdout.read()
         v = v.strip()
         self.assertEqual(
-            v, "Nice/JJ/B-NP/O/O/nice cat/NN/I-NP/O/O/cat ././O/O/O/.")
+            v, b"Nice/JJ/B-NP/O/O/nice cat/NN/I-NP/O/O/cat ././O/O/O/.")
         print("python -m pattern.en")
 
 #-------------------------------------------------------------------------
@@ -678,18 +678,19 @@ def test_text(self):
     def test_sentence(self):
         # Assert Sentence.
         v = self.text[0]
-        self.assertTrue(v.start == 0)
-        self.assertTrue(v.stop == 8)
-        self.assertTrue(v.string == "I 'm eating pizza with a fork .")
-        self.assertTrue(v.subjects == [self.text[0].chunks[0]])
-        self.assertTrue(v.verbs == [self.text[0].chunks[1]])
-        self.assertTrue(v.objects == [self.text[0].chunks[2]])
-        self.assertTrue(
-            v.nouns == [self.text[0].words[3], self.text[0].words[6]])
+        self.assertEqual(v.start, 0)
+        self.assertEqual(v.stop, 8)
+        self.assertEqual(v.string, "I 'm eating pizza with a fork .")
+        # TODO may be possible to not list each of these?
+        self.assertEqual(list(v.subjects), [self.text[0].chunks[0]])
+        self.assertEqual(list(v.verbs), [self.text[0].chunks[1]])
+        self.assertEqual(list(v.objects), [self.text[0].chunks[2]])
+        self.assertEqual(
+            v.nouns, [self.text[0].words[3], self.text[0].words[6]])
         # Sentence.string must be unicode.
-        self.assertTrue(isinstance(v.string, unicode) == True)
-        self.assertTrue(isinstance(unicode(v), unicode) == True)
-        self.assertTrue(isinstance(str(v), str) == True)
+        self.assertEqual(isinstance(v.string, unicode), True)
+        self.assertEqual(isinstance(unicode(v), unicode), True)
+        self.assertEqual(isinstance(str(v), str), True)
         print("pattern.en.Sentence")
 
     def test_sentence_constituents(self):
@@ -739,7 +740,7 @@ def test_chunk(self):
         # Assert chunk traversal.
         self.assertEqual(v.nearest("VP"), self.text[0].chunks[1])
         self.assertEqual(v.previous(), self.text[0].chunks[1])
-        self.assertEqual(next(v), self.text[0].chunks[3])
+        self.assertEqual(v.next(), self.text[0].chunks[3])
         print("pattern.en.Chunk")
 
     def test_chunk_conjunctions(self):
@@ -805,12 +806,6 @@ def test_find(self):
         self.assertEqual(v, 11)
         print("pattern.text.tree.find()")
 
-    def test_zip(self):
-        # Assert list of zipped tuples, using default to balance uneven lists.
-        v = text.tree.zip([1, 2, 3], [4, 5, 6, 7], default=0)
-        self.assertEqual(v, [(1, 4), (2, 5), (3, 6), (0, 7)])
-        print("pattern.text.tree.zip()")
-
     def test_unzip(self):
         v = text.tree.unzip(1, [(1, 4), (2, 5), (3, 6)])
         self.assertEqual(v, [4, 5, 6])
diff --git a/test/test_es.py b/test/test_es.py
@@ -224,14 +224,14 @@ def test_parse(self):
         # "el gato negro" is a noun phrase, "en la alfombra" is a prepositional noun phrase.
         v = es.parser.parse(u"El gato negro se sentó en la alfombra.")
         self.assertEqual(v,  # XXX - shouldn't "se" be part of the verb phrase?
-                         u"El/DT/B-NP/O gato/NN/I-NP/O negro/JJ/I-NP/O " +
+                         (u"El/DT/B-NP/O gato/NN/I-NP/O negro/JJ/I-NP/O " +
                          u"se/PRP/B-NP/O sentó/VB/B-VP/O " +
                          u"en/IN/B-PP/B-PNP la/DT/B-NP/I-PNP alfombra/NN/I-NP/I-PNP ././O/O"
-                         )
+                         ))
         # Assert the accuracy of the Spanish tagger.
         i, n = 0, 0
         for sentence in open(os.path.join(PATH, "corpora", "tagged-es-wikicorpus.txt")).readlines():
-            sentence = sentence.decode("utf-8").strip()
+            sentence = sentence.strip()
             s1 = [w.split("/") for w in sentence.split(" ")]
             s2 = [[w for w, pos in s1]]
             s2 = es.parse(s2, tokenize=False, tagset=es.PAROLE)
@@ -263,7 +263,7 @@ def test_command_line(self):
         v = p.stdout.read()
         v = v.strip()
         self.assertEqual(
-            v, "El/DT/B-NP/O/O/el gato/NN/I-NP/O/O/gato negro/JJ/I-NP/O/O/negro ././O/O/O/.")
+            v, b"El/DT/B-NP/O/O/el gato/NN/I-NP/O/O/gato negro/JJ/I-NP/O/O/negro ././O/O/O/.")
         print("python -m pattern.es")
 
 #-------------------------------------------------------------------------
diff --git a/test/test_fr.py b/test/test_fr.py
@@ -196,7 +196,7 @@ def test_command_line(self):
         v = p.stdout.read()
         v = v.strip()
         self.assertEqual(
-            v, "Le/DT/B-NP/O/O/le chat/NN/I-NP/O/O/chat noir/JJ/I-NP/O/O/noir ././O/O/O/.")
+            v, b"Le/DT/B-NP/O/O/le chat/NN/I-NP/O/O/chat noir/JJ/I-NP/O/O/noir ././O/O/O/.")
         print("python -m pattern.fr")
 
 #-------------------------------------------------------------------------
diff --git a/test/test_it.py b/test/test_it.py
diff --git a/test/test_nl.py b/test/test_nl.py
diff --git a/test/test_text.py b/test/test_text.py