@@ -364,7 +364,7 @@ def count(words=[], top=None, threshold=0, stemmer=None, exclude=[], stopwords=F
             if stemmer is not None:
                 w2 = stem(w2, stemmer, **kwargs).lower()
             dict.__setitem__(count, w2, (w2 in count) and count[w2] + 1 or 1)
-    for k in count.keys():
+    for k in list(count.keys()):
         if count[k] <= threshold:
             dict.__delitem__(count, k)
     if top is not None:
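In Python 3, `dict.keys()` returns a live view, so deleting entries while iterating it raises a `RuntimeError`; the `list()` copy snapshots the keys first. A minimal sketch of the failure and the fix:

```python
# Python 3: deleting from a dict while iterating its live key view fails.
counts = {"the": 5, "cat": 1, "sat": 1}
threshold = 1
try:
    for k in counts.keys():          # live view in Python 3
        if counts[k] <= threshold:
            del counts[k]
except RuntimeError as e:
    print(e)                         # dictionary changed size during iteration

# Snapshotting the keys first makes the deletion safe:
counts = {"the": 5, "cat": 1, "sat": 1}
for k in list(counts.keys()):
    if counts[k] <= threshold:
        del counts[k]
print(counts)                        # {'the': 5}
```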
@@ -439,11 +439,11 @@ def __init__(self, string="", **kwargs):
             Lists can contain tuples (of), strings or numbers.
             Dicts can contain tuples (of), strings or numbers as keys, and floats as values.
             Document.words stores a dict of (word, count)-items.
-            Document.vector stores a dict of (word, weight)-items,
+            Document.vector stores a dict of (word, weight)-items,
             where weight is the term frequency normalized (0.0-1.0) to remove document length bias.
             Punctuation marks are stripped from the words.
             Stop words in the exclude list are excluded from the document.
-            Only top words whose count exceeds the threshold are included in the document.
+            Only top words whose count exceeds the threshold are included in the document.
         """
         kwargs.setdefault("filter", lambda w: w.lstrip("'").isalnum())
         kwargs.setdefault("threshold", 0)
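As a rough sketch of the normalization described in this docstring (illustrative only, not pattern's exact code), the relative term frequency divides each word count by the document's total word count, so longer documents don't dominate:

```python
from collections import Counter

def tf_vector(words):
    # Relative term frequency: count / total word count, in 0.0-1.0.
    counts = Counter(words)
    n = float(sum(counts.values()))
    return {w: c / n for w, c in counts.items()}

print(tf_vector(["the", "cat", "sat", "the"]))
# {'the': 0.5, 'cat': 0.25, 'sat': 0.25}
```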
@@ -524,7 +524,11 @@ def load(cls, path):
         # Open unicode file.
         s = open(path, "rb").read()
         s = s.lstrip(codecs.BOM_UTF8)
-        s = decode_utf8(s)
+        try:
+            s = s.decode("utf-8")
+        except AttributeError:
+            pass  # s is already a decoded str (Python 3).
+
         a = {}
         v = {}
         # Parse document name and type.
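The `try/except` keeps the loader working whether the data arrives as `bytes` or has already been decoded: in Python 3, `str` has no `decode()` method, so the `AttributeError` branch simply leaves the string as-is. Illustration:

```python
data = "caf\xe9".encode("utf-8")    # bytes, as read from a file in "rb" mode
try:
    data = data.decode("utf-8")
except AttributeError:
    pass                            # already a str, nothing to decode
print(data)                         # café
```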
@@ -705,7 +709,7 @@ def gain_ratio(self, word):
     @property
     def vector(self):
         """ Yields the document vector, a dictionary of (word, relevance)-items from the document.
-            The relevance is tf, tf * idf, infogain or binary if the document is part of a Model,
+            The relevance is tf, tf * idf, infogain or binary if the document is part of a Model,
             based on the value of Model.weight (TF, TFIDF, IG, GR, BINARY, None).
             The document vector is used to calculate similarity between two documents,
             for example in a clustering or classification algorithm.
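For reference, a toy version of the tf * idf weighting mentioned here, with idf taken as log(N / df) over a tiny corpus; pattern's actual implementation may differ in smoothing and log base:

```python
from math import log

corpus = [["cat", "sat"], ["cat", "ran"], ["dog", "ran"]]
N = len(corpus)

def tf_idf(word, document):
    tf = document.count(word) / float(len(document))   # relative frequency
    df = sum(1 for d in corpus if word in d)           # document frequency
    return tf * log(N / float(df))

print(tf_idf("cat", corpus[0]))   # 0.5 * log(3/2), about 0.203
print(tf_idf("sat", corpus[0]))   # 0.5 * log(3/1), about 0.549
```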
@@ -770,11 +774,16 @@ def __eq__(self, document):
     def __ne__(self, document):
         return not self.__eq__(document)
 
+    def _repr(self):
+        return "Document(id=%s%s%s)" % (repr(self._id),
+            self.name and ", name=%s" % repr(self.name) or "",
+            self.type and ", type=%s" % repr(self.type) or "")
+
     def __repr__(self):
-        return "Document(id=%s%s%s)" % (
-            repr(self._id),
-            self.name and ", name=%s" % repr(self.name) or "",
-            self.type and ", type=%s" % repr(self.type) or "")
+        return self._repr()
+
+    def __hash__(self):
+        return hash(self._repr())
 
 Bag = BagOfWords = BOW = Document
 
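Adding `__hash__` here is required, not cosmetic: in Python 3, a class that defines `__eq__` without `__hash__` becomes unhashable, so documents could no longer be stored in sets or used as dict keys. A minimal demonstration:

```python
class A:
    def __eq__(self, other):
        return True

class B:
    def __eq__(self, other):
        return True
    def __hash__(self):
        return hash(id(self))

try:
    {A()}
except TypeError as e:
    print(e)        # unhashable type: 'A'

print(len({B()}))   # 1
```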
@@ -1000,7 +1009,7 @@ def entropy(p=[], base=None):
 class Model(object):
 
     def __init__(self, documents=[], weight=TFIDF):
-        """ A model is a bag-of-word representation of a corpus of documents,
+        """ A model is a bag-of-word representation of a corpus of documents,
             where each document vector is a bag of (word, relevance)-items.
             Vectors can then be compared for similarity using a distance metric.
             The weighting scheme can be: relative TF, TFIDF (default), IG, BINARY, None,
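A hedged usage sketch of the Model API as declared above (assumes the pattern.vector package is importable; Document, Model, TFIDF and similarity all appear in this file):

```python
from pattern.vector import Document, Model, TFIDF

d1 = Document("the cat sat on the mat", name="d1")
d2 = Document("the dog sat on the couch", name="d2")
m = Model(documents=[d1, d2], weight=TFIDF)
print(m.similarity(d1, d2))  # cosine similarity, 0.0-1.0
```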
@@ -1279,7 +1288,7 @@ def inverse_document_frequency(self, word, base=2.71828):
 
     @property
     def inverted_index(self):
-        """ Yields a dictionary of (word, set([document1, document2, ...]))-items.
+        """ Yields a dictionary of (word, set([document1, document2, ...]))-items.
         """
         if not self._inverted:
             m = {}
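A generic sketch of how such an inverted index is built, with plain dicts standing in for Document objects:

```python
def inverted_index(documents):
    # Map each word to the set of documents that contain it.
    m = {}
    for d in documents:
        for w in d["words"]:
            m.setdefault(w, set()).add(d["name"])
    return m

docs = [{"name": "d1", "words": ["cat", "sat"]},
        {"name": "d2", "words": ["dog", "sat"]}]
print(inverted_index(docs))
# {'cat': {'d1'}, 'sat': {'d1', 'd2'}, 'dog': {'d2'}}
```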
@@ -1367,7 +1376,7 @@ def cosine_similarity(self, document1, document2):
     similarity = cos = cosine_similarity
 
     def nearest_neighbors(self, document, top=10):
-        """ Returns a list of (similarity, document)-tuples in the model,
+        """ Returns a list of (similarity, document)-tuples in the model,
             sorted by cosine similarity to the given document.
         """
         v = ((self.cosine_similarity(document, d), d) for d in self.documents)
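The cosine similarity used for the ranking, as a self-contained sketch over sparse (word, weight) dicts:

```python
from math import sqrt

def cosine(v1, v2):
    # Dot product over the product of the norms.
    dot = sum(v1[w] * v2[w] for w in v1 if w in v2)
    n1 = sqrt(sum(x * x for x in v1.values()))
    n2 = sqrt(sum(x * x for x in v2.values()))
    return dot / (n1 * n2) if n1 and n2 else 0.0

print(cosine({"cat": 0.5, "sat": 0.5}, {"dog": 0.5, "sat": 0.5}))  # 0.5
```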
@@ -1779,7 +1788,9 @@ def __init__(self, model, k=NORM):
         import numpy
         # Calling Model.vector() in a loop is quite slow, we should refactor
         # this:
-        matrix = [model.vector(d).values() for d in model.documents]
+        # TODO: remove list()
+        matrix = [list(model.vector(d).values())
+                  for d in model.documents]
         matrix = numpy.array(matrix)
         # Singular value decomposition, where u * sigma * vt = svd(matrix).
         # Sigma is the diagonal matrix of singular values,
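The decomposition itself comes from `numpy.linalg.svd`; a minimal sketch showing the u * sigma * vt factorization and the rank reduction LSA exploits:

```python
import numpy

matrix = numpy.array([[1.0, 0.0, 1.0],
                      [0.0, 1.0, 1.0],
                      [1.0, 1.0, 0.0]])
# u * diag(sigma) * vt reconstructs the original matrix.
u, sigma, vt = numpy.linalg.svd(matrix, full_matrices=False)
print(numpy.allclose(matrix, u @ numpy.diag(sigma) @ vt))  # True
# Zeroing the smallest singular values yields a low-rank approximation.
sigma[-1] = 0.0
approx = u @ numpy.diag(sigma) @ vt
```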
@@ -2049,7 +2060,7 @@ def k_means(vectors, k=None, iterations=10, distance=COSINE, seed=RANDOM, **kwar
 
 
 def kmpp(vectors, k, distance=COSINE):
-    """ The k-means++ initialization algorithm returns a set of initial clusers,
+    """ The k-means++ initialization algorithm returns a set of initial clusters,
         with the advantage that:
         - it generates better clusters than k-means(seed=RANDOM) on most data sets,
         - it runs faster than standard k-means,
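For reference, the core of k-means++ seeding: after a random first center, each next center is sampled with probability proportional to the squared distance to the nearest center chosen so far. A compact sketch with Euclidean distance:

```python
import random

def kmpp_seed(points, k):
    # First center: uniformly at random.
    centers = [random.choice(points)]
    while len(centers) < k:
        # Squared distance of each point to its nearest chosen center.
        d2 = [min(sum((a - b) ** 2 for a, b in zip(p, c)) for c in centers)
              for p in points]
        # Sample the next center with probability proportional to d2.
        r = random.uniform(0, sum(d2))
        acc = 0.0
        for p, w in zip(points, d2):
            acc += w
            if acc >= r:
                centers.append(p)
                break
    return centers

points = [(0.0, 0.0), (0.1, 0.0), (5.0, 5.0), (5.1, 5.0)]
print(kmpp_seed(points, 2))  # typically one center from each cluster
```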
@@ -2390,7 +2401,7 @@ def _test(self, documents=[], target=None, **kwargs):
 
     def auc(self, documents=[], k=10):
         """ Returns the area under the ROC-curve.
-            Returns the probability (0.0-1.0) that a classifier will rank
+            Returns the probability (0.0-1.0) that a classifier will rank
             a random positive document (True) higher than a random negative one (False).
         """
         return self.confusion_matrix(documents).auc(k)
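That probabilistic reading of AUC can be computed directly over all (positive, negative) pairs; a toy sketch, counting ties as 0.5:

```python
def auc(scores):
    # scores: list of (predicted probability, is_positive) pairs.
    pos = [p for p, y in scores if y]
    neg = [p for p, y in scores if not y]
    pairs = [(p, n) for p in pos for n in neg]
    wins = sum(1.0 if p > n else 0.5 if p == n else 0.0 for p, n in pairs)
    return wins / len(pairs)

print(auc([(0.9, True), (0.7, True), (0.6, False), (0.2, False)]))  # 1.0
```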
@@ -2660,7 +2671,8 @@ def method(self):
 
     @property
     def features(self):
-        return self._features.keys()
+        # TODO: don't require list()
+        return list(self._features.keys())
 
     def train(self, document, type=None):
         """ Trains the classifier with the given document of the given type
@@ -3195,7 +3207,7 @@ def _propagate_backward(self, output=[], rate=0.5, momentum=0.1):
 
     def _train(self, data=[], iterations=1000, rate=0.5, momentum=0.1):
         """ Trains the network with the given data using backpropagation.
-            The given data is a list of (input, output)-tuples,
+            The given data is a list of (input, output)-tuples,
             where each input and output is a list of values.
             For example, to learn the XOR-function:
             nn = BPNN()
33163328class SVM (Classifier ):
33173329
33183330 def __init__ (self , * args , ** kwargs ):
3319- """ Support Vector Machine (SVM) is a supervised learning method
3331+ """ Support Vector Machine (SVM) is a supervised learning method
33203332 where training documents are represented as points in n-dimensional space.
33213333 The SVM constructs a number of hyperplanes that subdivide the space.
33223334 Optional parameters:
3323- - type = CLASSIFICATION,
3324- - kernel = LINEAR,
3325- - degree = 3,
3326- - gamma = 1 / len(SVM.features),
3335+ - type = CLASSIFICATION,
3336+ - kernel = LINEAR,
3337+ - degree = 3,
3338+ - gamma = 1 / len(SVM.features),
33273339 - coeff0 = 0,
3328- - cost = 1,
3329- - epsilon = 0.01,
3330- - cache = 100,
3340+ - cost = 1,
3341+ - epsilon = 0.01,
3342+ - cache = 100,
33313343 - shrinking = True,
33323344 - extension = (LIBSVM, LIBLINEAR),
33333345 - train = []
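A hypothetical usage sketch with the parameters listed above; the constant names come from this docstring, and availability depends on the bundled libsvm/liblinear extensions:

```python
# Hypothetical usage; parameter names are taken from the docstring above.
from pattern.vector import SVM, CLASSIFICATION, LINEAR

classifier = SVM(type=CLASSIFICATION, kernel=LINEAR, cost=1, epsilon=0.01)
classifier.train("a document about cats", type="cat")
classifier.train("a document about dogs", type="dog")
print(classifier.classify("cats and more cats"))  # expected: 'cat'
```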