Skip to content

Commit 0c5f92d

Browse files
committed
PY3 some work towards vector, tweak text
1 parent 7dabc3d commit 0c5f92d

File tree

6 files changed

+66
-43
lines changed

6 files changed

+66
-43
lines changed

.travis.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ script:
2424
# TODO perhaps split build into tests and examples?
2525
# For now, only the passing Python 3 tests are run on the 3.4 build
2626
- if [ "$TRAVIS_PYTHON_VERSION" == "3.4" ]; then
27-
nosetests test/test_graph.py test/test_metrics.py test_de.py test/test_en.py test/test_es.py test/test_fr.py test/test_it.py test/test_nl.py test/test_text.py; else
27+
nosetests test/test_graph.py test/test_metrics.py test/test_de.py test/test_en.py test/test_es.py test/test_fr.py test/test_it.py test/test_nl.py test/test_text.py test/test_search.py; else
2828
nosetests --exclude=test_05vector_07slp --with-coverage --cover-package=pattern;
2929
fi
3030

pattern/text/en/wordnet/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -282,7 +282,7 @@ def hypernym(self):
282282
"""
283283
p = self._synset.getPointers(wn.HYPERNYM)
284284
try:
285-
first = next(p)
285+
first = p[0] if isinstance(p, tuple) else next(p)
286286
return Synset(first.getTarget())
287287
except StopIteration:
288288
return None

pattern/text/search.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -153,15 +153,15 @@ def combinations(iterable, n):
153153

154154
def product(*args, **kwargs):
155155
""" Yields all permutations with replacement:
156-
list(product("cat", repeat=2)) =>
157-
[("c", "c"),
158-
("c", "a"),
159-
("c", "t"),
160-
("a", "c"),
161-
("a", "a"),
162-
("a", "t"),
163-
("t", "c"),
164-
("t", "a"),
156+
list(product("cat", repeat=2)) =>
157+
[("c", "c"),
158+
("c", "a"),
159+
("c", "t"),
160+
("a", "c"),
161+
("a", "a"),
162+
("a", "t"),
163+
("t", "c"),
164+
("t", "a"),
165165
("t", "t")]
166166
"""
167167
p = [[]]
@@ -196,7 +196,7 @@ def variations(iterable, optional=lambda x: False):
196196
v = tuple(iterable[i] for i in range(len(v)) if not v[i])
197197
a.add(v)
198198
# Longest-first.
199-
return sorted(a, cmp=lambda x, y: len(y) - len(x))
199+
return sorted(a, key=len, reverse=True)
200200

201201
#### TAXONOMY ############################################################
202202

@@ -626,7 +626,7 @@ def match(self, word):
626626
Some part-of-speech-tags can also contain wildcards: NN*, VB*, JJ*, RB*, PR*.
627627
If the given word contains spaces (e.g., proper noun),
628628
the entire chunk will also be compared.
629-
For example: Constraint(words=["Mac OS X*"])
629+
For example: Constraint(words=["Mac OS X*"])
630630
matches the word "Mac" if the word occurs in a Chunk("Mac OS X 10.5").
631631
"""
632632
# If the constraint has a custom function it must return True.
@@ -918,7 +918,7 @@ def match(self, sentence, start=0, _v=None, _u=None):
918918
_u[id(sequence)] = False
919919
# Return the leftmost-longest.
920920
if len(a) > 0:
921-
return sorted(a)[0][-1]
921+
return sorted(a, key=lambda x: x[:2])[0][-1]
922922

923923
def _variations(self):
924924
v = variations(

pattern/text/tree.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -274,17 +274,23 @@ def __getattr__(self, tag):
274274
def __unicode__(self):
275275
return self.string
276276

277-
def __repr__(self):
278-
return "Word(%s)" % repr("%s/%s" % (
277+
def _repr(self):
278+
return repr("%s/%s" % (
279279
encode_entities(self.string),
280280
self.type is not None and self.type or OUTSIDE))
281281

282+
def __repr__(self):
283+
return "Word(%s)" % self._repr()
284+
282285
def __eq__(self, word):
283286
return id(self) == id(word)
284287

285288
def __ne__(self, word):
286289
return id(self) != id(word)
287290

291+
def __hash__(self):
292+
return hash(self._repr())
293+
288294

289295
class Tags(dict):
290296

pattern/vector/__init__.py

Lines changed: 37 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -364,7 +364,7 @@ def count(words=[], top=None, threshold=0, stemmer=None, exclude=[], stopwords=F
364364
if stemmer is not None:
365365
w2 = stem(w2, stemmer, **kwargs).lower()
366366
dict.__setitem__(count, w2, (w2 in count) and count[w2] + 1 or 1)
367-
for k in count.keys():
367+
for k in list(count.keys()):
368368
if count[k] <= threshold:
369369
dict.__delitem__(count, k)
370370
if top is not None:
@@ -439,11 +439,11 @@ def __init__(self, string="", **kwargs):
439439
Lists can contain tuples (of), strings or numbers.
440440
Dicts can contain tuples (of), strings or numbers as keys, and floats as values.
441441
Document.words stores a dict of (word, count)-items.
442-
Document.vector stores a dict of (word, weight)-items,
442+
Document.vector stores a dict of (word, weight)-items,
443443
where weight is the term frequency normalized (0.0-1.0) to remove document length bias.
444444
Punctuation marks are stripped from the words.
445445
Stop words in the exclude list are excluded from the document.
446-
Only top words whose count exceeds the threshold are included in the document.
446+
Only top words whose count exceeds the threshold are included in the document.
447447
"""
448448
kwargs.setdefault("filter", lambda w: w.lstrip("'").isalnum())
449449
kwargs.setdefault("threshold", 0)
@@ -524,7 +524,11 @@ def load(cls, path):
524524
# Open unicode file.
525525
s = open(path, "rb").read()
526526
s = s.lstrip(codecs.BOM_UTF8)
527-
s = decode_utf8(s)
527+
try:
528+
s = s.decode("utf-8")
529+
except AttributeError:
530+
foo
531+
528532
a = {}
529533
v = {}
530534
# Parse document name and type.
@@ -705,7 +709,7 @@ def gain_ratio(self, word):
705709
@property
706710
def vector(self):
707711
""" Yields the document vector, a dictionary of (word, relevance)-items from the document.
708-
The relevance is tf, tf * idf, infogain or binary if the document is part of a Model,
712+
The relevance is tf, tf * idf, infogain or binary if the document is part of a Model,
709713
based on the value of Model.weight (TF, TFIDF, IG, GR, BINARY, None).
710714
The document vector is used to calculate similarity between two documents,
711715
for example in a clustering or classification algorithm.
@@ -770,11 +774,16 @@ def __eq__(self, document):
770774
def __ne__(self, document):
771775
return not self.__eq__(document)
772776

777+
def _repr(self):
778+
return repr(self._id +
779+
self.name and ", name=%s" % repr(self.name) or "" +
780+
self.type and ", type=%s" % repr(self.type) or "")
781+
773782
def __repr__(self):
774-
return "Document(id=%s%s%s)" % (
775-
repr(self._id),
776-
self.name and ", name=%s" % repr(self.name) or "",
777-
self.type and ", type=%s" % repr(self.type) or "")
783+
return "Document(id=%s%s%s)" % self._repr()
784+
785+
def __hash__(self):
786+
return hash(self._repr())
778787

779788
Bag = BagOfWords = BOW = Document
780789

@@ -1000,7 +1009,7 @@ def entropy(p=[], base=None):
10001009
class Model(object):
10011010

10021011
def __init__(self, documents=[], weight=TFIDF):
1003-
""" A model is a bag-of-word representation of a corpus of documents,
1012+
""" A model is a bag-of-word representation of a corpus of documents,
10041013
where each document vector is a bag of (word, relevance)-items.
10051014
Vectors can then be compared for similarity using a distance metric.
10061015
The weighting scheme can be: relative TF, TFIDF (default), IG, BINARY, None,
@@ -1279,7 +1288,7 @@ def inverse_document_frequency(self, word, base=2.71828):
12791288

12801289
@property
12811290
def inverted_index(self):
1282-
""" Yields a dictionary of (word, set([document1, document2, ...]))-items.
1291+
""" Yields a dictionary of (word, set([document1, document2, ...]))-items.
12831292
"""
12841293
if not self._inverted:
12851294
m = {}
@@ -1367,7 +1376,7 @@ def cosine_similarity(self, document1, document2):
13671376
similarity = cos = cosine_similarity
13681377

13691378
def nearest_neighbors(self, document, top=10):
1370-
""" Returns a list of (similarity, document)-tuples in the model,
1379+
""" Returns a list of (similarity, document)-tuples in the model,
13711380
sorted by cosine similarity to the given document.
13721381
"""
13731382
v = ((self.cosine_similarity(document, d), d) for d in self.documents)
@@ -1779,7 +1788,9 @@ def __init__(self, model, k=NORM):
17791788
import numpy
17801789
# Calling Model.vector() in a loop is quite slow, we should refactor
17811790
# this:
1782-
matrix = [model.vector(d).values() for d in model.documents]
1791+
# TODO remove list
1792+
matrix = [list(model.vector(d).values())
1793+
for d in model.documents]
17831794
matrix = numpy.array(matrix)
17841795
# Singular value decomposition, where u * sigma * vt = svd(matrix).
17851796
# Sigma is the diagonal matrix of singular values,
@@ -2049,7 +2060,7 @@ def k_means(vectors, k=None, iterations=10, distance=COSINE, seed=RANDOM, **kwar
20492060

20502061

20512062
def kmpp(vectors, k, distance=COSINE):
2052-
""" The k-means++ initialization algorithm returns a set of initial clusers,
2063+
""" The k-means++ initialization algorithm returns a set of initial clusters,
20532064
with the advantage that:
20542065
- it generates better clusters than k-means(seed=RANDOM) on most data sets,
20552066
- it runs faster than standard k-means,
@@ -2390,7 +2401,7 @@ def _test(self, documents=[], target=None, **kwargs):
23902401

23912402
def auc(self, documents=[], k=10):
23922403
""" Returns the area under the ROC-curve.
2393-
Returns the probability (0.0-1.0) that a classifier will rank
2404+
Returns the probability (0.0-1.0) that a classifier will rank
23942405
a random positive document (True) higher than a random negative one (False).
23952406
"""
23962407
return self.confusion_matrix(documents).auc(k)
@@ -2660,7 +2671,8 @@ def method(self):
26602671

26612672
@property
26622673
def features(self):
2663-
return self._features.keys()
2674+
# TODO don't require list
2675+
return list(self._features.keys())
26642676

26652677
def train(self, document, type=None):
26662678
"""Trains the classifier with the given document of the given type
@@ -3195,7 +3207,7 @@ def _propagate_backward(self, output=[], rate=0.5, momentum=0.1):
31953207

31963208
def _train(self, data=[], iterations=1000, rate=0.5, momentum=0.1):
31973209
""" Trains the network with the given data using backpropagation.
3198-
The given data is a list of (input, output)-tuples,
3210+
The given data is a list of (input, output)-tuples,
31993211
where each input and output is a list of values.
32003212
For example, to learn the XOR-function:
32013213
nn = BPNN()
@@ -3316,18 +3328,18 @@ def finalize(self):
33163328
class SVM(Classifier):
33173329

33183330
def __init__(self, *args, **kwargs):
3319-
""" Support Vector Machine (SVM) is a supervised learning method
3331+
""" Support Vector Machine (SVM) is a supervised learning method
33203332
where training documents are represented as points in n-dimensional space.
33213333
The SVM constructs a number of hyperplanes that subdivide the space.
33223334
Optional parameters:
3323-
- type = CLASSIFICATION,
3324-
- kernel = LINEAR,
3325-
- degree = 3,
3326-
- gamma = 1 / len(SVM.features),
3335+
- type = CLASSIFICATION,
3336+
- kernel = LINEAR,
3337+
- degree = 3,
3338+
- gamma = 1 / len(SVM.features),
33273339
- coeff0 = 0,
3328-
- cost = 1,
3329-
- epsilon = 0.01,
3330-
- cache = 100,
3340+
- cost = 1,
3341+
- epsilon = 0.01,
3342+
- cache = 100,
33313343
- shrinking = True,
33323344
- extension = (LIBSVM, LIBLINEAR),
33333345
- train = []

test/test_vector.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,11 @@
1212
from random import seed
1313
seed(0)
1414

15+
try:
16+
xrange
17+
except NameError: # python 3
18+
xrange = range
19+
1520

1621
def model(top=None):
1722
""" Returns a Model of e-mail messages.
@@ -50,7 +55,7 @@ def test_decode_utf8(self):
5055
def test_encode_utf8(self):
5156
# Assert Python bytestring.
5257
for s in self.strings:
53-
self.assertTrue(isinstance(vector.encode_utf8(s), str))
58+
self.assertTrue(isinstance(vector.encode_utf8(s), bytes))
5459
print("pattern.vector.encode_utf8()")
5560

5661
#-------------------------------------------------------------------------
@@ -459,7 +464,7 @@ def test_frequent_concept_sets(self):
459464
# Assert Apriori algorithm.
460465
v = self.model.frequent(threshold=0.5)
461466
self.assertEqual(
462-
sorted(v.keys()), [frozenset(["dogs"]), frozenset(["cats"])])
467+
sorted(v.keys()), [frozenset(["cats"]), frozenset(["dogs"])])
463468
print("pattern.vector.Model.frequent()")
464469

465470
def test_cosine_similarity(self):

0 commit comments

Comments
 (0)