@@ -364,7 +364,7 @@ def count(words=[], top=None, threshold=0, stemmer=None, exclude=[], stopwords=F
             if stemmer is not None:
                 w2 = stem(w2, stemmer, **kwargs).lower()
             dict.__setitem__(count, w2, (w2 in count) and count[w2] + 1 or 1)
-    for k in count.keys():
+    for k in list(count.keys()):
         if count[k] <= threshold:
             dict.__delitem__(count, k)
     if top is not None:
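In Python 3, `dict.keys()` returns a live view, so deleting entries while iterating it raises a `RuntimeError`; the `list()` copy snapshots the keys first. A minimal sketch of the failure and the fix:

```python
# Python 3: deleting from a dict while iterating its live key view fails.
counts = {"the": 5, "cat": 1, "sat": 1}
threshold = 1
try:
    for k in counts.keys():          # live view in Python 3
        if counts[k] <= threshold:
            del counts[k]
except RuntimeError as e:
    print(e)                         # dictionary changed size during iteration

# Snapshotting the keys first makes the deletion safe:
counts = {"the": 5, "cat": 1, "sat": 1}
for k in list(counts.keys()):
    if counts[k] <= threshold:
        del counts[k]
print(counts)                        # {'the': 5}
```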
@@ -439,11 +439,11 @@ def __init__(self, string="", **kwargs):
             Lists can contain tuples (of), strings or numbers.
             Dicts can contain tuples (of), strings or numbers as keys, and floats as values.
             Document.words stores a dict of (word, count)-items.
-            Document.vector stores a dict of (word, weight)-items,
+            Document.vector stores a dict of (word, weight)-items,
             where weight is the term frequency normalized (0.0-1.0) to remove document length bias.
             Punctuation marks are stripped from the words.
             Stop words in the exclude list are excluded from the document.
-            Only top words whose count exceeds the threshold are included in the document.
+            Only top words whose count exceeds the threshold are included in the document.
         """
         kwargs.setdefault("filter", lambda w: w.lstrip("'").isalnum())
         kwargs.setdefault("threshold", 0)
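As a rough sketch of the normalization described in this docstring (illustrative only, not pattern's exact code), the relative term frequency divides each word count by the document's total word count, so longer documents don't dominate:

```python
from collections import Counter

def tf_vector(words):
    # Relative term frequency: count / total word count, in 0.0-1.0.
    counts = Counter(words)
    n = float(sum(counts.values()))
    return {w: c / n for w, c in counts.items()}

print(tf_vector(["the", "cat", "sat", "the"]))
# {'the': 0.5, 'cat': 0.25, 'sat': 0.25}
```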
@@ -524,7 +524,11 @@ def load(cls, path):
         # Open unicode file.
         s = open(path, "rb").read()
         s = s.lstrip(codecs.BOM_UTF8)
-        s = decode_utf8(s)
+        try:
+            s = s.decode("utf-8")
+        except AttributeError:
+            pass  # s is already a decoded str (Python 3).
+
         a = {}
         v = {}
         # Parse document name and type.
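The `try/except` keeps the loader working whether the data arrives as `bytes` or has already been decoded: in Python 3, `str` has no `decode()` method, so the `AttributeError` branch simply leaves the string as-is. Illustration:

```python
data = "caf\xe9".encode("utf-8")    # bytes, as read from a file in "rb" mode
try:
    data = data.decode("utf-8")
except AttributeError:
    pass                            # already a str, nothing to decode
print(data)                         # café
```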
@@ -705,7 +709,7 @@ def gain_ratio(self, word):
     @property
     def vector(self):
         """ Yields the document vector, a dictionary of (word, relevance)-items from the document.
-            The relevance is tf, tf * idf, infogain or binary if the document is part of a Model,
+            The relevance is tf, tf * idf, infogain or binary if the document is part of a Model,
             based on the value of Model.weight (TF, TFIDF, IG, GR, BINARY, None).
             The document vector is used to calculate similarity between two documents,
             for example in a clustering or classification algorithm.
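For reference, a toy version of the tf * idf weighting mentioned here, with idf taken as log(N / df) over a tiny corpus; pattern's actual implementation may differ in smoothing and log base:

```python
from math import log

corpus = [["cat", "sat"], ["cat", "ran"], ["dog", "ran"]]
N = len(corpus)

def tf_idf(word, document):
    tf = document.count(word) / float(len(document))   # relative frequency
    df = sum(1 for d in corpus if word in d)           # document frequency
    return tf * log(N / float(df))

print(tf_idf("cat", corpus[0]))   # 0.5 * log(3/2), about 0.203
print(tf_idf("sat", corpus[0]))   # 0.5 * log(3/1), about 0.549
```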
@@ -770,11 +774,16 @@ def __eq__(self, document):
     def __ne__(self, document):
         return not self.__eq__(document)
 
+    def _repr(self):
+        return "Document(id=%s%s%s)" % (repr(self._id),
+            self.name and ", name=%s" % repr(self.name) or "",
+            self.type and ", type=%s" % repr(self.type) or "")
+
     def __repr__(self):
-        return "Document(id=%s%s%s)" % (
-            repr(self._id),
-            self.name and ", name=%s" % repr(self.name) or "",
-            self.type and ", type=%s" % repr(self.type) or "")
+        return self._repr()
+
+    def __hash__(self):
+        return hash(self._repr())
 
 Bag = BagOfWords = BOW = Document
 
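Adding `__hash__` here is required, not cosmetic: in Python 3, a class that defines `__eq__` without `__hash__` becomes unhashable, so documents could no longer be stored in sets or used as dict keys. A minimal demonstration:

```python
class A:
    def __eq__(self, other):
        return True

class B:
    def __eq__(self, other):
        return True
    def __hash__(self):
        return hash(id(self))

try:
    {A()}
except TypeError as e:
    print(e)        # unhashable type: 'A'

print(len({B()}))   # 1
```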
@@ -1000,7 +1009,7 @@ def entropy(p=[], base=None):
 class Model(object):
 
     def __init__(self, documents=[], weight=TFIDF):
-        """ A model is a bag-of-word representation of a corpus of documents,
+        """ A model is a bag-of-word representation of a corpus of documents,
             where each document vector is a bag of (word, relevance)-items.
             Vectors can then be compared for similarity using a distance metric.
             The weighting scheme can be: relative TF, TFIDF (default), IG, BINARY, None,
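A hedged usage sketch of the Model API as declared above (assumes the pattern.vector package is importable; Document, Model, TFIDF and similarity all appear in this file):

```python
from pattern.vector import Document, Model, TFIDF

d1 = Document("the cat sat on the mat", name="d1")
d2 = Document("the dog sat on the couch", name="d2")
m = Model(documents=[d1, d2], weight=TFIDF)
print(m.similarity(d1, d2))  # cosine similarity, 0.0-1.0
```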
@@ -1279,7 +1288,7 @@ def inverse_document_frequency(self, word, base=2.71828):
 
     @property
     def inverted_index(self):
-        """ Yields a dictionary of (word, set([document1, document2, ...]))-items.
+        """ Yields a dictionary of (word, set([document1, document2, ...]))-items.
         """
         if not self._inverted:
             m = {}
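A generic sketch of how such an inverted index is built, with plain dicts standing in for Document objects:

```python
def inverted_index(documents):
    # Map each word to the set of documents that contain it.
    m = {}
    for d in documents:
        for w in d["words"]:
            m.setdefault(w, set()).add(d["name"])
    return m

docs = [{"name": "d1", "words": ["cat", "sat"]},
        {"name": "d2", "words": ["dog", "sat"]}]
print(inverted_index(docs))
# {'cat': {'d1'}, 'sat': {'d1', 'd2'}, 'dog': {'d2'}}
```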
@@ -1367,7 +1376,7 @@ def cosine_similarity(self, document1, document2):
     similarity = cos = cosine_similarity
 
     def nearest_neighbors(self, document, top=10):
-        """ Returns a list of (similarity, document)-tuples in the model,
+        """ Returns a list of (similarity, document)-tuples in the model,
             sorted by cosine similarity to the given document.
         """
         v = ((self.cosine_similarity(document, d), d) for d in self.documents)
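The cosine similarity used for the ranking, as a self-contained sketch over sparse (word, weight) dicts:

```python
from math import sqrt

def cosine(v1, v2):
    # Dot product over the product of the norms.
    dot = sum(v1[w] * v2[w] for w in v1 if w in v2)
    n1 = sqrt(sum(x * x for x in v1.values()))
    n2 = sqrt(sum(x * x for x in v2.values()))
    return dot / (n1 * n2) if n1 and n2 else 0.0

print(cosine({"cat": 0.5, "sat": 0.5}, {"dog": 0.5, "sat": 0.5}))  # 0.5
```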
@@ -1779,7 +1788,9 @@ def __init__(self, model, k=NORM):
         import numpy
         # Calling Model.vector() in a loop is quite slow, we should refactor
         # this:
-        matrix = [model.vector(d).values() for d in model.documents]
+        # TODO: remove list()
+        matrix = [list(model.vector(d).values())
+                  for d in model.documents]
         matrix = numpy.array(matrix)
         # Singular value decomposition, where u * sigma * vt = svd(matrix).
         # Sigma is the diagonal matrix of singular values,
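The decomposition itself comes from `numpy.linalg.svd`; a minimal sketch showing the u * sigma * vt factorization and the rank reduction LSA exploits:

```python
import numpy

matrix = numpy.array([[1.0, 0.0, 1.0],
                      [0.0, 1.0, 1.0],
                      [1.0, 1.0, 0.0]])
# u * diag(sigma) * vt reconstructs the original matrix.
u, sigma, vt = numpy.linalg.svd(matrix, full_matrices=False)
print(numpy.allclose(matrix, u @ numpy.diag(sigma) @ vt))  # True
# Zeroing the smallest singular values yields a low-rank approximation.
sigma[-1] = 0.0
approx = u @ numpy.diag(sigma) @ vt
```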
@@ -2049,7 +2060,7 @@ def k_means(vectors, k=None, iterations=10, distance=COSINE, seed=RANDOM, **kwar
 
 
 def kmpp(vectors, k, distance=COSINE):
-    """ The k-means++ initialization algorithm returns a set of initial clusers,
+    """ The k-means++ initialization algorithm returns a set of initial clusters,
         with the advantage that:
         - it generates better clusters than k-means(seed=RANDOM) on most data sets,
         - it runs faster than standard k-means,
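For reference, the core of k-means++ seeding: after a random first center, each next center is sampled with probability proportional to the squared distance to the nearest center chosen so far. A compact sketch with Euclidean distance:

```python
import random

def kmpp_seed(points, k):
    # First center: uniformly at random.
    centers = [random.choice(points)]
    while len(centers) < k:
        # Squared distance of each point to its nearest chosen center.
        d2 = [min(sum((a - b) ** 2 for a, b in zip(p, c)) for c in centers)
              for p in points]
        # Sample the next center with probability proportional to d2.
        r = random.uniform(0, sum(d2))
        acc = 0.0
        for p, w in zip(points, d2):
            acc += w
            if acc >= r:
                centers.append(p)
                break
    return centers

points = [(0.0, 0.0), (0.1, 0.0), (5.0, 5.0), (5.1, 5.0)]
print(kmpp_seed(points, 2))  # typically one center from each cluster
```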
@@ -2390,7 +2401,7 @@ def _test(self, documents=[], target=None, **kwargs):
 
     def auc(self, documents=[], k=10):
         """ Returns the area under the ROC-curve.
-            Returns the probability (0.0-1.0) that a classifier will rank
+            Returns the probability (0.0-1.0) that a classifier will rank
             a random positive document (True) higher than a random negative one (False).
         """
         return self.confusion_matrix(documents).auc(k)
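That probabilistic reading of AUC can be computed directly over all (positive, negative) pairs; a toy sketch, counting ties as 0.5:

```python
def auc(scores):
    # scores: list of (predicted probability, is_positive) pairs.
    pos = [p for p, y in scores if y]
    neg = [p for p, y in scores if not y]
    pairs = [(p, n) for p in pos for n in neg]
    wins = sum(1.0 if p > n else 0.5 if p == n else 0.0 for p, n in pairs)
    return wins / len(pairs)

print(auc([(0.9, True), (0.7, True), (0.6, False), (0.2, False)]))  # 1.0
```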
@@ -2660,7 +2671,8 @@ def method(self):
 
     @property
     def features(self):
-        return self._features.keys()
+        # TODO: don't require list()
+        return list(self._features.keys())
 
     def train(self, document, type=None):
         """ Trains the classifier with the given document of the given type
@@ -3195,7 +3207,7 @@ def _propagate_backward(self, output=[], rate=0.5, momentum=0.1):
 
     def _train(self, data=[], iterations=1000, rate=0.5, momentum=0.1):
         """ Trains the network with the given data using backpropagation.
-            The given data is a list of (input, output)-tuples,
+            The given data is a list of (input, output)-tuples,
             where each input and output is a list of values.
             For example, to learn the XOR-function:
             nn = BPNN()
33163328class SVM (Classifier ):
33173329
33183330 def __init__ (self , * args , ** kwargs ):
3319- """ Support Vector Machine (SVM) is a supervised learning method
3331+ """ Support Vector Machine (SVM) is a supervised learning method
33203332 where training documents are represented as points in n-dimensional space.
33213333 The SVM constructs a number of hyperplanes that subdivide the space.
33223334 Optional parameters:
3323- - type = CLASSIFICATION,
3324- - kernel = LINEAR,
3325- - degree = 3,
3326- - gamma = 1 / len(SVM.features),
3335+ - type = CLASSIFICATION,
3336+ - kernel = LINEAR,
3337+ - degree = 3,
3338+ - gamma = 1 / len(SVM.features),
33273339 - coeff0 = 0,
3328- - cost = 1,
3329- - epsilon = 0.01,
3330- - cache = 100,
3340+ - cost = 1,
3341+ - epsilon = 0.01,
3342+ - cache = 100,
33313343 - shrinking = True,
33323344 - extension = (LIBSVM, LIBLINEAR),
33333345 - train = []
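A hypothetical usage sketch with the parameters listed above; the constant names come from this docstring, and availability depends on the bundled libsvm/liblinear extensions:

```python
# Hypothetical usage; parameter names are taken from the docstring above.
from pattern.vector import SVM, CLASSIFICATION, LINEAR

classifier = SVM(type=CLASSIFICATION, kernel=LINEAR, cost=1, epsilon=0.01)
classifier.train("a document about cats", type="cat")
classifier.train("a document about dogs", type="dog")
print(classifier.classify("cats and more cats"))  # expected: 'cat'
```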