@@ -399,10 +399,13 @@ def _read(path, encoding="utf-8", comment=";;;"):
399399 # From file or buffer.
400400 f = path
401401 for i , line in enumerate (f ):
402- line = line .strip (codecs .BOM_UTF8 ) if i == 0 and isinstance (
403- line , str ) else line
402+ line = (line .strip (codecs .BOM_UTF8 )
403+ if i == 0 and isinstance (line , bytes )
404+ else line )
405+
404406 line = line .strip ()
405- line = decode_utf8 (line , encoding )
407+ line = line .decode (encoding ) if isinstance (line , bytes ) else line
408+
406409 if not line or (comment and line .startswith (comment )):
407410 continue
408411 yield line
@@ -424,6 +427,7 @@ def load(self):
424427 # Arnold NNP x
425428 dict .update (self , (x .split (" " )[:2 ] for x in _read (self ._path )))
426429
430+
427431#--- FREQUENCY -----------------------------------------------------------
428432
429433
@@ -859,7 +863,7 @@ def __init__(self, lexicon={}, frequency={}, model=None, morphology=None, contex
859863 The given default tags are used for unknown words.
860864 Unknown words that start with a capital letter are tagged NNP (except for German).
861865 Unknown words that contain only digits and punctuation are tagged CD.
862- Optionally, morphological and contextual rules (or a language model) can be used
866+ Optionally, morphological and contextual rules (or a language model) can be used
863867 to improve the tags of unknown words.
864868 The given language can be used to discern between
865869 Germanic and Romance languages for phrase chunking.
@@ -1727,7 +1731,7 @@ def commandline(parse=Parser().parse):
17271731 # The output can be either slash-formatted string or XML.
17281732 if "xml" in arguments :
17291733 s = Tree (s , s .tags ).xml
1730- print (encode_utf8 ( s ) )
1734+ print (s )
17311735
17321736#### VERBS ###############################################################
17331737
@@ -2153,9 +2157,11 @@ def tenses(self, verb, parse=True):
21532157 for id1 , id2 in self ._default .items ():
21542158 if id2 in a :
21552159 a .add (id1 )
2156- a = (TENSES [id ][:- 2 ] for id in a )
2157- a = Tenses (sorted (a ))
2158- return a
2160+ t = (TENSES [id ][:- 2 ] for id in a )
2161+ # TODO fix this hack
2162+ t = Tenses (sorted (t , key = lambda x : (x [0 ] or '' , x [1 ] or 0 , x [2 ] or '' ,
2163+ x [3 ] or '' , x [4 ] or '' )))
2164+ return t
21592165
21602166 def find_lemma (self , verb ):
21612167 # Must be overridden in a subclass.
@@ -2289,14 +2295,14 @@ def load(self, path=None):
22892295 self ._language = xml .attrib .get ("language" , self ._language )
22902296 # Average scores of all word senses per part-of-speech tag.
22912297 for w in words :
2292- words [w ] = dict ((pos , map ( avg , zip (* psi )) )
2298+ words [w ] = dict ((pos , [ avg ( x ) for x in zip (* psi )] )
22932299 for pos , psi in words [w ].items ())
22942300 # Average scores of all part-of-speech tags.
22952301 for w , pos in words .items ():
2296- words [w ][None ] = map ( avg , zip (* pos .values ()))
2302+ words [w ][None ] = [ avg ( x ) for x in zip (* pos .values ())]
22972303 # Average scores of all synonyms per synset.
22982304 for id , psi in synsets .items ():
2299- synsets [id ] = map ( avg , zip (* psi ))
2305+ synsets [id ] = [ avg ( x ) for x in zip (* psi )]
23002306 dict .update (self , words )
23012307 dict .update (self .labeler , labels )
23022308 dict .update (self ._synsets , synsets )
@@ -2628,7 +2634,7 @@ def suggest(self, w):
26282634def _module (language ):
26292635 """ Returns the given language module (e.g., "en" => pattern.en).
26302636 """
2631- return _modules .setdefault (language , __import__ (language , globals (), {}, [], - 1 ))
2637+ return _modules .setdefault (language , __import__ (language , globals (), {}, [], 1 ))
26322638
26332639
26342640def _multilingual (function , * args , ** kwargs ):
0 commit comments