@@ -102,6 +102,8 @@ def decode_string(v, encoding="utf-8"):
102102 return unicode (v )
103103
104104
105+ # TODO: keep a single shared copy of these helpers (rather than copying and pasting them into each file)
106+ # ... if we really have to use any of them at all
105107def encode_string (v , encoding = "utf-8" ):
106108 """Returns the given value as a Python byte string (if possible)."""
107109 if isinstance (encoding , basestring ):
@@ -113,7 +115,12 @@ def encode_string(v, encoding="utf-8"):
113115 except :
114116 pass
115117 return v
116- return str (v )
118+ if isinstance (v , bytes ):
119+ return v
120+ else :
121+ # TODO: Is raising here ever the correct behaviour? (see coverage)
122+ raise ValueError ()
123+ #return str(v)
117124
118125decode_utf8 = decode_string
119126encode_utf8 = encode_string
@@ -3478,10 +3485,17 @@ def _train(self):
34783485 H2 = dict ((w , i + 1 ) for i , w in enumerate (self .classes ))
34793486 # Class reversed hash.
34803487 H3 = dict ((i + 1 , w ) for i , w in enumerate (self .classes ))
3488+
34813489 # Hashed vectors.
3482- x = map (lambda v : dict (map (lambda k : (H1 [k ], v [k ]), v )), M )
3490+ x = list (map (lambda v : dict (map (lambda k : (H1 [k ], v [k ]), v )), M ))
3491+ # TODO use this more efficient version?
3492+ # x = [dict((H1[k], v[k]) for k in v) for v in M]
3493+
34833494 # Hashed classes.
3484- y = map (lambda v : H2 [v [0 ]], self ._vectors )
3495+ y = list (map (lambda v : H2 [v [0 ]], self ._vectors ))
3496+ # TODO use this more efficient version?
3497+ # y = [H2[v[0]] for v in self._vectors]
3498+
34853499 # For linear SVC, use LIBLINEAR which is faster.
34863500 # For kernel SVC, use LIBSVM.
34873501 if self .extension == LIBLINEAR :
0 commit comments