@@ -152,13 +152,14 @@ def tokenize_int(*args, **kwargs):
 cdef class Tokenizer(TypeDispatcher):
     def __call__(self, object obj, *args, **kwargs):
         try:
-            return super().__call__(obj, *args, **kwargs)
+            return self.get_handler(type(obj))(obj, *args, **kwargs)
         except KeyError:
             if hasattr(obj, '__mars_tokenize__') and not isinstance(obj, type):
                 if len(args) == 0 and len(kwargs) == 0:
                     return obj.__mars_tokenize__()
                 else:
-                    return super().__call__(obj.__mars_tokenize__(), *args, **kwargs)
+                    obj = obj.__mars_tokenize__()
+                    return self.get_handler(type(obj))(obj, *args, **kwargs)
             if callable(obj):
                 if PDTick is not None and not isinstance(obj, PDTick):
                     return tokenize_function(obj)
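
The handler is now resolved once with `self.get_handler(type(obj))` and invoked directly instead of going through `super().__call__`; unregistered types fall back to the object's own `__mars_tokenize__` hook, whose result is re-dispatched. A minimal pure-Python sketch of this dispatch-plus-fallback pattern (`SimpleTokenizer` and its registry are illustrative stand-ins, not the Mars API):

    # Illustrative analogue of the dispatch above; SimpleTokenizer is
    # hypothetical, not part of Mars.
    class SimpleTokenizer:
        def __init__(self):
            self._handlers = {}          # maps an exact type to its handler

        def register(self, t, handler):
            self._handlers[t] = handler

        def get_handler(self, t):
            return self._handlers[t]     # KeyError for unregistered types

        def __call__(self, obj, *args, **kwargs):
            try:
                return self.get_handler(type(obj))(obj, *args, **kwargs)
            except KeyError:
                # Fall back to the object's own hook, mirroring
                # __mars_tokenize__, then re-dispatch on the reduced value.
                if hasattr(obj, '__mars_tokenize__'):
                    obj = obj.__mars_tokenize__()
                    return self.get_handler(type(obj))(obj, *args, **kwargs)
                raise

    tokenizer = SimpleTokenizer()
    tokenizer.register(int, lambda ob: ob)

    class Point:
        def __init__(self, x, y):
            self.x, self.y = x, y
        def __mars_tokenize__(self):
            return hash((self.x, self.y))   # reduce to a registered type (int)

    assert tokenizer(Point(1, 2)) == hash((1, 2))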
@@ -176,14 +177,20 @@ cdef inline list iterative_tokenize(object ob):
     while dq_pos < len(dq):
         x = dq[dq_pos]
         dq_pos += 1
-        if isinstance(x, (list, tuple)):
+        if type(x) in _primitive_types:
+            h_list.append(x)
+        elif isinstance(x, (list, tuple)):
             dq.extend(x)
         elif isinstance(x, set):
             dq.extend(sorted(x))
         elif isinstance(x, dict):
             dq.extend(sorted(x.items()))
         else:
             h_list.append(tokenize_handler(x))
+
+        if dq_pos >= 64 and len(dq) < dq_pos * 2:  # pragma: no cover
+            dq = dq[dq_pos:]
+            dq_pos = 0
     return h_list


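`iterative_tokenize` flattens nested containers with an explicit work list rather than recursion; the new guard compacts that list whenever at least 64 items have been consumed and the consumed prefix outnumbers the remaining tail, so memory stays proportional to the unprocessed part. A self-contained sketch of the same pattern (`flatten_tokens` and its `tokenize` parameter are hypothetical names):

    # Work-list flattening with periodic compaction, as in the hunk above.
    def flatten_tokens(ob, tokenize=lambda x: x):
        dq = [ob]        # work list: acts as a queue without popleft() cost
        dq_pos = 0       # index of the next unprocessed item
        h_list = []
        while dq_pos < len(dq):
            x = dq[dq_pos]
            dq_pos += 1
            if isinstance(x, (list, tuple)):
                dq.extend(x)
            elif isinstance(x, set):
                dq.extend(sorted(x))         # sort for a deterministic token
            elif isinstance(x, dict):
                dq.extend(sorted(x.items()))
            else:
                h_list.append(tokenize(x))
            # Once >= 64 items are consumed and they outnumber the remaining
            # tail, slice them off so memory tracks the unprocessed part only.
            if dq_pos >= 64 and len(dq) < dq_pos * 2:
                dq = dq[dq_pos:]
                dq_pos = 0
        return h_list

    assert flatten_tokens([1, (2, 3), {'b': 4, 'a': 5}]) == \
        [1, 2, 3, 'a', 5, 'b', 4]
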
@@ -202,20 +209,20 @@ cdef inline tuple tokenize_numpy(ob):
                                  ob.shape, ob.strides, offset)
     if ob.dtype.hasobject:
         try:
-            data = mmh_hash_bytes('-'.join(ob.flat).encode('utf-8', errors='surrogatepass')).hex()
+            data = mmh_hash_bytes('-'.join(ob.flat).encode('utf-8', errors='surrogatepass'))
         except UnicodeDecodeError:
-            data = mmh_hash_bytes(b'-'.join([to_binary(x) for x in ob.flat])).hex()
+            data = mmh_hash_bytes(b'-'.join([to_binary(x) for x in ob.flat]))
         except TypeError:
             try:
-                data = mmh_hash_bytes(pickle.dumps(ob, pickle.HIGHEST_PROTOCOL)).hex()
+                data = mmh_hash_bytes(pickle.dumps(ob, pickle.HIGHEST_PROTOCOL))
             except:
                 # nothing can do, generate uuid
                 data = uuid.uuid4().hex
     else:
         try:
-            data = mmh_hash_bytes(ob.ravel().view('i1').data).hex()
+            data = mmh_hash_bytes(ob.ravel().view('i1').data)
         except (BufferError, AttributeError, ValueError):
-            data = mmh_hash_bytes(ob.copy().ravel().view('i1').data).hex()
+            data = mmh_hash_bytes(ob.copy().ravel().view('i1').data)
     return data, ob.dtype, ob.shape, ob.strides


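Dropping the trailing `.hex()` keeps each murmur digest as 16 raw bytes instead of a 32-character string, so intermediate tokens are half the size and skip a conversion; hex encoding can still happen once on the final token if needed. A small sketch of the trade-off using the mmh3 package, assuming `mmh_hash_bytes` in the diff wraps something like `mmh3.hash_bytes`:

    import mmh3

    payload = b"some ndarray buffer"
    digest = mmh3.hash_bytes(payload)    # 16 raw bytes, hashable and compact
    assert isinstance(digest, bytes) and len(digest) == 16

    # Before this change, every intermediate token paid for a hex conversion:
    hex_token = mmh3.hash_bytes(payload).hex()   # 32-char str, twice the size
    assert hex_token == digest.hex()
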
@@ -332,20 +339,19 @@ def tokenize_cudf(ob):
 
 cdef Tokenizer tokenize_handler = Tokenizer()
 
-base_types = (int, float, str, unicode, bytes, complex,
-              type(None), type, slice, date, datetime, timedelta)
-for t in base_types:
+cdef set _primitive_types = {
+    int, float, str, unicode, bytes, complex, type(None), type, slice, date, datetime, timedelta
+}
+for t in _primitive_types:
     tokenize_handler.register(t, lambda ob: ob)
 
 for t in (np.dtype, np.generic):
-    tokenize_handler.register(t, lambda ob: repr(ob))
+    tokenize_handler.register(t, lambda ob: ob)
 
 for t in (list, tuple, dict, set):
     tokenize_handler.register(t, iterative_tokenize)
 
 tokenize_handler.register(np.ndarray, tokenize_numpy)
-tokenize_handler.register(dict, lambda ob: iterative_tokenize(sorted(ob.items())))
-tokenize_handler.register(set, lambda ob: iterative_tokenize(sorted(ob)))
 tokenize_handler.register(np.random.RandomState, lambda ob: iterative_tokenize(ob.get_state()))
 tokenize_handler.register(memoryview, lambda ob: mmh3_hash_from_buffer(ob))
 tokenize_handler.register(Enum, tokenize_enum)
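
`_primitive_types` is a `cdef set`, so the fast path added to `iterative_tokenize` is an O(1) membership test on the exact type; the now-redundant standalone `dict`/`set` registrations are also removed, since the container loop already covers them. Unlike `isinstance`, the exact-type test lets subclasses fall through to their own registered handlers, as this short illustration shows (`TaggedInt` is hypothetical):

    _primitive_types = {int, float, str, bytes, complex, type(None)}

    class TaggedInt(int):
        pass

    assert type(1) in _primitive_types                 # exact type: fast path
    assert type(TaggedInt(1)) not in _primitive_types  # subclass falls through
    assert isinstance(TaggedInt(1), int)               # isinstance would match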