11"""The hashing module contains all methods and classes related to the hashing trick."""
22
3- import sys
43import hashlib
54import category_encoders .utils as util
65import multiprocessing
@@ -179,8 +178,7 @@ def _transform(self, X, override_return_df=False):
179178 return X
180179
181180 @staticmethod
182- def hash_chunk (args ):
183- hash_method , np_df , N = args
181+ def hash_chunk (hash_method : str , np_df : np .ndarray , N : int ) -> np .ndarray :
184182 # Calling getattr outside the loop saves some time in the loop
185183 hasher_constructor = getattr (hashlib , hash_method )
186184 # Same when the call to getattr is implicit
@@ -202,31 +200,28 @@ def hash_chunk(args):
202200 result [i , column_index ] += 1
203201 return result
204202
def hashing_trick_with_np_parallel(self, df: pd.DataFrame, N: int) -> pd.DataFrame:
    """Hash the rows of ``df`` using a pool of worker processes.

    The frame is converted to a NumPy array, split row-wise into
    ``self.max_process`` chunks, and each chunk is handed to
    ``self.hash_chunk`` in a separate worker together with
    ``self.hash_method`` and ``N``. The per-chunk results are concatenated
    back together in their original order.

    Parameters
    ----------
    df : pd.DataFrame
        Data to hash.
    N : int
        Forwarded unchanged to ``hash_chunk`` (presumably the number of
        output hash buckets -- confirm against ``hash_chunk``).

    Returns
    -------
    pd.DataFrame
        The concatenated ``hash_chunk`` results, carrying ``df``'s index.
    """
    values = df.to_numpy()
    mp_ctx = multiprocessing.get_context(self.process_creation_method)
    n_workers = self.max_process

    with ProcessPoolExecutor(max_workers=n_workers, mp_context=mp_ctx) as pool:
        # One (hash_method, chunk, N) triple per worker. ``map`` yields the
        # results in submission order, so rows stay aligned with ``df.index``.
        methods = [self.hash_method] * n_workers
        chunks = np.array_split(values, n_workers)
        sizes = [N] * n_workers
        hashed_chunks = list(pool.map(self.hash_chunk, methods, chunks, sizes))

    return pd.DataFrame(np.concatenate(hashed_chunks), index=df.index)
222218
def hashing_trick_with_np_no_parallel(self, df: pd.DataFrame, N: int) -> pd.DataFrame:
    """Hash the rows of ``df`` in the current process.

    Single-process counterpart of ``hashing_trick_with_np_parallel``: the
    whole frame is converted to a NumPy array and passed to ``hash_chunk``
    in one call.

    Parameters
    ----------
    df : pd.DataFrame
        Data to hash.
    N : int
        Forwarded unchanged to ``hash_chunk`` (presumably the number of
        output hash buckets -- confirm against ``hash_chunk``).

    Returns
    -------
    pd.DataFrame
        The ``hash_chunk`` result, carrying ``df``'s index.
    """
    # Dispatch through ``self`` (as the parallel variant does) instead of
    # naming the class, so a subclass overriding ``hash_chunk`` is honoured
    # on both code paths.
    hashed = self.hash_chunk(self.hash_method, df.to_numpy(), N)

    return pd.DataFrame(hashed, index=df.index)
229-
230225
231226 def hashing_trick (self , X_in , hashing_method = 'md5' , N = 2 , cols = None , make_copy = False ):
232227 """A basic hashing implementation with configurable dimensionality/precision
0 commit comments