@@ -5,8 +5,10 @@
 import category_encoders.utils as util
 import multiprocessing
 import pandas as pd
+import numpy as np
 import math
 import platform
+from concurrent.futures import ProcessPoolExecutor

 __author__ = 'willmcginnis', 'LiuShulun'

@@ -56,6 +58,12 @@ class HashingEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin):
     n_components: int
         how many bits to use to represent the feature. By default, we use 8 bits.
         For high-cardinality features, consider using up to 32 bits.
+    process_creation_method: string
+        either "fork", "spawn" or "forkserver" (availability depends on your
+        platform). See https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods
+        for more details and tradeoffs. Defaults to "fork" on Linux/macOS as it
+        is the fastest option, and to "spawn" on Windows as it is the only one
+        available.

     Example
     -------
@@ -103,12 +111,12 @@ class HashingEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin):
     encoding_relation = util.EncodingRelation.ONE_TO_M

     def __init__(self, max_process=0, max_sample=0, verbose=0, n_components=8, cols=None, drop_invariant=False,
-                 return_df=True, hash_method='md5'):
+                 return_df=True, hash_method='md5', process_creation_method='fork'):
         super().__init__(verbose=verbose, cols=cols, drop_invariant=drop_invariant, return_df=return_df,
                          handle_unknown="does not apply", handle_missing="does not apply")

         if max_process not in range(1, 128):
-            if platform.system == 'Windows':
+            if platform.system() == 'Windows':
                 self.max_process = 1
             else:
                 self.max_process = int(math.ceil(multiprocessing.cpu_count() / 2))
@@ -119,7 +127,10 @@ def __init__(self, max_process=0, max_sample=0, verbose=0, n_components=8, cols=
         else:
             self.max_process = max_process
         self.max_sample = int(max_sample)
-        self.auto_sample = max_sample <= 0
+        if platform.system() == 'Windows':
+            self.process_creation_method = "spawn"
+        else:
+            self.process_creation_method = process_creation_method
         self.data_lines = 0
         self.X = None

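For orientation, a minimal usage sketch of the new parameter (illustrative only; the toy dataframe and settings are invented, but the constructor signature matches the patch above):

    import pandas as pd
    from category_encoders.hashing import HashingEncoder

    X = pd.DataFrame({'color': ['red', 'green', 'blue', 'red']})
    # "fork" is the fast POSIX default; "spawn" is the portable choice
    # and the only method available on Windows.
    enc = HashingEncoder(n_components=8, max_process=2,
                         process_creation_method='spawn')
    X_hashed = enc.fit_transform(X)  # DataFrame with columns col_0..col_7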
@@ -129,87 +140,7 @@ def __init__(self, max_process=0, max_sample=0, verbose=0, n_components=8, cols=
     def _fit(self, X, y=None, **kwargs):
         pass

-    def require_data(self, data_lock, new_start, done_index, hashing_parts, process_index):
-        is_finished = False
-        while not is_finished:
-            if data_lock.acquire():
-                if new_start.value:
-                    end_index = 0
-                    new_start.value = False
-                else:
-                    end_index = done_index.value
-
-                if all([self.data_lines > 0, end_index < self.data_lines]):
-                    start_index = end_index
-                    if (self.data_lines - end_index) <= self.max_sample:
-                        end_index = self.data_lines
-                    else:
-                        end_index += self.max_sample
-                    done_index.value = end_index
-                    data_lock.release()
-
-                    data_part = self.X.iloc[start_index: end_index]
-                    # Always get df and check it after merge all data parts
-                    data_part = self.hashing_trick(X_in=data_part, hashing_method=self.hash_method,
-                                                   N=self.n_components, cols=self.cols)
-                    part_index = int(math.ceil(end_index / self.max_sample))
-                    hashing_parts.put({part_index: data_part})
-                    is_finished = end_index >= self.data_lines
-                    if self.verbose == 5:
-                        print(f"Process - {process_index} done hashing data : {start_index} ~ {end_index}")
-                else:
-                    data_lock.release()
-                    is_finished = True
-            else:
-                data_lock.release()
-
-    def _transform(self, X):
-        """
-        Call _transform_single_cpu() if you want to use single CPU with all samples
-        """
-        self.X = X
-
-        self.data_lines = len(self.X)
-
-        data_lock = multiprocessing.Manager().Lock()
-        new_start = multiprocessing.Manager().Value('d', True)
-        done_index = multiprocessing.Manager().Value('d', int(0))
-        hashing_parts = multiprocessing.Manager().Queue()
-
-        if self.auto_sample:
-            self.max_sample = int(self.data_lines / self.max_process)
-
-        if self.max_sample == 0:
-            self.max_sample = 1
-        if self.max_process == 1:
-            self.require_data(data_lock, new_start, done_index, hashing_parts, process_index=1)
-        else:
-            n_process = []
-            for thread_idx in range(self.max_process):
-                process = multiprocessing.Process(target=self.require_data,
-                                                  args=(data_lock, new_start, done_index, hashing_parts, thread_idx + 1))
-                process.daemon = True
-                n_process.append(process)
-            for process in n_process:
-                process.start()
-            for process in n_process:
-                process.join()
-        data = self.X
-        if self.max_sample == 0 or self.max_sample == self.data_lines:
-            if hashing_parts:
-                data = list(hashing_parts.get().values())[0]
-        else:
-            list_data = {}
-            while not hashing_parts.empty():
-                list_data.update(hashing_parts.get())
-            sort_data = []
-            for part_index in sorted(list_data):
-                sort_data.append(list_data[part_index])
-            if sort_data:
-                data = pd.concat(sort_data)
-        return data
-
-    def _transform_single_cpu(self, X, override_return_df=False):
+    def _transform(self, X, override_return_df=False):
         """Perform the transformation to new categorical data.

         Parameters
@@ -238,18 +169,66 @@ def _transform_single_cpu(self, X, override_return_df=False):
         if not list(self.cols):
             return X

-        X = self.hashing_trick(X, hashing_method=self.hash_method, N=self.n_components, cols=self.cols)
-
-        if self.drop_invariant:
-            X = X.drop(columns=self.invariant_cols)
-
-        if self.return_df or override_return_df:
-            return X
-        else:
-            return X.to_numpy()
+        X = self.hashing_trick(
+            X,
+            hashing_method=self.hash_method,
+            N=self.n_components,
+            cols=self.cols,
+        )
+
+        return X

     @staticmethod
-    def hashing_trick(X_in, hashing_method='md5', N=2, cols=None, make_copy=False):
+    def hash_chunk(args):
+        hash_method, np_df, N = args
+        # Calling getattr once outside the loop saves some time in the loop
+        hasher_constructor = getattr(hashlib, hash_method)
+        # Same when the call to getattr is implicit
+        int_from_bytes = int.from_bytes
+        result = np.zeros((np_df.shape[0], N), dtype='int')
+        for i, row in enumerate(np_df):
+            for val in row:
+                if val is not None:
+                    hasher = hasher_constructor()
+                    # Computes an integer index from the hasher digest. The byte order
+                    # is "big" because the code used to read:
+                    #   column_index = int(hasher.hexdigest(), 16) % N
+                    # which implicitly treats the hexdigest as big endian, even on a
+                    # little-endian system.
+                    # Building the index this way is about 30% faster than using the
+                    # hexdigest.
+                    hasher.update(bytes(str(val), 'utf-8'))
+                    column_index = int_from_bytes(hasher.digest(), byteorder='big') % N
+                    result[i, column_index] += 1
+        return result
+
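The big-endian claim in the comment above is easy to verify: hexdigest() is the big-endian hex rendering of digest(), so both index computations agree. A quick standalone check (illustrative, not part of the patch):

    import hashlib

    h = hashlib.md5(b'some value')
    assert int.from_bytes(h.digest(), byteorder='big') == int(h.hexdigest(), 16)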
+    def hashing_trick_with_np_parallel(self, df, N: int):
+        np_df = df.to_numpy()
+        ctx = multiprocessing.get_context(self.process_creation_method)
+
+        with ProcessPoolExecutor(max_workers=self.max_process, mp_context=ctx) as executor:
+            result = np.concatenate(list(
+                executor.map(
+                    self.hash_chunk,
+                    zip(
+                        [self.hash_method] * self.max_process,
+                        np.array_split(np_df, self.max_process),
+                        [N] * self.max_process
+                    )
+                )
+            ))
+
+        return pd.DataFrame(result, index=df.index)
+
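The split/map/concatenate pattern above works because executor.map yields results in submission order, so concatenating the per-chunk arrays restores the original row order. The same pattern in isolation (an illustrative sketch, not part of the patch):

    import numpy as np
    from concurrent.futures import ProcessPoolExecutor

    def double_chunk(chunk):
        return chunk * 2

    if __name__ == '__main__':  # guard required under "spawn"/"forkserver"
        data = np.arange(10)
        with ProcessPoolExecutor(max_workers=4) as ex:
            out = np.concatenate(list(ex.map(double_chunk, np.array_split(data, 4))))
        assert (out == data * 2).all()  # row order survives the round trip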
+    def hashing_trick_with_np_no_parallel(self, df, N):
+        np_df = df.to_numpy()
+
+        result = HashingEncoder.hash_chunk((self.hash_method, np_df, N))
+
+        return pd.DataFrame(result, index=df.index)
+
+
+    def hashing_trick(self, X_in, hashing_method='md5', N=2, cols=None, make_copy=False):
         """A basic hashing implementation with configurable dimensionality/precision

         Performs the hashing trick on a pandas dataframe, `X`, using the hashing method from hashlib
@@ -296,24 +275,16 @@ def hashing_trick(X_in, hashing_method='md5', N=2, cols=None, make_copy=False):
         if cols is None:
             cols = X.columns

-        def hash_fn(x):
-            tmp = [0 for _ in range(N)]
-            for val in x.array:
-                if val is not None:
-                    hasher = hashlib.new(hashing_method)
-                    if sys.version_info[0] == 2:
-                        hasher.update(str(val))
-                    else:
-                        hasher.update(bytes(str(val), 'utf-8'))
-                    tmp[int(hasher.hexdigest(), 16) % N] += 1
-            return tmp
-
         new_cols = [f'col_{d}' for d in range(N)]

         X_cat = X.loc[:, cols]
         X_num = X.loc[:, [x for x in X.columns if x not in cols]]

-        X_cat = X_cat.apply(hash_fn, axis=1, result_type='expand')
+        if self.max_process == 1:
+            X_cat = self.hashing_trick_with_np_no_parallel(X_cat, N)
+        else:
+            X_cat = self.hashing_trick_with_np_parallel(X_cat, N)
+
         X_cat.columns = new_cols

         X = pd.concat([X_cat, X_num], axis=1)
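Taken together, the dispatch above means hashing_trick can also be exercised directly on a dataframe; a hypothetical call (column names invented) would look like:

    import pandas as pd

    df = pd.DataFrame({'color': ['red', 'green'], 'amount': [1.0, 2.5]})
    enc = HashingEncoder(max_process=1, n_components=4, cols=['color'])
    hashed = enc.hashing_trick(df, hashing_method='md5', N=4, cols=['color'])
    # -> count columns col_0..col_3, with the non-categorical 'amount' kept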