Commit 5c94e27

Merge pull request #428 from bkhant1/all_optis
Optimise `HashingEncoder` for both large and small dataframes
2 parents 26ef261 + e2c1b79 commit 5c94e27

3 files changed: +97 -107 lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
@@ -1,5 +1,10 @@
 unreleased
 ==========
+* improved: performance of the hashing encoder (about twice as fast)
+* deprecate the `max_sample` parameter, it has no use anymore
+* add `process_creation_method` parameter
+* use concurrent.futures.ProcessPoolExecutor instead of hand-managed queues
+* optimisations to hashlib calls, remove python 2 checks, fork instead of spawn
 
 v2.6.3
 ======
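
As a quick illustration of the API change listed above (a sketch, not an example shipped with the commit; it assumes `category_encoders` is imported as `encoders`, mirroring the test added below): `process_creation_method` is a new constructor argument, while `max_sample` is deprecated and no longer has any effect.

    import pandas as pd
    import category_encoders as encoders

    df = pd.DataFrame({"strings": ["aaaa", "bbbb", "cccc"]})

    # "fork" is the default on linux/macos; on windows the encoder always
    # falls back to "spawn", the only start method available there.
    encoder = encoders.HashingEncoder(
        n_components=8,
        max_process=2,
        process_creation_method="fork",
    )
    encoded = encoder.fit_transform(df)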

category_encoders/hashing.py

Lines changed: 78 additions & 107 deletions
@@ -5,8 +5,10 @@
 import category_encoders.utils as util
 import multiprocessing
 import pandas as pd
+import numpy as np
 import math
 import platform
+from concurrent.futures import ProcessPoolExecutor
 
 __author__ = 'willmcginnis', 'LiuShulun'
 
@@ -56,6 +58,12 @@ class HashingEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin):
     n_components: int
         how many bits to use to represent the feature. By default, we use 8 bits.
         For high-cardinality features, consider using up-to 32 bits.
+    process_creation_method: string
+        either "fork", "spawn" or "forkserver" (availability depends on your
+        platform). See https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods
+        for more details and tradeoffs. Defaults to "fork" on linux/macos as it
+        is the fastest option and to "spawn" on windows as it is the only one
+        available
 
     Example
     -------
@@ -103,12 +111,12 @@ class HashingEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin):
     encoding_relation = util.EncodingRelation.ONE_TO_M
 
     def __init__(self, max_process=0, max_sample=0, verbose=0, n_components=8, cols=None, drop_invariant=False,
-                 return_df=True, hash_method='md5'):
+                 return_df=True, hash_method='md5', process_creation_method='fork'):
         super().__init__(verbose=verbose, cols=cols, drop_invariant=drop_invariant, return_df=return_df,
                          handle_unknown="does not apply", handle_missing="does not apply")
 
         if max_process not in range(1, 128):
-            if platform.system == 'Windows':
+            if platform.system() == 'Windows':
                 self.max_process = 1
             else:
                 self.max_process = int(math.ceil(multiprocessing.cpu_count() / 2))
@@ -119,7 +127,10 @@ def __init__(self, max_process=0, max_sample=0, verbose=0, n_components=8, cols=
         else:
             self.max_process = max_process
         self.max_sample = int(max_sample)
-        self.auto_sample = max_sample <= 0
+        if platform.system() == 'Windows':
+            self.process_creation_method = "spawn"
+        else:
+            self.process_creation_method = process_creation_method
         self.data_lines = 0
         self.X = None
 
@@ -129,87 +140,7 @@ def __init__(self, max_process=0, max_sample=0, verbose=0, n_components=8, cols=
     def _fit(self, X, y=None, **kwargs):
         pass
 
-    def require_data(self, data_lock, new_start, done_index, hashing_parts, process_index):
-        is_finished = False
-        while not is_finished:
-            if data_lock.acquire():
-                if new_start.value:
-                    end_index = 0
-                    new_start.value = False
-                else:
-                    end_index = done_index.value
-
-                if all([self.data_lines > 0, end_index < self.data_lines]):
-                    start_index = end_index
-                    if (self.data_lines - end_index) <= self.max_sample:
-                        end_index = self.data_lines
-                    else:
-                        end_index += self.max_sample
-                    done_index.value = end_index
-                    data_lock.release()
-
-                    data_part = self.X.iloc[start_index: end_index]
-                    # Always get df and check it after merge all data parts
-                    data_part = self.hashing_trick(X_in=data_part, hashing_method=self.hash_method,
-                                                   N=self.n_components, cols=self.cols)
-                    part_index = int(math.ceil(end_index / self.max_sample))
-                    hashing_parts.put({part_index: data_part})
-                    is_finished = end_index >= self.data_lines
-                    if self.verbose == 5:
-                        print(f"Process - {process_index} done hashing data : {start_index} ~ {end_index}")
-                else:
-                    data_lock.release()
-                    is_finished = True
-            else:
-                data_lock.release()
-
-    def _transform(self, X):
-        """
-        Call _transform_single_cpu() if you want to use single CPU with all samples
-        """
-        self.X = X
-
-        self.data_lines = len(self.X)
-
-        data_lock = multiprocessing.Manager().Lock()
-        new_start = multiprocessing.Manager().Value('d', True)
-        done_index = multiprocessing.Manager().Value('d', int(0))
-        hashing_parts = multiprocessing.Manager().Queue()
-
-        if self.auto_sample:
-            self.max_sample = int(self.data_lines / self.max_process)
-
-        if self.max_sample == 0:
-            self.max_sample = 1
-        if self.max_process == 1:
-            self.require_data(data_lock, new_start, done_index, hashing_parts, process_index=1)
-        else:
-            n_process = []
-            for thread_idx in range(self.max_process):
-                process = multiprocessing.Process(target=self.require_data,
-                                                  args=(data_lock, new_start, done_index, hashing_parts, thread_idx + 1))
-                process.daemon = True
-                n_process.append(process)
-            for process in n_process:
-                process.start()
-            for process in n_process:
-                process.join()
-        data = self.X
-        if self.max_sample == 0 or self.max_sample == self.data_lines:
-            if hashing_parts:
-                data = list(hashing_parts.get().values())[0]
-        else:
-            list_data = {}
-            while not hashing_parts.empty():
-                list_data.update(hashing_parts.get())
-            sort_data = []
-            for part_index in sorted(list_data):
-                sort_data.append(list_data[part_index])
-            if sort_data:
-                data = pd.concat(sort_data)
-        return data
-
-    def _transform_single_cpu(self, X, override_return_df=False):
+    def _transform(self, X, override_return_df=False):
         """Perform the transformation to new categorical data.
 
         Parameters
@@ -238,18 +169,66 @@ def _transform_single_cpu(self, X, override_return_df=False):
         if not list(self.cols):
             return X
 
-        X = self.hashing_trick(X, hashing_method=self.hash_method, N=self.n_components, cols=self.cols)
-
-        if self.drop_invariant:
-            X = X.drop(columns=self.invariant_cols)
-
-        if self.return_df or override_return_df:
-            return X
-        else:
-            return X.to_numpy()
+        X = self.hashing_trick(
+            X,
+            hashing_method=self.hash_method,
+            N=self.n_components,
+            cols=self.cols,
+        )
+
+        return X
 
     @staticmethod
-    def hashing_trick(X_in, hashing_method='md5', N=2, cols=None, make_copy=False):
+    def hash_chunk(args):
+        hash_method, np_df, N = args
+        # Calling getattr outside the loop saves some time in the loop
+        hasher_constructor = getattr(hashlib, hash_method)
+        # Same when the call to getattr is implicit
+        int_from_bytes = int.from_bytes
+        result = np.zeros((np_df.shape[0], N), dtype='int')
+        for i, row in enumerate(np_df):
+            for val in row:
+                if val is not None:
+                    hasher = hasher_constructor()
+                    # Computes an integer index from the hasher digest. The endian is
+                    # "big" as the code used to read:
+                    # column_index = int(hasher.hexdigest(), 16) % N
+                    # which is implicitly considering the hexdigest to be big endian,
+                    # even if the system is little endian.
+                    # Building the index that way is about 30% faster than using the
+                    # hexdigest.
+                    hasher.update(bytes(str(val), 'utf-8'))
+                    column_index = int_from_bytes(hasher.digest(), byteorder='big') % N
+                    result[i, column_index] += 1
+        return result
+
+    def hashing_trick_with_np_parallel(self, df, N: int):
+        np_df = df.to_numpy()
+        ctx = multiprocessing.get_context(self.process_creation_method)
+
+        with ProcessPoolExecutor(max_workers=self.max_process, mp_context=ctx) as executor:
+            result = np.concatenate(list(
+                executor.map(
+                    self.hash_chunk,
+                    zip(
+                        [self.hash_method]*self.max_process,
+                        np.array_split(np_df, self.max_process),
+                        [N]*self.max_process
+                    )
+                )
+            ))
+
+        return pd.DataFrame(result, index=df.index)
+
+    def hashing_trick_with_np_no_parallel(self, df, N):
+        np_df = df.to_numpy()
+
+        result = HashingEncoder.hash_chunk((self.hash_method, np_df, N))
+
+        return pd.DataFrame(result, index=df.index)
+
+
+    def hashing_trick(self, X_in, hashing_method='md5', N=2, cols=None, make_copy=False):
         """A basic hashing implementation with configurable dimensionality/precision
 
         Performs the hashing trick on a pandas dataframe, `X`, using the hashing method from hashlib
@@ -296,24 +275,16 @@ def hashing_trick(X_in, hashing_method='md5', N=2, cols=None, make_copy=False):
         if cols is None:
             cols = X.columns
 
-        def hash_fn(x):
-            tmp = [0 for _ in range(N)]
-            for val in x.array:
-                if val is not None:
-                    hasher = hashlib.new(hashing_method)
-                    if sys.version_info[0] == 2:
-                        hasher.update(str(val))
-                    else:
-                        hasher.update(bytes(str(val), 'utf-8'))
-                    tmp[int(hasher.hexdigest(), 16) % N] += 1
-            return tmp
-
         new_cols = [f'col_{d}' for d in range(N)]
 
         X_cat = X.loc[:, cols]
         X_num = X.loc[:, [x for x in X.columns if x not in cols]]
 
-        X_cat = X_cat.apply(hash_fn, axis=1, result_type='expand')
+        if self.max_process == 1:
+            X_cat = self.hashing_trick_with_np_no_parallel(X_cat, N)
+        else:
+            X_cat = self.hashing_trick_with_np_parallel(X_cat, N)
+
         X_cat.columns = new_cols
 
         X = pd.concat([X_cat, X_num], axis=1)
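
A side note on the endianness comment in the new `hash_chunk` (a sketch for illustration, not code from the commit): parsing the hex digest as base 16 and reading the raw digest bytes big-endian yield the same integer, so the faster digest-based index puts each value in the same bucket as the old hexdigest-based code.

    import hashlib

    val, N = "some_category", 8
    hasher = hashlib.md5(bytes(str(val), "utf-8"))
    # The hex digest is just the big-endian hex rendering of the digest bytes,
    # so both bucket indices are identical.
    old_index = int(hasher.hexdigest(), 16) % N
    new_index = int.from_bytes(hasher.digest(), byteorder="big") % N
    assert old_index == new_index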

tests/test_hashing.py

Lines changed: 14 additions & 0 deletions
@@ -42,3 +42,17 @@ def test_transform_works_with_single_row_df(self):
                     set(target_columns))
             ) == df_encoded_multi_process.shape[1]
         )
+
+    def test_simple_example(self):
+        df = pd.DataFrame({
+            'strings': ["aaaa", "bbbb", "cccc"],
+            "more_strings": ["aaaa", "dddd", "eeee"],
+        })
+        encoder = encoders.HashingEncoder(n_components=4, max_process=2)
+        encoder.fit(df)
+        assert encoder.transform(df).equals(pd.DataFrame({
+            "col_0": [0,1,1],
+            "col_1": [2,0,1],
+            "col_2": [0,1,0],
+            "col_3": [0,0,0]
+        }))
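
The exact expected frame in `test_simple_example` with `max_process=2` relies on the parallel path keeping rows in their original order. A minimal sketch (illustrative only, not part of the commit) of why chunking and re-concatenating preserves that order:

    import numpy as np

    # np.array_split keeps rows in order within and across chunks, and
    # concatenating the chunk results in submission order (which
    # executor.map guarantees) restores the original row order.
    rows = np.arange(12).reshape(6, 2)
    chunks = np.array_split(rows, 2)      # one chunk per worker
    reassembled = np.concatenate(chunks)
    assert (reassembled == rows).all()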
