Skip to content

Commit 5203c12

Browse files
author
PaulWestenthanner
committed
make hashing-hash chunk arguments explicit
1 parent 5c94e27 commit 5203c12

File tree

1 file changed

+7
-12
lines changed

1 file changed

+7
-12
lines changed

category_encoders/hashing.py

Lines changed: 7 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,5 @@
11
"""The hashing module contains all methods and classes related to the hashing trick."""
22

3-
import sys
43
import hashlib
54
import category_encoders.utils as util
65
import multiprocessing
@@ -179,8 +178,7 @@ def _transform(self, X, override_return_df=False):
179178
return X
180179

181180
@staticmethod
182-
def hash_chunk(args):
183-
hash_method, np_df, N = args
181+
def hash_chunk(hash_method: str, np_df: np.ndarray, N: int) -> np.ndarray:
184182
# Calling getattr outside the loop saves some time in the loop
185183
hasher_constructor = getattr(hashlib, hash_method)
186184
# Same when the call to getattr is implicit
@@ -202,31 +200,28 @@ def hash_chunk(args):
202200
result[i, column_index] += 1
203201
return result
204202

205-
def hashing_trick_with_np_parallel(self, df, N: int):
203+
def hashing_trick_with_np_parallel(self, df: pd.DataFrame, N: int) -> pd.DataFrame:
206204
np_df = df.to_numpy()
207205
ctx = multiprocessing.get_context(self.process_creation_method)
208206

209207
with ProcessPoolExecutor(max_workers=self.max_process, mp_context=ctx) as executor:
210208
result = np.concatenate(list(
211209
executor.map(
212210
self.hash_chunk,
213-
zip(
214-
[self.hash_method]*self.max_process,
215-
np.array_split(np_df, self.max_process),
216-
[N]*self.max_process
217-
)
211+
[self.hash_method]*self.max_process,
212+
np.array_split(np_df, self.max_process),
213+
[N]*self.max_process
218214
)
219215
))
220216

221217
return pd.DataFrame(result, index=df.index)
222218

223-
def hashing_trick_with_np_no_parallel(self, df, N):
219+
def hashing_trick_with_np_no_parallel(self, df: pd.DataFrame, N: int) -> pd.DataFrame:
224220
np_df = df.to_numpy()
225221

226-
result = HashingEncoder.hash_chunk((self.hash_method, np_df, N))
222+
result = HashingEncoder.hash_chunk(self.hash_method, np_df, N)
227223

228224
return pd.DataFrame(result, index=df.index)
229-
230225

231226
def hashing_trick(self, X_in, hashing_method='md5', N=2, cols=None, make_copy=False):
232227
"""A basic hashing implementation with configurable dimensionality/precision

0 commit comments

Comments
 (0)