Make HashingEncoder.hash_chunk arguments explicit

PaulWestenthanner · PaulWestenthanner · commit 5203c12627e1 · 2023-11-11T15:43:17.000+01:00
diff --git a/category_encoders/hashing.py b/category_encoders/hashing.py
@@ -1,6 +1,5 @@
 """The hashing module contains all methods and classes related to the hashing trick."""
 
-import sys
 import hashlib
 import category_encoders.utils as util
 import multiprocessing
@@ -179,8 +178,7 @@ def _transform(self, X, override_return_df=False):
         return X
 
     @staticmethod
-    def hash_chunk(args):
-        hash_method, np_df, N = args
+    def hash_chunk(hash_method: str, np_df: np.ndarray, N: int) -> np.ndarray:
         # Calling getattr outside the loop saves some time in the loop
         hasher_constructor = getattr(hashlib, hash_method)
         # Same when the call to getattr is implicit
@@ -202,31 +200,28 @@ def hash_chunk(args):
                     result[i, column_index] += 1
         return result
 
-    def hashing_trick_with_np_parallel(self, df, N: int):
+    def hashing_trick_with_np_parallel(self, df: pd.DataFrame, N: int) -> pd.DataFrame:
         np_df = df.to_numpy()
         ctx = multiprocessing.get_context(self.process_creation_method)
 
         with ProcessPoolExecutor(max_workers=self.max_process, mp_context=ctx) as executor:
             result = np.concatenate(list(
                 executor.map(
                     self.hash_chunk,
-                    zip(
-                        [self.hash_method]*self.max_process,
-                        np.array_split(np_df, self.max_process),
-                        [N]*self.max_process
-                    )
+                    [self.hash_method]*self.max_process,
+                    np.array_split(np_df, self.max_process),
+                    [N]*self.max_process
                 )
             ))
 
         return pd.DataFrame(result, index=df.index)
 
-    def hashing_trick_with_np_no_parallel(self, df, N):
+    def hashing_trick_with_np_no_parallel(self, df: pd.DataFrame, N: int) -> pd.DataFrame:
         np_df = df.to_numpy()
 
-        result = HashingEncoder.hash_chunk((self.hash_method, np_df, N))
+        result = HashingEncoder.hash_chunk(self.hash_method, np_df, N)
 
         return pd.DataFrame(result, index=df.index)
-        
 
     def hashing_trick(self, X_in, hashing_method='md5', N=2, cols=None, make_copy=False):
         """A basic hashing implementation with configurable dimensionality/precision