@@ -114,7 +114,8 @@ def _to_buckets(
114114 lambda row : _get_bucket_number (bucketing_info [1 ], [row [col_name ] for col_name in bucketing_info [0 ]]),
115115 axis = "columns" ,
116116 )
117- for bucket_number , subgroup in df .groupby (by = bucket_number_series , observed = True ):
117+ bucket_number_series = bucket_number_series .astype (pd .CategoricalDtype (range (bucketing_info [1 ])))
118+ for bucket_number , subgroup in df .groupby (by = bucket_number_series , observed = False ):
118119 _proxy .write (
119120 func = func ,
120121 df = subgroup ,
@@ -131,21 +132,38 @@ def _to_buckets(
131132 return paths
132133
133134
135+ def _simulate_overflow (value : int , bits : int = 31 , signed : bool = False ) -> int :
136+ base = 1 << bits
137+ value %= base
138+ return value - base if signed and value .bit_length () == bits else value
139+
140+
134141def _get_bucket_number (number_of_buckets : int , values : List [Union [str , int , bool ]]) -> int :
135142 hash_code = 0
136143 for value in values :
137144 hash_code = 31 * hash_code + _get_value_hash (value )
145+ hash_code = _simulate_overflow (hash_code )
138146
139147 return hash_code % number_of_buckets
140148
141149
142150def _get_value_hash (value : Union [str , int , bool ]) -> int :
143151 if isinstance (value , (int , np .int_ )):
152+ value = int (value )
153+ bigint_min , bigint_max = - (2 ** 63 ), 2 ** 63 - 1
154+ int_min , int_max = - (2 ** 31 ), 2 ** 31 - 1
155+ if not bigint_min <= value <= bigint_max :
156+ raise ValueError (f"{ value } exceeds the range that Athena cannot handle as bigint." )
157+ if not int_min <= value <= int_max :
158+ value = (value >> 32 ) ^ value
159+ if value < 0 :
160+ return - value - 1
144161 return int (value )
145162 if isinstance (value , (str , np .str_ )):
146163 value_hash = 0
147164 for byte in value .encode ():
148165 value_hash = value_hash * 31 + byte
166+ value_hash = _simulate_overflow (value_hash )
149167 return value_hash
150168 if isinstance (value , (bool , np .bool_ )):
151169 return int (value )
0 commit comments