@@ -24,8 +24,10 @@ def _load_mode_and_filename_prefix(*, mode: str | None, filename_prefix: str | N
2424 mode = "append"
2525
2626 if mode == "overwrite_files" :
27+ # In `overwrite_files` mode, we need to create a deterministic
28+ # filename to ensure that the same file is overwritten:
2729 if filename_prefix is None :
28- filename_prefix = "part "
30+ filename_prefix = "data "
2931 random_filename_suffix = ""
3032 mode = "append"
3133 else :
@@ -68,8 +70,8 @@ def _get_bucket_number(number_of_buckets: int, values: list[str | int | bool]) -
6870def _get_value_hash (value : str | int | bool ) -> int :
6971 if isinstance (value , (int , np .int_ )):
7072 value = int (value )
71- bigint_min , bigint_max = - (2 ** 63 ), 2 ** 63 - 1
72- int_min , int_max = - (2 ** 31 ), 2 ** 31 - 1
73+ bigint_min , bigint_max = - (2 ** 63 ), 2 ** 63 - 1
74+ int_min , int_max = - (2 ** 31 ), 2 ** 31 - 1
7375 if not bigint_min <= value <= bigint_max :
7476 raise ValueError (f"{ value } exceeds the range that Athena cannot handle as bigint." )
7577 if not int_min <= value <= int_max :
@@ -97,13 +99,13 @@ def _get_subgroup_prefix(keys: tuple[str, None], partition_cols: list[str], path
9799
98100
99101def _delete_objects (
100- keys : tuple [str , None ],
101- path_root : str ,
102- use_threads : bool | int ,
103- mode : str ,
104- partition_cols : list [str ],
105- boto3_session : boto3 .Session | None = None ,
106- ** func_kwargs : Any ,
102+ keys : tuple [str , None ],
103+ path_root : str ,
104+ use_threads : bool | int ,
105+ mode : str ,
106+ partition_cols : list [str ],
107+ boto3_session : boto3 .Session | None = None ,
108+ ** func_kwargs : Any ,
107109) -> str :
108110 # Keys are either a primitive type or a tuple if partitioning by multiple cols
109111 keys = (keys ,) if not isinstance (keys , tuple ) else keys
@@ -120,17 +122,17 @@ def _delete_objects(
120122
121123@engine .dispatch_on_engine
122124def _to_partitions (
123- df : pd .DataFrame ,
124- func : Callable [..., list [str ]],
125- concurrent_partitioning : bool ,
126- path_root : str ,
127- use_threads : bool | int ,
128- mode : str ,
129- partition_cols : list [str ],
130- bucketing_info : typing .BucketingInfoTuple | None ,
131- filename_prefix : str ,
132- boto3_session : boto3 .Session | None ,
133- ** func_kwargs : Any ,
125+ df : pd .DataFrame ,
126+ func : Callable [..., list [str ]],
127+ concurrent_partitioning : bool ,
128+ path_root : str ,
129+ use_threads : bool | int ,
130+ mode : str ,
131+ partition_cols : list [str ],
132+ bucketing_info : typing .BucketingInfoTuple | None ,
133+ filename_prefix : str ,
134+ boto3_session : boto3 .Session | None ,
135+ ** func_kwargs : Any ,
134136) -> tuple [list [str ], dict [str , list [str ]]]:
135137 partitions_values : dict [str , list [str ]] = {}
136138 proxy : _WriteProxy = _WriteProxy (use_threads = concurrent_partitioning )
@@ -187,15 +189,15 @@ def _to_partitions(
187189
188190@engine .dispatch_on_engine
189191def _to_buckets (
190- df : pd .DataFrame ,
191- func : Callable [..., list [str ]],
192- path_root : str ,
193- bucketing_info : typing .BucketingInfoTuple ,
194- filename_prefix : str ,
195- boto3_session : boto3 .Session | None ,
196- use_threads : bool | int ,
197- proxy : _WriteProxy | None = None ,
198- ** func_kwargs : Any ,
192+ df : pd .DataFrame ,
193+ func : Callable [..., list [str ]],
194+ path_root : str ,
195+ bucketing_info : typing .BucketingInfoTuple ,
196+ filename_prefix : str ,
197+ boto3_session : boto3 .Session | None ,
198+ use_threads : bool | int ,
199+ proxy : _WriteProxy | None = None ,
200+ ** func_kwargs : Any ,
199201) -> list [str ]:
200202 _proxy : _WriteProxy = proxy if proxy else _WriteProxy (use_threads = False )
201203 s3_client = client (service_name = "s3" , session = boto3_session )
@@ -216,18 +218,18 @@ def _to_buckets(
216218
217219
218220def _to_dataset (
219- func : Callable [..., list [str ]],
220- concurrent_partitioning : bool ,
221- df : pd .DataFrame ,
222- path_root : str ,
223- filename_prefix : str | None ,
224- index : bool ,
225- use_threads : bool | int ,
226- mode : str ,
227- partition_cols : list [str ] | None ,
228- bucketing_info : typing .BucketingInfoTuple | None ,
229- boto3_session : boto3 .Session | None ,
230- ** func_kwargs : Any ,
221+ func : Callable [..., list [str ]],
222+ concurrent_partitioning : bool ,
223+ df : pd .DataFrame ,
224+ path_root : str ,
225+ filename_prefix : str | None ,
226+ index : bool ,
227+ use_threads : bool | int ,
228+ mode : str ,
229+ partition_cols : list [str ] | None ,
230+ bucketing_info : typing .BucketingInfoTuple | None ,
231+ boto3_session : boto3 .Session | None ,
232+ ** func_kwargs : Any ,
231233) -> tuple [list [str ], dict [str , list [str ]]]:
232234 path_root = path_root if path_root .endswith ("/" ) else f"{ path_root } /"
233235 # Evaluate mode
0 commit comments