11"""Amazon S3 Write Dataset (PRIVATE)."""
22
33import logging
4- import uuid
54from typing import Any , Callable , Dict , List , Optional , Tuple , Union
65
76import boto3
@@ -24,12 +23,12 @@ def _to_partitions(
2423 mode : str ,
2524 partition_cols : List [str ],
2625 bucketing_info : Optional [Tuple [List [str ], int ]],
26+ filename_prefix : str ,
2727 boto3_session : boto3 .Session ,
2828 ** func_kwargs : Any ,
2929) -> Tuple [List [str ], Dict [str , List [str ]]]:
3030 partitions_values : Dict [str , List [str ]] = {}
3131 proxy : _WriteProxy = _WriteProxy (use_threads = concurrent_partitioning )
32- filename_prefix = uuid .uuid4 ().hex
3332
3433 for keys , subgroup in df .groupby (by = partition_cols , observed = True ):
3534 subgroup = subgroup .drop (partition_cols , axis = "columns" )
@@ -60,6 +59,7 @@ def _to_partitions(
6059 func = func ,
6160 df = subgroup ,
6261 path_root = prefix ,
62+ filename_prefix = filename_prefix ,
6363 boto3_session = boto3_session ,
6464 use_threads = use_threads ,
6565 ** func_kwargs ,
@@ -74,25 +74,23 @@ def _to_buckets(
7474 df : pd .DataFrame ,
7575 path_root : str ,
7676 bucketing_info : Tuple [List [str ], int ],
77+ filename_prefix : str ,
7778 boto3_session : boto3 .Session ,
7879 use_threads : bool ,
7980 proxy : Optional [_WriteProxy ] = None ,
80- filename_prefix : Optional [str ] = None ,
8181 ** func_kwargs : Any ,
8282) -> List [str ]:
8383 _proxy : _WriteProxy = proxy if proxy else _WriteProxy (use_threads = False )
8484 bucket_number_series = df .apply (
8585 lambda row : _get_bucket_number (bucketing_info [1 ], [row [col_name ] for col_name in bucketing_info [0 ]]),
8686 axis = "columns" ,
8787 )
88- if filename_prefix is None :
89- filename_prefix = uuid .uuid4 ().hex
9088 for bucket_number , subgroup in df .groupby (by = bucket_number_series , observed = True ):
9189 _proxy .write (
9290 func = func ,
9391 df = subgroup ,
9492 path_root = path_root ,
95- filename = f"{ filename_prefix } _bucket-{ bucket_number :05d} " ,
93+ filename_prefix = f"{ filename_prefix } _bucket-{ bucket_number :05d} " ,
9694 boto3_session = boto3_session ,
9795 use_threads = use_threads ,
9896 ** func_kwargs ,
@@ -133,6 +131,7 @@ def _to_dataset(
133131 concurrent_partitioning : bool ,
134132 df : pd .DataFrame ,
135133 path_root : str ,
134+ filename_prefix : str ,
136135 index : bool ,
137136 use_threads : bool ,
138137 mode : str ,
@@ -168,6 +167,7 @@ def _to_dataset(
168167 use_threads = use_threads ,
169168 mode = mode ,
170169 bucketing_info = bucketing_info ,
170+ filename_prefix = filename_prefix ,
171171 partition_cols = partition_cols ,
172172 boto3_session = boto3_session ,
173173 index = index ,
@@ -180,13 +180,20 @@ def _to_dataset(
180180 path_root = path_root ,
181181 use_threads = use_threads ,
182182 bucketing_info = bucketing_info ,
183+ filename_prefix = filename_prefix ,
183184 boto3_session = boto3_session ,
184185 index = index ,
185186 ** func_kwargs ,
186187 )
187188 else :
188189 paths = func (
189- df = df , path_root = path_root , use_threads = use_threads , boto3_session = boto3_session , index = index , ** func_kwargs
190+ df = df ,
191+ path_root = path_root ,
192+ filename_prefix = filename_prefix ,
193+ use_threads = use_threads ,
194+ boto3_session = boto3_session ,
195+ index = index ,
196+ ** func_kwargs ,
190197 )
191198 _logger .debug ("paths: %s" , paths )
192199 _logger .debug ("partitions_values: %s" , partitions_values )
0 commit comments