Skip to content

Commit 1b879bf

Browse files
committed
Added info comment
1 parent 2fe891d commit 1b879bf

File tree

1 file changed

+44
-42
lines changed

1 file changed

+44
-42
lines changed

awswrangler/s3/_write_dataset.py

Lines changed: 44 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,10 @@ def _load_mode_and_filename_prefix(*, mode: str | None, filename_prefix: str | N
2424
mode = "append"
2525

2626
if mode == "overwrite_files":
27+
# In `overwrite_files` mode, we need to create a deterministic
28+
# filename to ensure that the same file is overwritten:
2729
if filename_prefix is None:
28-
filename_prefix = "part"
30+
filename_prefix = "data"
2931
random_filename_suffix = ""
3032
mode = "append"
3133
else:
@@ -68,8 +70,8 @@ def _get_bucket_number(number_of_buckets: int, values: list[str | int | bool]) -
6870
def _get_value_hash(value: str | int | bool) -> int:
6971
if isinstance(value, (int, np.int_)):
7072
value = int(value)
71-
bigint_min, bigint_max = -(2**63), 2**63 - 1
72-
int_min, int_max = -(2**31), 2**31 - 1
73+
bigint_min, bigint_max = -(2 ** 63), 2 ** 63 - 1
74+
int_min, int_max = -(2 ** 31), 2 ** 31 - 1
7375
if not bigint_min <= value <= bigint_max:
7476
raise ValueError(f"{value} exceeds the range that Athena cannot handle as bigint.")
7577
if not int_min <= value <= int_max:
@@ -97,13 +99,13 @@ def _get_subgroup_prefix(keys: tuple[str, None], partition_cols: list[str], path
9799

98100

99101
def _delete_objects(
100-
keys: tuple[str, None],
101-
path_root: str,
102-
use_threads: bool | int,
103-
mode: str,
104-
partition_cols: list[str],
105-
boto3_session: boto3.Session | None = None,
106-
**func_kwargs: Any,
102+
keys: tuple[str, None],
103+
path_root: str,
104+
use_threads: bool | int,
105+
mode: str,
106+
partition_cols: list[str],
107+
boto3_session: boto3.Session | None = None,
108+
**func_kwargs: Any,
107109
) -> str:
108110
# Keys are either a primitive type or a tuple if partitioning by multiple cols
109111
keys = (keys,) if not isinstance(keys, tuple) else keys
@@ -120,17 +122,17 @@ def _delete_objects(
120122

121123
@engine.dispatch_on_engine
122124
def _to_partitions(
123-
df: pd.DataFrame,
124-
func: Callable[..., list[str]],
125-
concurrent_partitioning: bool,
126-
path_root: str,
127-
use_threads: bool | int,
128-
mode: str,
129-
partition_cols: list[str],
130-
bucketing_info: typing.BucketingInfoTuple | None,
131-
filename_prefix: str,
132-
boto3_session: boto3.Session | None,
133-
**func_kwargs: Any,
125+
df: pd.DataFrame,
126+
func: Callable[..., list[str]],
127+
concurrent_partitioning: bool,
128+
path_root: str,
129+
use_threads: bool | int,
130+
mode: str,
131+
partition_cols: list[str],
132+
bucketing_info: typing.BucketingInfoTuple | None,
133+
filename_prefix: str,
134+
boto3_session: boto3.Session | None,
135+
**func_kwargs: Any,
134136
) -> tuple[list[str], dict[str, list[str]]]:
135137
partitions_values: dict[str, list[str]] = {}
136138
proxy: _WriteProxy = _WriteProxy(use_threads=concurrent_partitioning)
@@ -187,15 +189,15 @@ def _to_partitions(
187189

188190
@engine.dispatch_on_engine
189191
def _to_buckets(
190-
df: pd.DataFrame,
191-
func: Callable[..., list[str]],
192-
path_root: str,
193-
bucketing_info: typing.BucketingInfoTuple,
194-
filename_prefix: str,
195-
boto3_session: boto3.Session | None,
196-
use_threads: bool | int,
197-
proxy: _WriteProxy | None = None,
198-
**func_kwargs: Any,
192+
df: pd.DataFrame,
193+
func: Callable[..., list[str]],
194+
path_root: str,
195+
bucketing_info: typing.BucketingInfoTuple,
196+
filename_prefix: str,
197+
boto3_session: boto3.Session | None,
198+
use_threads: bool | int,
199+
proxy: _WriteProxy | None = None,
200+
**func_kwargs: Any,
199201
) -> list[str]:
200202
_proxy: _WriteProxy = proxy if proxy else _WriteProxy(use_threads=False)
201203
s3_client = client(service_name="s3", session=boto3_session)
@@ -216,18 +218,18 @@ def _to_buckets(
216218

217219

218220
def _to_dataset(
219-
func: Callable[..., list[str]],
220-
concurrent_partitioning: bool,
221-
df: pd.DataFrame,
222-
path_root: str,
223-
filename_prefix: str | None,
224-
index: bool,
225-
use_threads: bool | int,
226-
mode: str,
227-
partition_cols: list[str] | None,
228-
bucketing_info: typing.BucketingInfoTuple | None,
229-
boto3_session: boto3.Session | None,
230-
**func_kwargs: Any,
221+
func: Callable[..., list[str]],
222+
concurrent_partitioning: bool,
223+
df: pd.DataFrame,
224+
path_root: str,
225+
filename_prefix: str | None,
226+
index: bool,
227+
use_threads: bool | int,
228+
mode: str,
229+
partition_cols: list[str] | None,
230+
bucketing_info: typing.BucketingInfoTuple | None,
231+
boto3_session: boto3.Session | None,
232+
**func_kwargs: Any,
231233
) -> tuple[list[str], dict[str, list[str]]]:
232234
path_root = path_root if path_root.endswith("/") else f"{path_root}/"
233235
# Evaluate mode

0 commit comments

Comments
 (0)