Skip to content

Commit ee1e73b

Browse files
committed
chore: always store the cdc parameters as metadata
1 parent 565d260 commit ee1e73b

File tree

2 files changed

+10
-5
lines changed

2 files changed

+10
-5
lines changed

src/datasets/arrow_writer.py

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -690,6 +690,7 @@ def _build_writer(self, inferred_schema: pa.Schema):
690690
self.pa_writer = pq.ParquetWriter(
691691
self.stream, self._schema, use_content_defined_chunking=self.use_content_defined_chunking
692692
)
693-
self.pa_writer.add_key_value_metadata(
694-
{"content_defined_chunking": json.dumps(self.use_content_defined_chunking)}
695-
)
693+
if self.use_content_defined_chunking is not False:
694+
self.pa_writer.add_key_value_metadata(
695+
{"content_defined_chunking": json.dumps(self.use_content_defined_chunking)}
696+
)

src/datasets/io/parquet.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -91,7 +91,9 @@ def __init__(
9191
def write(self) -> int:
9292
batch_size = self.batch_size if self.batch_size else config.DEFAULT_MAX_BATCH_SIZE
9393
use_content_defined_chunking = (
94-
self.use_content_defined_chunking if self.use_content_defined_chunking else config.DEFAULT_CDC_OPTIONS
94+
config.DEFAULT_CDC_OPTIONS
95+
if self.use_content_defined_chunking is None
96+
else self.use_content_defined_chunking
9597
)
9698

9799
if isinstance(self.path_or_buf, (str, bytes, os.PathLike)):
@@ -140,6 +142,8 @@ def _write(
140142
written += batch.nbytes
141143

142144
# TODO(kszucs): we may want to persist multiple parameters
143-
writer.add_key_value_metadata({"content_defined_chunking": json.dumps(use_content_defined_chunking)})
145+
if use_content_defined_chunking is not False:
146+
writer.add_key_value_metadata({"content_defined_chunking": json.dumps(use_content_defined_chunking)})
147+
144148
writer.close()
145149
return written

0 commit comments

Comments
 (0)