def __init__(
    self,
    dataset: Dataset,
    path_or_buf: Union[PathLike, BinaryIO],
    batch_size: Optional[int] = None,
    storage_options: Optional[dict] = None,
    use_content_defined_chunking: Optional[Union[bool, dict]] = None,
    **parquet_writer_kwargs,
):
    """Prepare a writer that serializes ``dataset`` to Parquet.

    Args:
        dataset: the dataset whose rows will be written.
        path_or_buf: destination path (opened via ``fsspec``) or an
            already-open binary file object.
        batch_size: rows per record batch; when not given, falls back to
            a size derived from the dataset's features
            (``get_writer_batch_size``).
        storage_options: extra keyword arguments forwarded to
            ``fsspec.open`` for path destinations.
        use_content_defined_chunking: forwarded to
            ``pyarrow.parquet.ParquetWriter``'s
            ``use_content_defined_chunking``; presumably a bool toggles
            CDC and a dict supplies CDC options (matches the
            ``bool | dict`` annotation on ``_write``). ``None`` means
            "use library defaults", resolved in ``write()``.
        **parquet_writer_kwargs: additional ``ParquetWriter`` keyword
            arguments.
    """
    self.dataset = dataset
    self.path_or_buf = path_or_buf
    self.batch_size = batch_size or get_writer_batch_size(dataset.features)
    self.storage_options = storage_options or {}
    self.parquet_writer_kwargs = parquet_writer_kwargs
    self.use_content_defined_chunking = use_content_defined_chunking
def write(self) -> int:
    """Write the dataset as Parquet to the configured path or buffer.

    Returns:
        int: number of bytes written (the sum of ``nbytes`` of the
        record batches handed to the writer, per ``_write``).
    """
    batch_size = self.batch_size if self.batch_size else config.DEFAULT_MAX_BATCH_SIZE
    # Test `is None`, not truthiness: `False` (explicitly disable
    # content-defined chunking — valid per `_write`'s `bool | dict`
    # annotation) and `{}` are legitimate user values and must not be
    # silently replaced by the library defaults.
    if self.use_content_defined_chunking is None:
        use_content_defined_chunking = config.DEFAULT_CDC_OPTIONS
    else:
        use_content_defined_chunking = self.use_content_defined_chunking

    if isinstance(self.path_or_buf, (str, bytes, os.PathLike)):
        # Path-like destination: open (possibly remote) target via fsspec.
        with fsspec.open(self.path_or_buf, "wb", **(self.storage_options or {})) as buffer:
            written = self._write(
                file_obj=buffer,
                batch_size=batch_size,
                use_content_defined_chunking=use_content_defined_chunking,
                **self.parquet_writer_kwargs,
            )
    else:
        # Already-open binary file object; the caller owns its lifetime.
        written = self._write(
            file_obj=self.path_or_buf,
            batch_size=batch_size,
            use_content_defined_chunking=use_content_defined_chunking,
            **self.parquet_writer_kwargs,
        )
    return written
105113
106- def _write (self , file_obj : BinaryIO , batch_size : int , cdc_options : dict , ** parquet_writer_kwargs ) -> int :
114+ def _write (
115+ self , file_obj : BinaryIO , batch_size : int , use_content_defined_chunking : bool | dict , ** parquet_writer_kwargs
116+ ) -> int :
107117 """Writes the pyarrow table as Parquet to a binary file handle.
108118
109119 Caller is responsible for opening and closing the handle.
@@ -113,7 +123,7 @@ def _write(self, file_obj: BinaryIO, batch_size: int, cdc_options: dict, **parqu
113123 schema = self .dataset .features .arrow_schema
114124
115125 writer = pq .ParquetWriter (
116- file_obj , schema = schema , use_content_defined_chunking = cdc_options , ** parquet_writer_kwargs
126+ file_obj , schema = schema , use_content_defined_chunking = use_content_defined_chunking , ** parquet_writer_kwargs
117127 )
118128
119129 for offset in hf_tqdm (
@@ -130,6 +140,6 @@ def _write(self, file_obj: BinaryIO, batch_size: int, cdc_options: dict, **parqu
130140 written += batch .nbytes
131141
132142 # TODO(kszucs): we may want to persist multiple parameters
133- writer .add_key_value_metadata ({"content_defined_chunking" : json .dumps (cdc_options )})
143+ writer .add_key_value_metadata ({"content_defined_chunking" : json .dumps (use_content_defined_chunking )})
134144 writer .close ()
135145 return written
0 commit comments