@@ -78,44 +78,37 @@ def __init__(
7878 path_or_buf : Union [PathLike , BinaryIO ],
7979 batch_size : Optional [int ] = None ,
8080 storage_options : Optional [dict ] = None ,
81- use_content_defined_chunking : Optional [ dict ] = None ,
81+ use_content_defined_chunking : bool | dict = True ,
8282 ** parquet_writer_kwargs ,
8383 ):
8484 self .dataset = dataset
8585 self .path_or_buf = path_or_buf
8686 self .batch_size = batch_size or get_writer_batch_size (dataset .features )
8787 self .storage_options = storage_options or {}
8888 self .parquet_writer_kwargs = parquet_writer_kwargs
89+ if use_content_defined_chunking is True :
90+ use_content_defined_chunking = config .DEFAULT_CDC_OPTIONS
8991 self .use_content_defined_chunking = use_content_defined_chunking
9092
def write(self) -> int:
    """Write the dataset as Parquet to ``self.path_or_buf``.

    When ``path_or_buf`` is a ``str``, ``bytes`` or ``os.PathLike`` it is
    opened in binary write mode through ``fsspec.open`` (forwarding
    ``storage_options``) and closed again when writing finishes. Any other
    object is handed to ``_write`` as an already-open binary handle and is
    not closed here.

    The content-defined-chunking setting is not forwarded: ``_write`` reads
    it from ``self.use_content_defined_chunking``.

    Returns:
        The value returned by ``_write`` (annotated as ``int``).
    """
    # A falsy batch size (None or 0) falls back to the library-wide default.
    batch_size = self.batch_size or config.DEFAULT_MAX_BATCH_SIZE

    if isinstance(self.path_or_buf, (str, bytes, os.PathLike)):
        # `or {}` guards against the attribute having been reset to None.
        with fsspec.open(self.path_or_buf, "wb", **(self.storage_options or {})) as buffer:
            return self._write(
                file_obj=buffer,
                batch_size=batch_size,
                **self.parquet_writer_kwargs,
            )

    # Caller-supplied handle: write straight into it.
    return self._write(
        file_obj=self.path_or_buf,
        batch_size=batch_size,
        **self.parquet_writer_kwargs,
    )
115110
116- def _write (
117- self , file_obj : BinaryIO , batch_size : int , use_content_defined_chunking : bool | dict , ** parquet_writer_kwargs
118- ) -> int :
111+ def _write (self , file_obj : BinaryIO , batch_size : int , ** parquet_writer_kwargs ) -> int :
119112 """Writes the pyarrow table as Parquet to a binary file handle.
120113
121114 Caller is responsible for opening and closing the handle.
@@ -125,7 +118,10 @@ def _write(
125118 schema = self .dataset .features .arrow_schema
126119
127120 writer = pq .ParquetWriter (
128- file_obj , schema = schema , use_content_defined_chunking = use_content_defined_chunking , ** parquet_writer_kwargs
121+ file_obj ,
122+ schema = schema ,
123+ use_content_defined_chunking = self .use_content_defined_chunking ,
124+ ** parquet_writer_kwargs ,
129125 )
130126
131127 for offset in hf_tqdm (
@@ -142,8 +138,8 @@ def _write(
142138 written += batch .nbytes
143139
144140 # TODO(kszucs): we may want to persist multiple parameters
145- if use_content_defined_chunking is not False :
146- writer .add_key_value_metadata ({"content_defined_chunking" : json .dumps (use_content_defined_chunking )})
141+ if self . use_content_defined_chunking is not False :
142+ writer .add_key_value_metadata ({"content_defined_chunking" : json .dumps (self . use_content_defined_chunking )})
147143
148144 writer .close ()
149145 return written
0 commit comments