@@ -119,68 +119,8 @@ class ParquetWriterOptions:
119119 """Advanced parquet writer options.
120120
121121 Allows settings the writer options that apply to the entire file. Some options can
-    also be set on a column by column basis, with the field `column_specific_options`
-    (see `ParquetColumnOptions`).
-
-    Attributes:
-        data_pagesize_limit: Sets best effort maximum size of data page in bytes.
-        write_batch_size: Sets write_batch_size in bytes.
-        writer_version: Sets parquet writer version. Valid values are `1.0` and
-            `2.0`.
-        skip_arrow_metadata: Skip encoding the embedded arrow metadata in the
-            KV_meta.
-        compression: Compression type to use. Default is "zstd(3)".
-            Available compression types are
-            - "uncompressed": No compression.
-            - "snappy": Snappy compression.
-            - "gzip(n)": Gzip compression with level n.
-            - "brotli(n)": Brotli compression with level n.
-            - "lz4": LZ4 compression.
-            - "lz4_raw": LZ4_RAW compression.
-            - "zstd(n)": Zstandard compression with level n.
-        dictionary_enabled: Sets if dictionary encoding is enabled. If None, uses
-            the default parquet writer setting.
-        dictionary_page_size_limit: Sets best effort maximum dictionary page size,
-            in bytes.
-        statistics_enabled: Sets if statistics are enabled for any column Valid
-            values are `none`, `chunk`, and `page`. If None, uses the default
-            parquet writer setting.
-        max_row_group_size: Target maximum number of rows in each row group
-            (defaults to 1M rows). Writing larger row groups requires more memory to
-            write, but can get better compression and be faster to read.
-        created_by: Sets "created by" property.
-        column_index_truncate_length: Sets column index truncate length.
-        statistics_truncate_length: Sets statistics truncate length. If None, uses
-            the default parquet writer setting.
-        data_page_row_count_limit: Sets best effort maximum number of rows in a data
-            page.
-        encoding: Sets default encoding for any column. Valid values are `plain`,
-            `plain_dictionary`, `rle`, `bit_packed`, `delta_binary_packed`,
-            `delta_length_byte_array`, `delta_byte_array`, `rle_dictionary`, and
-            `byte_stream_split`. If None, uses the default parquet writer setting.
-        bloom_filter_on_write: Write bloom filters for all columns when creating
-            parquet files.
-        bloom_filter_fpp: Sets bloom filter false positive probability. If None,
-            uses the default parquet writer setting
-        bloom_filter_ndv: Sets bloom filter number of distinct values. If None, uses
-            the default parquet writer setting.
-        allow_single_file_parallelism: Controls whether DataFusion will attempt to
-            speed up writing parquet files by serializing them in parallel. Each
-            column in each row group in each output file are serialized in parallel
-            leveraging a maximum possible core count of n_files * n_row_groups *
-            n_columns.
-        maximum_parallel_row_group_writers: By default parallel parquet writer is
-            tuned for minimum memory usage in a streaming execution plan. You may
-            see a performance benefit when writing large parquet files by increasing
-            `maximum_parallel_row_group_writers` and
-            `maximum_buffered_record_batches_per_stream` if your system has idle
-            cores and can tolerate additional memory usage. Boosting these values is
-            likely worthwhile when writing out already in-memory data, such as from
-            a cached data frame.
-        maximum_buffered_record_batches_per_stream: See
-            `maximum_parallel_row_group_writers`.
-        column_specific_options: Overrides options for specific columns. If a column
-            is not a part of this dictionary, it will use the parameters provided here.
+    also be set on a column by column basis, with the field ``column_specific_options``
+    (see ``ParquetColumnOptions``).
     """
 
     def __init__(
@@ -208,7 +148,72 @@ def __init__(
         maximum_buffered_record_batches_per_stream: int = 2,
         column_specific_options: Optional[dict[str, ParquetColumnOptions]] = None,
     ) -> None:
211- """Initialize the ParquetWriterOptions."""
151+ """Initialize the ParquetWriterOptions.
152+
153+ Args:
154+ data_pagesize_limit: Sets best effort maximum size of data page in bytes.
155+ write_batch_size: Sets write_batch_size in bytes.
156+ writer_version: Sets parquet writer version. Valid values are ``1.0`` and
157+ ``2.0``.
158+ skip_arrow_metadata: Skip encoding the embedded arrow metadata in the
159+ KV_meta.
160+ compression: Compression type to use. Default is ``zstd(3)``.
161+ Available compression types are
162+
163+ - ``uncompressed``: No compression.
164+ - ``snappy``: Snappy compression.
165+ - ``gzip(n)``: Gzip compression with level n.
166+ - ``brotli(n)``: Brotli compression with level n.
167+ - ``lz4``: LZ4 compression.
168+ - ``lz4_raw``: LZ4_RAW compression.
169+ - ``zstd(n)``: Zstandard compression with level n.
170+ compression_level: Compression level to set.
171+ dictionary_enabled: Sets if dictionary encoding is enabled. If ``None``,
172+ uses the default parquet writer setting.
173+ dictionary_page_size_limit: Sets best effort maximum dictionary page size,
174+ in bytes.
175+ statistics_enabled: Sets if statistics are enabled for any column Valid
176+ values are ``none``, ``chunk``, and ``page``. If ``None``, uses the
177+ default parquet writer setting.
178+ max_row_group_size: Target maximum number of rows in each row group
179+ (defaults to 1M rows). Writing larger row groups requires more memory
180+ to write, but can get better compression and be faster to read.
181+ created_by: Sets "created by" property.
182+ column_index_truncate_length: Sets column index truncate length.
183+ statistics_truncate_length: Sets statistics truncate length. If ``None``,
184+ uses the default parquet writer setting.
185+ data_page_row_count_limit: Sets best effort maximum number of rows in a data
186+ page.
187+ encoding: Sets default encoding for any column. Valid values are ``plain``,
188+ ``plain_dictionary``, ``rle``, ``bit_packed``, ``delta_binary_packed``,
189+ ``delta_length_byte_array``, ``delta_byte_array``, ``rle_dictionary``,
190+ and ``byte_stream_split``. If ``None``, uses the default parquet writer
191+ setting.
192+ bloom_filter_on_write: Write bloom filters for all columns when creating
193+ parquet files.
194+ bloom_filter_fpp: Sets bloom filter false positive probability. If ``None``,
195+ uses the default parquet writer setting
196+ bloom_filter_ndv: Sets bloom filter number of distinct values. If ``None``,
197+ uses the default parquet writer setting.
198+ allow_single_file_parallelism: Controls whether DataFusion will attempt to
199+ speed up writing parquet files by serializing them in parallel. Each
200+ column in each row group in each output file are serialized in parallel
201+ leveraging a maximum possible core count of
202+ ``n_files * n_row_groups * n_columns``.
203+ maximum_parallel_row_group_writers: By default parallel parquet writer is
204+ tuned for minimum memory usage in a streaming execution plan. You may
205+ see a performance benefit when writing large parquet files by increasing
206+ ``maximum_parallel_row_group_writers`` and
207+ ``maximum_buffered_record_batches_per_stream`` if your system has idle
208+ cores and can tolerate additional memory usage. Boosting these values is
209+ likely worthwhile when writing out already in-memory data, such as from
210+ a cached data frame.
211+ maximum_buffered_record_batches_per_stream: See
212+ ``maximum_parallel_row_group_writers``.
213+ column_specific_options: Overrides options for specific columns. If a column
214+ is not a part of this dictionary, it will use the parameters provided
215+ here.
216+ """
         self.data_pagesize_limit = data_pagesize_limit
         self.write_batch_size = write_batch_size
         self.writer_version = writer_version
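The constructor arguments map one-to-one onto the documented options. A minimal sketch of how the class might be used, assuming `ParquetWriterOptions` is importable from `datafusion.dataframe` and that the DataFrame writer accepts the options object (the `write_parquet` wiring shown is illustrative, not confirmed by this diff):

```python
from datafusion import SessionContext
from datafusion.dataframe import ParquetWriterOptions  # assumed import path

# Illustrative data; any DataFrame works here.
ctx = SessionContext()
df = ctx.sql("SELECT 1 AS user_id, 'blob' AS payload")

# Arguments mirror the Args documented above.
opts = ParquetWriterOptions(
    compression="zstd(6)",       # file-wide codec
    max_row_group_size=512_000,  # smaller row groups: less write memory, less compression
    bloom_filter_on_write=True,  # emit bloom filters for every column
)

# Hypothetical call site -- consult the DataFrame writer API for the real entry point.
df.write_parquet("out/", options=opts)
```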
@@ -241,29 +246,7 @@ class ParquetColumnOptions:
     """Parquet options for individual columns.
 
     Contains the available options that can be applied for an individual Parquet column,
-    replacing the global options in `ParquetWriterOptions`.
-
-    Attributes:
-        encoding: Sets encoding for the column path. Valid values are: `plain`,
-            `plain_dictionary`, `rle`, `bit_packed`, `delta_binary_packed`,
-            `delta_length_byte_array`, `delta_byte_array`, `rle_dictionary`, and
-            `byte_stream_split`. These values are not case-sensitive. If `None`, uses
-            the default parquet options
-        dictionary_enabled: Sets if dictionary encoding is enabled for the column path.
-            If `None`, uses the default parquet options
-        compression: Sets default parquet compression codec for the column path. Valid
-            values are `uncompressed`, `snappy`, `gzip(level)`, `lzo`, `brotli(level)`,
-            `lz4`, `zstd(level)`, and `lz4_raw`. These values are not case-sensitive. If
-            `None`, uses the default parquet options.
-        statistics_enabled: Sets if statistics are enabled for the column Valid values
-            are: `none`, `chunk`, and `page` These values are not case sensitive. If
-            `None`, uses the default parquet options.
-        bloom_filter_enabled: Sets if bloom filter is enabled for the column path. If
-            `None`, uses the default parquet options.
-        bloom_filter_fpp: Sets bloom filter false positive probability for the column
-            path. If `None`, uses the default parquet options.
-        bloom_filter_ndv: Sets bloom filter number of distinct values. If `None`, uses
-            the default parquet options.
+    replacing the global options in ``ParquetWriterOptions``.
     """
 
     def __init__(
@@ -276,7 +259,31 @@ def __init__(
         bloom_filter_fpp: Optional[float] = None,
         bloom_filter_ndv: Optional[int] = None,
     ) -> None:
279- """Initialize the ParquetColumnOptions."""
262+ """Initialize the ParquetColumnOptions.
263+
264+ Args:
265+ encoding: Sets encoding for the column path. Valid values are: ``plain``,
266+ ``plain_dictionary``, ``rle``, ``bit_packed``, ``delta_binary_packed``,
267+ ``delta_length_byte_array``, ``delta_byte_array``, ``rle_dictionary``,
268+ and ``byte_stream_split``. These values are not case-sensitive. If
269+ ``None``, uses the default parquet options
270+ dictionary_enabled: Sets if dictionary encoding is enabled for the column
271+ path. If `None`, uses the default parquet options
272+ compression: Sets default parquet compression codec for the column path.
273+ Valid values are ``uncompressed``, ``snappy``, ``gzip(level)``, ``lzo``,
274+ ``brotli(level)``, ``lz4``, ``zstd(level)``, and ``lz4_raw``. These
275+ values are not case-sensitive. If ``None``, uses the default parquet
276+ options.
277+ statistics_enabled: Sets if statistics are enabled for the column Valid
278+ values are: ``none``, ``chunk``, and ``page`` These values are not case
279+ sensitive. If ``None``, uses the default parquet options.
280+ bloom_filter_enabled: Sets if bloom filter is enabled for the column path.
281+ If ``None``, uses the default parquet options.
282+ bloom_filter_fpp: Sets bloom filter false positive probability for the
283+ column path. If ``None``, uses the default parquet options.
284+ bloom_filter_ndv: Sets bloom filter number of distinct values. If ``None``,
285+ uses the default parquet options.
286+ """
         self.encoding = encoding
         self.dictionary_enabled = dictionary_enabled
         self.compression = compression
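To show how the two classes compose, here is a sketch of per-column overrides built from the arguments above and passed through `column_specific_options`; the column names are made up for illustration:

```python
# Hypothetical column names; substitute those of the table being written.
id_opts = ParquetColumnOptions(
    dictionary_enabled=True,
    bloom_filter_enabled=True,  # bloom filters help point lookups on this column
    bloom_filter_fpp=0.01,
)
payload_opts = ParquetColumnOptions(
    compression="uncompressed",   # already-compressed blobs gain little from zstd
    statistics_enabled="none",
)

# Columns absent from the dict fall back to the file-wide settings.
opts = ParquetWriterOptions(
    compression="zstd(3)",
    column_specific_options={"user_id": id_opts, "payload": payload_opts},
)
```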