
Commit 615e2a1

Move parameters into init method to fix documentation error
1 parent a9d421f commit 615e2a1

File tree

1 file changed (+94, -87 lines)

python/datafusion/dataframe.py

Lines changed: 94 additions & 87 deletions
@@ -119,68 +119,8 @@ class ParquetWriterOptions:
     """Advanced parquet writer options.

     Allows setting the writer options that apply to the entire file. Some options can
-    also be set on a column by column basis, with the field `column_specific_options`
-    (see `ParquetColumnOptions`).
-
-    Attributes:
-        data_pagesize_limit: Sets best effort maximum size of data page in bytes.
-        write_batch_size: Sets write_batch_size in bytes.
-        writer_version: Sets parquet writer version. Valid values are `1.0` and
-            `2.0`.
-        skip_arrow_metadata: Skip encoding the embedded arrow metadata in the
-            KV_meta.
-        compression: Compression type to use. Default is "zstd(3)".
-            Available compression types are
-            - "uncompressed": No compression.
-            - "snappy": Snappy compression.
-            - "gzip(n)": Gzip compression with level n.
-            - "brotli(n)": Brotli compression with level n.
-            - "lz4": LZ4 compression.
-            - "lz4_raw": LZ4_RAW compression.
-            - "zstd(n)": Zstandard compression with level n.
-        dictionary_enabled: Sets if dictionary encoding is enabled. If None, uses
-            the default parquet writer setting.
-        dictionary_page_size_limit: Sets best effort maximum dictionary page size,
-            in bytes.
-        statistics_enabled: Sets if statistics are enabled for any column Valid
-            values are `none`, `chunk`, and `page`. If None, uses the default
-            parquet writer setting.
-        max_row_group_size: Target maximum number of rows in each row group
-            (defaults to 1M rows). Writing larger row groups requires more memory to
-            write, but can get better compression and be faster to read.
-        created_by: Sets "created by" property.
-        column_index_truncate_length: Sets column index truncate length.
-        statistics_truncate_length: Sets statistics truncate length. If None, uses
-            the default parquet writer setting.
-        data_page_row_count_limit: Sets best effort maximum number of rows in a data
-            page.
-        encoding: Sets default encoding for any column. Valid values are `plain`,
-            `plain_dictionary`, `rle`, `bit_packed`, `delta_binary_packed`,
-            `delta_length_byte_array`, `delta_byte_array`, `rle_dictionary`, and
-            `byte_stream_split`. If None, uses the default parquet writer setting.
-        bloom_filter_on_write: Write bloom filters for all columns when creating
-            parquet files.
-        bloom_filter_fpp: Sets bloom filter false positive probability. If None,
-            uses the default parquet writer setting
-        bloom_filter_ndv: Sets bloom filter number of distinct values. If None, uses
-            the default parquet writer setting.
-        allow_single_file_parallelism: Controls whether DataFusion will attempt to
-            speed up writing parquet files by serializing them in parallel. Each
-            column in each row group in each output file are serialized in parallel
-            leveraging a maximum possible core count of n_files * n_row_groups *
-            n_columns.
-        maximum_parallel_row_group_writers: By default parallel parquet writer is
-            tuned for minimum memory usage in a streaming execution plan. You may
-            see a performance benefit when writing large parquet files by increasing
-            `maximum_parallel_row_group_writers` and
-            `maximum_buffered_record_batches_per_stream` if your system has idle
-            cores and can tolerate additional memory usage. Boosting these values is
-            likely worthwhile when writing out already in-memory data, such as from
-            a cached data frame.
-        maximum_buffered_record_batches_per_stream: See
-            `maximum_parallel_row_group_writers`.
-        column_specific_options: Overrides options for specific columns. If a column
-            is not a part of this dictionary, it will use the parameters provided here.
+    also be set on a column by column basis, with the field ``column_specific_options``
+    (see ``ParquetColumnOptions``).
     """

     def __init__(
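
The hunk above deletes the ``Attributes:`` block from the class docstring; the hunk below re-adds the same descriptions as an ``Args:`` section on ``__init__``. Schematically, the commit moves to the pattern below (a minimal illustration with a made-up ``Example`` class, not code from this file):

class Example:
    """Summary line; prose that applies to the whole class stays here."""

    def __init__(self, limit: int = 0) -> None:
        """Initialize Example.

        Args:
            limit: Constructor parameters are documented here, next to the
                signature that declares them, so doc tooling renders them once.
        """
        self.limit = limit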
@@ -208,7 +148,72 @@ def __init__(
         maximum_buffered_record_batches_per_stream: int = 2,
         column_specific_options: Optional[dict[str, ParquetColumnOptions]] = None,
     ) -> None:
-        """Initialize the ParquetWriterOptions."""
+        """Initialize the ParquetWriterOptions.
+
+        Args:
+            data_pagesize_limit: Sets best effort maximum size of data page in bytes.
+            write_batch_size: Sets write_batch_size in bytes.
+            writer_version: Sets parquet writer version. Valid values are ``1.0`` and
+                ``2.0``.
+            skip_arrow_metadata: Skip encoding the embedded arrow metadata in the
+                KV_meta.
+            compression: Compression type to use. Default is ``zstd(3)``.
+                Available compression types are
+
+                - ``uncompressed``: No compression.
+                - ``snappy``: Snappy compression.
+                - ``gzip(n)``: Gzip compression with level n.
+                - ``brotli(n)``: Brotli compression with level n.
+                - ``lz4``: LZ4 compression.
+                - ``lz4_raw``: LZ4_RAW compression.
+                - ``zstd(n)``: Zstandard compression with level n.
+            compression_level: Compression level to set.
+            dictionary_enabled: Sets if dictionary encoding is enabled. If ``None``,
+                uses the default parquet writer setting.
+            dictionary_page_size_limit: Sets best effort maximum dictionary page size,
+                in bytes.
+            statistics_enabled: Sets if statistics are enabled for any column. Valid
+                values are ``none``, ``chunk``, and ``page``. If ``None``, uses the
+                default parquet writer setting.
+            max_row_group_size: Target maximum number of rows in each row group
+                (defaults to 1M rows). Writing larger row groups requires more memory
+                to write, but can get better compression and be faster to read.
+            created_by: Sets "created by" property.
+            column_index_truncate_length: Sets column index truncate length.
+            statistics_truncate_length: Sets statistics truncate length. If ``None``,
+                uses the default parquet writer setting.
+            data_page_row_count_limit: Sets best effort maximum number of rows in a
+                data page.
+            encoding: Sets default encoding for any column. Valid values are ``plain``,
+                ``plain_dictionary``, ``rle``, ``bit_packed``, ``delta_binary_packed``,
+                ``delta_length_byte_array``, ``delta_byte_array``, ``rle_dictionary``,
+                and ``byte_stream_split``. If ``None``, uses the default parquet writer
+                setting.
+            bloom_filter_on_write: Write bloom filters for all columns when creating
+                parquet files.
+            bloom_filter_fpp: Sets bloom filter false positive probability. If ``None``,
+                uses the default parquet writer setting.
+            bloom_filter_ndv: Sets bloom filter number of distinct values. If ``None``,
+                uses the default parquet writer setting.
+            allow_single_file_parallelism: Controls whether DataFusion will attempt to
+                speed up writing parquet files by serializing them in parallel. Each
+                column in each row group in each output file is serialized in parallel
+                leveraging a maximum possible core count of
+                ``n_files * n_row_groups * n_columns``.
+            maximum_parallel_row_group_writers: By default the parallel parquet writer
+                is tuned for minimum memory usage in a streaming execution plan. You
+                may see a performance benefit when writing large parquet files by
+                increasing ``maximum_parallel_row_group_writers`` and
+                ``maximum_buffered_record_batches_per_stream`` if your system has idle
+                cores and can tolerate additional memory usage. Boosting these values
+                is likely worthwhile when writing out already in-memory data, such as
+                from a cached data frame.
+            maximum_buffered_record_batches_per_stream: See
+                ``maximum_parallel_row_group_writers``.
+            column_specific_options: Overrides options for specific columns. If a
+                column is not a part of this dictionary, it will use the parameters
+                provided here.
+        """
         self.data_pagesize_limit = data_pagesize_limit
         self.write_batch_size = write_batch_size
         self.writer_version = writer_version
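
Taken together, the signature and docstring above can be exercised as follows. This is a minimal sketch: ``ParquetWriterOptions`` and its keyword names come straight from this diff, while the ``SessionContext`` setup and the assumption that the options object is passed to ``DataFrame.write_parquet`` are my reading of the surrounding API and should be checked against your datafusion version.

from datafusion import SessionContext
from datafusion.dataframe import ParquetWriterOptions

ctx = SessionContext()
df = ctx.from_pydict({"a": [1, 2, 3], "b": ["x", "y", "z"]})  # toy data

# Keyword names match the __init__ signature documented above.
opts = ParquetWriterOptions(
    compression="zstd(6)",       # file-wide codec, overriding the zstd(3) default
    max_row_group_size=500_000,  # smaller row groups than the 1M-row default
    bloom_filter_on_write=True,  # write bloom filters for all columns
)

# Assumption: the options object is handed to the parquet writer like this;
# verify the exact write_parquet signature for your version.
df.write_parquet("/tmp/example_parquet", opts)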
@@ -241,29 +246,7 @@ class ParquetColumnOptions:
     """Parquet options for individual columns.

     Contains the available options that can be applied for an individual Parquet column,
-    replacing the global options in `ParquetWriterOptions`.
-
-    Attributes:
-        encoding: Sets encoding for the column path. Valid values are: `plain`,
-            `plain_dictionary`, `rle`, `bit_packed`, `delta_binary_packed`,
-            `delta_length_byte_array`, `delta_byte_array`, `rle_dictionary`, and
-            `byte_stream_split`. These values are not case-sensitive. If `None`, uses
-            the default parquet options
-        dictionary_enabled: Sets if dictionary encoding is enabled for the column path.
-            If `None`, uses the default parquet options
-        compression: Sets default parquet compression codec for the column path. Valid
-            values are `uncompressed`, `snappy`, `gzip(level)`, `lzo`, `brotli(level)`,
-            `lz4`, `zstd(level)`, and `lz4_raw`. These values are not case-sensitive. If
-            `None`, uses the default parquet options.
-        statistics_enabled: Sets if statistics are enabled for the column Valid values
-            are: `none`, `chunk`, and `page` These values are not case sensitive. If
-            `None`, uses the default parquet options.
-        bloom_filter_enabled: Sets if bloom filter is enabled for the column path. If
-            `None`, uses the default parquet options.
-        bloom_filter_fpp: Sets bloom filter false positive probability for the column
-            path. If `None`, uses the default parquet options.
-        bloom_filter_ndv: Sets bloom filter number of distinct values. If `None`, uses
-            the default parquet options.
+    replacing the global options in ``ParquetWriterOptions``.
     """

     def __init__(
@@ -276,7 +259,31 @@ def __init__(
         bloom_filter_fpp: Optional[float] = None,
         bloom_filter_ndv: Optional[int] = None,
     ) -> None:
-        """Initialize the ParquetColumnOptions."""
+        """Initialize the ParquetColumnOptions.
+
+        Args:
+            encoding: Sets encoding for the column path. Valid values are: ``plain``,
+                ``plain_dictionary``, ``rle``, ``bit_packed``, ``delta_binary_packed``,
+                ``delta_length_byte_array``, ``delta_byte_array``, ``rle_dictionary``,
+                and ``byte_stream_split``. These values are not case-sensitive. If
+                ``None``, uses the default parquet options.
+            dictionary_enabled: Sets if dictionary encoding is enabled for the column
+                path. If ``None``, uses the default parquet options.
+            compression: Sets default parquet compression codec for the column path.
+                Valid values are ``uncompressed``, ``snappy``, ``gzip(level)``, ``lzo``,
+                ``brotli(level)``, ``lz4``, ``zstd(level)``, and ``lz4_raw``. These
+                values are not case-sensitive. If ``None``, uses the default parquet
+                options.
+            statistics_enabled: Sets if statistics are enabled for the column. Valid
+                values are: ``none``, ``chunk``, and ``page``. These values are not
+                case-sensitive. If ``None``, uses the default parquet options.
+            bloom_filter_enabled: Sets if bloom filter is enabled for the column path.
+                If ``None``, uses the default parquet options.
+            bloom_filter_fpp: Sets bloom filter false positive probability for the
+                column path. If ``None``, uses the default parquet options.
+            bloom_filter_ndv: Sets bloom filter number of distinct values. If ``None``,
+                uses the default parquet options.
+        """
         self.encoding = encoding
         self.dictionary_enabled = dictionary_enabled
         self.compression = compression
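
A sketch of how the two classes compose, based only on the signatures in this diff: per-column overrides are keyed by column name in ``column_specific_options``, and any column missing from the dictionary falls back to the file-wide settings.

from datafusion.dataframe import ParquetColumnOptions, ParquetWriterOptions

# Override the codec and enable a bloom filter for column "b" only;
# every other column keeps the file-wide options below.
col_opts = {
    "b": ParquetColumnOptions(
        compression="snappy",
        dictionary_enabled=True,
        bloom_filter_enabled=True,
    ),
}

opts = ParquetWriterOptions(
    compression="zstd(3)",
    column_specific_options=col_opts,
)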
