@@ -119,68 +119,8 @@ class ParquetWriterOptions:
119119 """Advanced parquet writer options.
120120
121121 Allows settings the writer options that apply to the entire file. Some options can
-    also be set on a column by column basis, with the field `column_specific_options`
-    (see `ParquetColumnOptions`).
-
-    Attributes:
-        data_pagesize_limit: Sets best effort maximum size of data page in bytes.
-        write_batch_size: Sets write_batch_size in bytes.
-        writer_version: Sets parquet writer version. Valid values are `1.0` and
-            `2.0`.
-        skip_arrow_metadata: Skip encoding the embedded arrow metadata in the
-            KV_meta.
-        compression: Compression type to use. Default is "zstd(3)".
-            Available compression types are
-            - "uncompressed": No compression.
-            - "snappy": Snappy compression.
-            - "gzip(n)": Gzip compression with level n.
-            - "brotli(n)": Brotli compression with level n.
-            - "lz4": LZ4 compression.
-            - "lz4_raw": LZ4_RAW compression.
-            - "zstd(n)": Zstandard compression with level n.
-        dictionary_enabled: Sets if dictionary encoding is enabled. If None, uses
-            the default parquet writer setting.
-        dictionary_page_size_limit: Sets best effort maximum dictionary page size,
-            in bytes.
-        statistics_enabled: Sets if statistics are enabled for any column Valid
-            values are `none`, `chunk`, and `page`. If None, uses the default
-            parquet writer setting.
-        max_row_group_size: Target maximum number of rows in each row group
-            (defaults to 1M rows). Writing larger row groups requires more memory to
-            write, but can get better compression and be faster to read.
-        created_by: Sets "created by" property.
-        column_index_truncate_length: Sets column index truncate length.
-        statistics_truncate_length: Sets statistics truncate length. If None, uses
-            the default parquet writer setting.
-        data_page_row_count_limit: Sets best effort maximum number of rows in a data
-            page.
-        encoding: Sets default encoding for any column. Valid values are `plain`,
-            `plain_dictionary`, `rle`, `bit_packed`, `delta_binary_packed`,
-            `delta_length_byte_array`, `delta_byte_array`, `rle_dictionary`, and
-            `byte_stream_split`. If None, uses the default parquet writer setting.
-        bloom_filter_on_write: Write bloom filters for all columns when creating
-            parquet files.
-        bloom_filter_fpp: Sets bloom filter false positive probability. If None,
-            uses the default parquet writer setting
-        bloom_filter_ndv: Sets bloom filter number of distinct values. If None, uses
-            the default parquet writer setting.
-        allow_single_file_parallelism: Controls whether DataFusion will attempt to
-            speed up writing parquet files by serializing them in parallel. Each
-            column in each row group in each output file are serialized in parallel
-            leveraging a maximum possible core count of n_files * n_row_groups *
-            n_columns.
-        maximum_parallel_row_group_writers: By default parallel parquet writer is
-            tuned for minimum memory usage in a streaming execution plan. You may
-            see a performance benefit when writing large parquet files by increasing
-            `maximum_parallel_row_group_writers` and
-            `maximum_buffered_record_batches_per_stream` if your system has idle
-            cores and can tolerate additional memory usage. Boosting these values is
-            likely worthwhile when writing out already in-memory data, such as from
-            a cached data frame.
-        maximum_buffered_record_batches_per_stream: See
-            `maximum_parallel_row_group_writers`.
-        column_specific_options: Overrides options for specific columns. If a column
-            is not a part of this dictionary, it will use the parameters provided here.
+    also be set on a column by column basis, with the field ``column_specific_options``
+    (see ``ParquetColumnOptions``).
     """
 
     def __init__(
@@ -208,7 +148,72 @@ def __init__(
         maximum_buffered_record_batches_per_stream: int = 2,
         column_specific_options: Optional[dict[str, ParquetColumnOptions]] = None,
     ) -> None:
211- """Initialize the ParquetWriterOptions."""
151+ """Initialize the ParquetWriterOptions.
152+
153+ Args:
154+ data_pagesize_limit: Sets best effort maximum size of data page in bytes.
155+ write_batch_size: Sets write_batch_size in bytes.
156+ writer_version: Sets parquet writer version. Valid values are ``1.0`` and
157+ ``2.0``.
158+ skip_arrow_metadata: Skip encoding the embedded arrow metadata in the
159+ KV_meta.
160+ compression: Compression type to use. Default is ``zstd(3)``.
161+ Available compression types are
162+
163+ - ``uncompressed``: No compression.
164+ - ``snappy``: Snappy compression.
165+ - ``gzip(n)``: Gzip compression with level n.
166+ - ``brotli(n)``: Brotli compression with level n.
167+ - ``lz4``: LZ4 compression.
168+ - ``lz4_raw``: LZ4_RAW compression.
169+ - ``zstd(n)``: Zstandard compression with level n.
170+ compression_level: Compression level to set.
171+ dictionary_enabled: Sets if dictionary encoding is enabled. If ``None``,
172+ uses the default parquet writer setting.
173+ dictionary_page_size_limit: Sets best effort maximum dictionary page size,
174+ in bytes.
175+ statistics_enabled: Sets if statistics are enabled for any column Valid
176+ values are ``none``, ``chunk``, and ``page``. If ``None``, uses the
177+ default parquet writer setting.
178+ max_row_group_size: Target maximum number of rows in each row group
179+ (defaults to 1M rows). Writing larger row groups requires more memory
180+ to write, but can get better compression and be faster to read.
181+ created_by: Sets "created by" property.
182+ column_index_truncate_length: Sets column index truncate length.
183+ statistics_truncate_length: Sets statistics truncate length. If ``None``,
184+ uses the default parquet writer setting.
185+ data_page_row_count_limit: Sets best effort maximum number of rows in a data
186+ page.
187+ encoding: Sets default encoding for any column. Valid values are ``plain``,
188+ ``plain_dictionary``, ``rle``, ``bit_packed``, ``delta_binary_packed``,
189+ ``delta_length_byte_array``, ``delta_byte_array``, ``rle_dictionary``,
190+ and ``byte_stream_split``. If ``None``, uses the default parquet writer
191+ setting.
192+ bloom_filter_on_write: Write bloom filters for all columns when creating
193+ parquet files.
194+ bloom_filter_fpp: Sets bloom filter false positive probability. If ``None``,
195+ uses the default parquet writer setting
196+ bloom_filter_ndv: Sets bloom filter number of distinct values. If ``None``,
197+ uses the default parquet writer setting.
198+ allow_single_file_parallelism: Controls whether DataFusion will attempt to
199+ speed up writing parquet files by serializing them in parallel. Each
200+ column in each row group in each output file are serialized in parallel
201+ leveraging a maximum possible core count of
202+ ``n_files * n_row_groups * n_columns``.
203+ maximum_parallel_row_group_writers: By default parallel parquet writer is
204+ tuned for minimum memory usage in a streaming execution plan. You may
205+ see a performance benefit when writing large parquet files by increasing
206+ ``maximum_parallel_row_group_writers`` and
207+ ``maximum_buffered_record_batches_per_stream`` if your system has idle
208+ cores and can tolerate additional memory usage. Boosting these values is
209+ likely worthwhile when writing out already in-memory data, such as from
210+ a cached data frame.
211+ maximum_buffered_record_batches_per_stream: See
212+ ``maximum_parallel_row_group_writers``.
213+ column_specific_options: Overrides options for specific columns. If a column
214+ is not a part of this dictionary, it will use the parameters provided
215+ here.
216+ """
         self.data_pagesize_limit = data_pagesize_limit
         self.write_batch_size = write_batch_size
         self.writer_version = writer_version
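The constructor arguments map one-to-one onto the documented options. A minimal sketch of how the class might be used, assuming `ParquetWriterOptions` is importable from `datafusion.dataframe` and that the DataFrame writer accepts the options object (the `write_parquet` wiring shown is illustrative, not confirmed by this diff):

```python
from datafusion import SessionContext
from datafusion.dataframe import ParquetWriterOptions  # assumed import path

# Illustrative data; any DataFrame works here.
ctx = SessionContext()
df = ctx.sql("SELECT 1 AS user_id, 'blob' AS payload")

# Arguments mirror the Args documented above.
opts = ParquetWriterOptions(
    compression="zstd(6)",       # file-wide codec
    max_row_group_size=512_000,  # smaller row groups: less write memory, less compression
    bloom_filter_on_write=True,  # emit bloom filters for every column
)

# Hypothetical call site -- consult the DataFrame writer API for the real entry point.
df.write_parquet("out/", options=opts)
```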
@@ -241,29 +246,7 @@ class ParquetColumnOptions:
     """Parquet options for individual columns.
 
     Contains the available options that can be applied for an individual Parquet column,
-    replacing the global options in `ParquetWriterOptions`.
-
-    Attributes:
-        encoding: Sets encoding for the column path. Valid values are: `plain`,
-            `plain_dictionary`, `rle`, `bit_packed`, `delta_binary_packed`,
-            `delta_length_byte_array`, `delta_byte_array`, `rle_dictionary`, and
-            `byte_stream_split`. These values are not case-sensitive. If `None`, uses
-            the default parquet options
-        dictionary_enabled: Sets if dictionary encoding is enabled for the column path.
-            If `None`, uses the default parquet options
-        compression: Sets default parquet compression codec for the column path. Valid
-            values are `uncompressed`, `snappy`, `gzip(level)`, `lzo`, `brotli(level)`,
-            `lz4`, `zstd(level)`, and `lz4_raw`. These values are not case-sensitive. If
-            `None`, uses the default parquet options.
-        statistics_enabled: Sets if statistics are enabled for the column Valid values
-            are: `none`, `chunk`, and `page` These values are not case sensitive. If
-            `None`, uses the default parquet options.
-        bloom_filter_enabled: Sets if bloom filter is enabled for the column path. If
-            `None`, uses the default parquet options.
-        bloom_filter_fpp: Sets bloom filter false positive probability for the column
-            path. If `None`, uses the default parquet options.
-        bloom_filter_ndv: Sets bloom filter number of distinct values. If `None`, uses
-            the default parquet options.
+    replacing the global options in ``ParquetWriterOptions``.
     """
 
     def __init__(
@@ -276,7 +259,31 @@ def __init__(
         bloom_filter_fpp: Optional[float] = None,
         bloom_filter_ndv: Optional[int] = None,
     ) -> None:
279- """Initialize the ParquetColumnOptions."""
262+ """Initialize the ParquetColumnOptions.
263+
264+ Args:
265+ encoding: Sets encoding for the column path. Valid values are: ``plain``,
266+ ``plain_dictionary``, ``rle``, ``bit_packed``, ``delta_binary_packed``,
267+ ``delta_length_byte_array``, ``delta_byte_array``, ``rle_dictionary``,
268+ and ``byte_stream_split``. These values are not case-sensitive. If
269+ ``None``, uses the default parquet options
270+ dictionary_enabled: Sets if dictionary encoding is enabled for the column
271+ path. If `None`, uses the default parquet options
272+ compression: Sets default parquet compression codec for the column path.
273+ Valid values are ``uncompressed``, ``snappy``, ``gzip(level)``, ``lzo``,
274+ ``brotli(level)``, ``lz4``, ``zstd(level)``, and ``lz4_raw``. These
275+ values are not case-sensitive. If ``None``, uses the default parquet
276+ options.
277+ statistics_enabled: Sets if statistics are enabled for the column Valid
278+ values are: ``none``, ``chunk``, and ``page`` These values are not case
279+ sensitive. If ``None``, uses the default parquet options.
280+ bloom_filter_enabled: Sets if bloom filter is enabled for the column path.
281+ If ``None``, uses the default parquet options.
282+ bloom_filter_fpp: Sets bloom filter false positive probability for the
283+ column path. If ``None``, uses the default parquet options.
284+ bloom_filter_ndv: Sets bloom filter number of distinct values. If ``None``,
285+ uses the default parquet options.
286+ """
         self.encoding = encoding
         self.dictionary_enabled = dictionary_enabled
         self.compression = compression
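To show how the two classes compose, here is a sketch of per-column overrides built from the arguments above and passed through `column_specific_options`; the column names are made up for illustration:

```python
# Hypothetical column names; substitute those of the table being written.
id_opts = ParquetColumnOptions(
    dictionary_enabled=True,
    bloom_filter_enabled=True,  # bloom filters help point lookups on this column
    bloom_filter_fpp=0.01,
)
payload_opts = ParquetColumnOptions(
    compression="uncompressed",   # already-compressed blobs gain little from zstd
    statistics_enabled="none",
)

# Columns absent from the dict fall back to the file-wide settings.
opts = ParquetWriterOptions(
    compression="zstd(3)",
    column_specific_options={"user_id": id_opts, "payload": payload_opts},
)
```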