- 
                Notifications
    
You must be signed in to change notification settings  - Fork 133
 
Default to ZSTD compression when writing Parquet #981
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 12 commits
0c3fed9
              b1db46c
              819de0d
              56965f4
              df7d65e
              f62a7a8
              b5b3c47
              2362992
              b86b142
              41e1742
              fe502e8
              67529b8
              811f633
              50a58b3
              55fc97e
              73519fe
              File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
| 
          
            
          
           | 
    @@ -35,6 +35,65 @@ | |
| 
     | 
||
| from datafusion._internal import DataFrame as DataFrameInternal | ||
| from datafusion.expr import Expr, SortExpr, sort_or_default | ||
| from enum import Enum | ||
| 
     | 
||
| 
     | 
||
| # excerpt from deltalake | ||
| # https://github.com/apache/datafusion-python/pull/981#discussion_r1905619163 | ||
class Compression(Enum):
    """Enum representing the available compression types for Parquet files.

    Each member's value is the lowercase codec name that is handed to the
    Parquet writer.
    """

    UNCOMPRESSED = "uncompressed"
    SNAPPY = "snappy"
    GZIP = "gzip"
    BROTLI = "brotli"
    LZ4 = "lz4"
    LZO = "lzo"
    # Backward-compatible alias: this member was originally misspelled with a
    # zero ("LZ0" / "lz0"). The Parquet codec is LZO (letter O). Because the
    # value duplicates LZO's, ``Compression.LZ0 is Compression.LZO``.
    LZ0 = "lzo"
    ZSTD = "zstd"
    LZ4_RAW = "lz4_raw"

    @classmethod
    def from_str(cls, value: str) -> "Compression":
        """Convert a string to a Compression enum value.

        The lookup is case-insensitive. The legacy misspelling ``"lz0"`` is
        still accepted and resolves to :py:attr:`Compression.LZO`.

        Args:
            value: The string representation of the compression type.

        Returns:
            The corresponding Compression enum value.

        Raises:
            ValueError: If the string does not match any Compression enum value.
        """
        normalized = value.lower()
        if normalized == "lz0":
            # Keep callers that used the old misspelled value working.
            normalized = "lzo"
        try:
            return cls(normalized)
        except ValueError:
            # The enum's own error adds nothing beyond this message, so the
            # implicit exception context is suppressed.
            raise ValueError(
                f"{value} is not a valid Compression. "
                f"Valid values are: {[item.value for item in cls]}"
            ) from None

    def get_default_level(self) -> int:
        """Get the default compression level for the compression type.

        Returns:
            The default compression level.

        Raises:
            KeyError: If the compression type does not have a default level.
        """
        # GZIP, BROTLI defaults from deltalake
        # https://github.com/apache/datafusion-python/pull/981#discussion_r1905619163
        # ZSTD default from delta-rs
        # https://github.com/apache/datafusion-python/pull/981#discussion_r1904789223
        default_levels = {
            Compression.GZIP: 6,
            Compression.BROTLI: 1,
            Compression.ZSTD: 4,
        }
        try:
            return default_levels[self]
        except KeyError:
            raise KeyError(
                f"{self.value} does not have a compression level."
            ) from None
| 
     | 
||
| 
     | 
||
| class DataFrame: | ||
| 
          
            
          
           | 
    @@ -620,17 +679,34 @@ def write_csv(self, path: str | pathlib.Path, with_header: bool = False) -> None | |
def write_parquet(
    self,
    path: str | pathlib.Path,
    compression: str | Compression = Compression.ZSTD.value,
    compression_level: int | None = None,
) -> None:
    """Execute the :py:class:`DataFrame` and write the results to a Parquet file.

    Args:
        path: Path of the Parquet file to write.
        compression: Compression type to use, given either as a
            :py:class:`Compression` member or as its case-insensitive name
            (e.g. "uncompressed", "snappy", "gzip", "brotli", "lz4",
            "lz4_raw", "zstd"). Default is "zstd".
        compression_level: Compression level to use. When omitted, GZIP,
            BROTLI and ZSTD fall back to their default level (6, 1 and 4
            respectively). For ZSTD, the recommended range is 1 to 22;
            higher levels provide better compression but slower speed.

    Raises:
        ValueError: If ``compression`` is a string that does not name a
            :py:class:`Compression` member.
    """
    if isinstance(compression, Compression):
        codec = compression
    else:
        codec = Compression.from_str(compression)

    # Only these codecs have a default level; for every other codec the
    # caller's value (possibly None) is passed through unchanged.
    has_default_level = codec in {
        Compression.GZIP,
        Compression.BROTLI,
        Compression.ZSTD,
    }
    if compression_level is None and has_default_level:
        compression_level = codec.get_default_level()

    self.df.write_parquet(str(path), codec.value, compression_level)
| 
     | 
||
| def write_json(self, path: str | pathlib.Path) -> None: | ||
| """Execute the :py:class:`DataFrame` and write the results to a JSON file. | ||
| 
          
            
          
           | 
    ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: since the type hint already indicates a `str`, you shouldn't have to repeat it here, per the Google code design spec.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good nit 😄