|  | 
|  | 1 | +"""Python bindings to Zstandard (zstd) compression library, the API style is | 
|  | 2 | +similar to Python's bz2/lzma/zlib modules. | 
|  | 3 | +""" | 
|  | 4 | + | 
# Public API of this package, grouped by the module that defines each name.
__all__ = (
    # From this file
    "compressionLevel_values",
    "get_frame_info",
    "CParameter",
    "DParameter",
    "Strategy",
    "finalize_dict",
    "train_dict",
    "zstd_support_multithread",
    "compress",
    "decompress",
    # From _zstd (pulled in below via ``from _zstd import *``)
    "ZstdCompressor",
    "ZstdDecompressor",
    "ZstdDict",
    "ZstdError",
    "get_frame_size",
    "zstd_version",
    "zstd_version_info",
    # From compression.zstd.zstdfile
    "open",
    "ZstdFile",
)
|  | 29 | + | 
|  | 30 | +from collections import namedtuple | 
|  | 31 | +from enum import IntEnum | 
|  | 32 | +from functools import lru_cache | 
|  | 33 | + | 
|  | 34 | +from compression.zstd.zstdfile import ZstdFile, open | 
|  | 35 | +from _zstd import * | 
|  | 36 | + | 
|  | 37 | +import _zstd | 
|  | 38 | + | 
|  | 39 | + | 
# Module-level aliases for private attributes of the _zstd extension module.
_ZSTD_CStreamSizes = _zstd._ZSTD_CStreamSizes
_ZSTD_DStreamSizes = _zstd._ZSTD_DStreamSizes
_train_dict = _zstd._train_dict
_finalize_dict = _zstd._finalize_dict


# TODO(emmatyping): these should be dataclasses or some other class, not namedtuples

# compressionLevel_values: a (default, min, max) namedtuple built by unpacking
# _zstd._compressionLevel_values.
_nt_values = namedtuple("values", ["default", "min", "max"])
compressionLevel_values = _nt_values(*_zstd._compressionLevel_values)


# Return type of get_frame_info().
_nt_frame_info = namedtuple("frame_info", ["decompressed_size", "dictionary_id"])
|  | 54 | + | 
|  | 55 | + | 
def get_frame_info(frame_buffer):
    """Read the header of a zstd frame and report what it declares.

    Parameter
    frame_buffer: A bytes-like object. It should start at the beginning of
                  a frame and include at least the frame header (6 to
                  18 bytes).

    Return a two-item namedtuple: (decompressed_size, dictionary_id)

    decompressed_size is None when the decompressed size is unknown.

    dictionary_id is a 32-bit unsigned integer value. 0 means the dictionary
    ID was not recorded in the frame header; the frame may or may not need a
    dictionary to be decoded, and the ID of such a dictionary is not specified.

    More items may be appended to the namedtuple in the future."""
    # Unpack with * (rather than fixed names) so the namedtuple constructor
    # receives whatever the extension function returns.
    return _nt_frame_info(*_zstd._get_frame_info(frame_buffer))
|  | 76 | + | 
|  | 77 | + | 
|  | 78 | +def _nbytes(dat): | 
|  | 79 | +    if isinstance(dat, (bytes, bytearray)): | 
|  | 80 | +        return len(dat) | 
|  | 81 | +    with memoryview(dat) as mv: | 
|  | 82 | +        return mv.nbytes | 
|  | 83 | + | 
|  | 84 | + | 
def train_dict(samples, dict_size):
    """Train a zstd dictionary from samples and return it as a ZstdDict.

    Parameters
    samples:   An iterable of samples; each sample is a bytes-like object
               that represents a file.
    dict_size: The dictionary's maximum size, in bytes.

    Raises TypeError if dict_size is not an int, and ValueError if the
    samples contain no data at all.
    """
    if not isinstance(dict_size, int):
        raise TypeError('dict_size argument should be an int object.')

    # Walk the iterable once, pairing every sample with its byte length.
    sized_samples = [(sample, _nbytes(sample)) for sample in samples]

    # The trainer takes all samples concatenated into one flat buffer...
    samples_bytes = b''.join(sample for sample, _ in sized_samples)
    if not samples_bytes:
        raise ValueError("The samples are empty content, can't train dictionary.")

    # ...plus a list holding each sample's size.
    samples_size_list = [size for _, size in sized_samples]

    return ZstdDict(_train_dict(samples_bytes, samples_size_list, dict_size))
|  | 114 | + | 
|  | 115 | + | 
def finalize_dict(zstd_dict, samples, dict_size, level):
    """Finalize a zstd dictionary and return it as a ZstdDict.

    Starting from custom content used as the basis for a dictionary, and a
    set of samples, finalize the dictionary by adding headers and statistics
    according to the zstd dictionary format.

    You may compose an effective dictionary content by hand, use it as the
    basis dictionary, and use some samples to finalize a dictionary. The basis
    dictionary can be a "raw content" dictionary, see is_raw parameter in
    ZstdDict.__init__ method.

    Parameters
    zstd_dict: A ZstdDict object, the basis dictionary.
    samples:   An iterable of samples; each sample is a bytes-like object
               that represents a file.
    dict_size: The dictionary's maximum size, in bytes.
    level:     The compression level expected to be used in production. The
               statistics for each compression level differ, so tuning the
               dictionary for the compression level can help quite a bit.

    Raises TypeError if an argument has the wrong type, and ValueError if the
    samples contain no data at all.
    """
    # Validate argument types before touching the samples.
    if not isinstance(zstd_dict, ZstdDict):
        raise TypeError('zstd_dict argument should be a ZstdDict object.')
    if not isinstance(dict_size, int):
        raise TypeError('dict_size argument should be an int object.')
    if not isinstance(level, int):
        raise TypeError('level argument should be an int object.')

    # Walk the iterable once, pairing every sample with its byte length.
    sized_samples = [(sample, _nbytes(sample)) for sample in samples]

    # All samples concatenated into one flat buffer.
    samples_bytes = b''.join(sample for sample, _ in sized_samples)
    if not samples_bytes:
        raise ValueError("The samples are empty content, can't finalize dictionary.")

    # A list holding each sample's size.
    samples_size_list = [size for _, size in sized_samples]

    # Arguments, in order: existing dictionary content, flat sample buffer,
    # per-sample sizes, maximal dictionary size, expected compression level.
    finalized_content = _finalize_dict(
        zstd_dict.dict_content,
        samples_bytes,
        samples_size_list,
        dict_size,
        level,
    )
    return _zstd.ZstdDict(finalized_content)
|  | 167 | + | 
def compress(data, level=None, options=None, zstd_dict=None):
    """Compress *data* in one shot and return the zstd compressed bytes.

    The optional arguments *level*, *options*, and *zstd_dict* are passed
    straight to ZstdCompressor; refer to its docstring for their meaning.

    For incremental compression, use a ZstdCompressor instead.
    """
    # A fresh compressor is used for every call; FLUSH_FRAME is passed as the
    # mode for the single compress() call.
    return ZstdCompressor(
        level=level,
        options=options,
        zstd_dict=zstd_dict,
    ).compress(data, ZstdCompressor.FLUSH_FRAME)
|  | 178 | + | 
def decompress(data, zstd_dict=None, options=None):
    """Decompress one or more frames of data and return the joined output.

    The optional arguments *zstd_dict* and *options* are passed straight to
    ZstdDecompressor; refer to its docstring for their meaning.

    Raises ZstdError if the first frame cannot be decompressed or if the data
    ends before a frame is complete. Data left over after at least one
    complete frame that fails to decompress is ignored.

    For incremental decompression, use a ZstdDecompressor instead.
    """
    frames = []
    remaining = data
    while True:
        # Each frame gets its own decompressor.
        decompressor = ZstdDecompressor(options=options, zstd_dict=zstd_dict)
        try:
            frame = decompressor.decompress(remaining)
        except ZstdError:
            if not frames:
                raise  # Error on the first iteration; bail out.
            break  # Leftover data is not a valid zstd frame; ignore it.
        frames.append(frame)
        if not decompressor.eof:
            raise ZstdError("Compressed data ended before the "
                            "end-of-stream marker was reached")
        # Whatever follows the finished frame is fed to the next iteration.
        remaining = decompressor.unused_data
        if not remaining:
            break
    return b"".join(frames)
|  | 205 | + | 
|  | 206 | +class _UnsupportedCParameter: | 
|  | 207 | +    def __set_name__(self, _, name): | 
|  | 208 | +        self.name = name | 
|  | 209 | + | 
|  | 210 | +    def __get__(self, *_, **__): | 
|  | 211 | +        msg = ("%s CParameter not available, zstd version is %s.") % ( | 
|  | 212 | +            self.name, | 
|  | 213 | +            zstd_version, | 
|  | 214 | +        ) | 
|  | 215 | +        raise NotImplementedError(msg) | 
|  | 216 | + | 
|  | 217 | + | 
class CParameter(IntEnum):
    """Compression parameters.

    Each member's value is the matching ``_ZSTD_c_*`` constant exported by
    the ``_zstd`` extension module.
    """

    compressionLevel = _zstd._ZSTD_c_compressionLevel
    windowLog = _zstd._ZSTD_c_windowLog
    hashLog = _zstd._ZSTD_c_hashLog
    chainLog = _zstd._ZSTD_c_chainLog
    searchLog = _zstd._ZSTD_c_searchLog
    minMatch = _zstd._ZSTD_c_minMatch
    targetLength = _zstd._ZSTD_c_targetLength
    strategy = _zstd._ZSTD_c_strategy

    # Placeholder descriptor: reading this attribute raises
    # NotImplementedError instead of yielding a parameter value.
    targetCBlockSize = _UnsupportedCParameter()

    enableLongDistanceMatching = _zstd._ZSTD_c_enableLongDistanceMatching
    ldmHashLog = _zstd._ZSTD_c_ldmHashLog
    ldmMinMatch = _zstd._ZSTD_c_ldmMinMatch
    ldmBucketSizeLog = _zstd._ZSTD_c_ldmBucketSizeLog
    ldmHashRateLog = _zstd._ZSTD_c_ldmHashRateLog

    contentSizeFlag = _zstd._ZSTD_c_contentSizeFlag
    checksumFlag = _zstd._ZSTD_c_checksumFlag
    dictIDFlag = _zstd._ZSTD_c_dictIDFlag

    nbWorkers = _zstd._ZSTD_c_nbWorkers
    jobSize = _zstd._ZSTD_c_jobSize
    overlapLog = _zstd._ZSTD_c_overlapLog

    # Unbounded cache keyed on the member, so _zstd._get_param_bounds is
    # called at most once per member.
    @lru_cache(maxsize=None)
    def bounds(self):
        """Return lower and upper bounds of a compression parameter, both inclusive."""
        # 1 means compression parameter
        return _zstd._get_param_bounds(1, self.value)
|  | 251 | + | 
|  | 252 | + | 
class DParameter(IntEnum):
    """Decompression parameters.

    Each member's value is the matching ``_ZSTD_d_*`` constant exported by
    the ``_zstd`` extension module.
    """

    windowLogMax = _zstd._ZSTD_d_windowLogMax

    # Unbounded cache keyed on the member, so _zstd._get_param_bounds is
    # called at most once per member.
    @lru_cache(maxsize=None)
    def bounds(self):
        """Return lower and upper bounds of a decompression parameter, both inclusive."""
        # 0 means decompression parameter
        return _zstd._get_param_bounds(0, self.value)
|  | 263 | + | 
|  | 264 | + | 
class Strategy(IntEnum):
    """Compression strategies, listed from fastest to strongest.

    Note: new strategies _might_ be added in the future, only the order
    (from fast to strong) is guaranteed.
    """

    # Each value is the matching ``_ZSTD_*`` constant exported by ``_zstd``.
    fast = _zstd._ZSTD_fast
    dfast = _zstd._ZSTD_dfast
    greedy = _zstd._ZSTD_greedy
    lazy = _zstd._ZSTD_lazy
    lazy2 = _zstd._ZSTD_lazy2
    btlazy2 = _zstd._ZSTD_btlazy2
    btopt = _zstd._ZSTD_btopt
    btultra = _zstd._ZSTD_btultra
    btultra2 = _zstd._ZSTD_btultra2
|  | 281 | + | 
|  | 282 | + | 
# Set CParameter/DParameter types for validity check
# (hands both enum classes to the _zstd extension module at import time).
_zstd._set_parameter_types(CParameter, DParameter)

# True unless the bounds reported for nbWorkers are exactly (0, 0).
# NOTE(review): presumably (0, 0) means the underlying zstd library was built
# without multi-threading support -- confirm against the zstd documentation.
zstd_support_multithread = CParameter.nbWorkers.bounds() != (0, 0)
0 commit comments