Skip to content

Commit 5c1e360

Browse files
committed
Add Python files
1 parent e45c22a commit 5c1e360

File tree

13 files changed

+3549
-9
lines changed

13 files changed

+3549
-9
lines changed

Lib/compression/zstd/__init__.py

Lines changed: 286 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,286 @@
1+
"""Python bindings to the Zstandard (zstd) compression library. The API style
2+
is similar to that of Python's bz2/lzma/zlib modules.
3+
"""
4+
5+
# Public names of the package, grouped by the module that provides each one.
__all__ = (
    # Defined in this file
    "compressionLevel_values",
    "get_frame_info",
    "CParameter",
    "DParameter",
    "Strategy",
    "finalize_dict",
    "train_dict",
    "zstd_support_multithread",
    "compress",
    "decompress",
    # Provided by the _zstd module (pulled in with ``from _zstd import *``)
    "ZstdCompressor",
    "ZstdDecompressor",
    "ZstdDict",
    "ZstdError",
    "get_frame_size",
    "zstd_version",
    "zstd_version_info",
    # Provided by compression.zstd.zstdfile
    "open",
    "ZstdFile",
)
29+
30+
from collections import namedtuple
31+
from enum import IntEnum
32+
from functools import lru_cache
33+
34+
from compression.zstd.zstdfile import ZstdFile, open
35+
from _zstd import *
36+
37+
import _zstd
38+
39+
40+
# Private module-level aliases for names provided by the _zstd module.
# NOTE(review): the two *StreamSizes values presumably describe zstd's
# recommended streaming buffer sizes -- confirm against _zstd.
_ZSTD_CStreamSizes = _zstd._ZSTD_CStreamSizes
_ZSTD_DStreamSizes = _zstd._ZSTD_DStreamSizes
# Low-level helpers wrapped by train_dict() and finalize_dict().
_train_dict = _zstd._train_dict
_finalize_dict = _zstd._finalize_dict


# TODO(emmatyping): these should be dataclasses or some other class, not namedtuples

# compressionLevel_values: a (default, min, max) namedtuple built from
# _zstd._compressionLevel_values.
_nt_values = namedtuple("values", ["default", "min", "max"])
compressionLevel_values = _nt_values(*_zstd._compressionLevel_values)


# Result type returned by get_frame_info().
_nt_frame_info = namedtuple("frame_info", ["decompressed_size", "dictionary_id"])
54+
55+
56+
def get_frame_info(frame_buffer):
    """Return information about a zstd frame, read from its header.

    Parameter
    frame_buffer: A bytes-like object. It should start at the beginning of a
                  frame and include at least the frame header (6 to 18
                  bytes).

    Return a two-item namedtuple: (decompressed_size, dictionary_id)

    decompressed_size is None when the decompressed size is unknown.

    dictionary_id is a 32-bit unsigned integer value. 0 means that no
    dictionary ID was recorded in the frame header: the frame may or may not
    need a dictionary to be decoded, and the ID of such a dictionary is not
    specified.

    More items may be appended to the namedtuple in the future.
    """
    # _zstd hands back a plain tuple; wrap it so the fields can be accessed
    # by name.
    return _nt_frame_info(*_zstd._get_frame_info(frame_buffer))
76+
77+
78+
def _nbytes(dat):
    """Return the size in bytes of the bytes-like object *dat*."""
    if not isinstance(dat, (bytes, bytearray)):
        # Generic buffer: len() would count items, not bytes, so ask a
        # memoryview for the byte size and release the view right away.
        with memoryview(dat) as view:
            return view.nbytes
    # bytes and bytearray hold one byte per item, so len() is the byte size.
    return len(dat)
83+
84+
85+
def train_dict(samples, dict_size):
    """Train a zstd dictionary from samples and return it as a ZstdDict.

    Parameters
    samples:   An iterable of samples; each sample is a bytes-like object
               that represents a file.
    dict_size: The maximum size of the dictionary, in bytes.

    Raises TypeError if dict_size is not an int, and ValueError if the
    samples contain no data at all.
    """
    if not isinstance(dict_size, int):
        raise TypeError('dict_size argument should be an int object.')

    # _train_dict() takes all samples concatenated into one flat buffer,
    # plus a list holding the size of each individual sample.
    sample_list = []
    sample_sizes = []
    for sample in samples:
        sample_list.append(sample)
        sample_sizes.append(_nbytes(sample))
    flat_samples = b''.join(sample_list)

    if len(flat_samples) == 0:
        raise ValueError("The samples are empty content, can't train dictionary.")

    return ZstdDict(_train_dict(flat_samples, sample_sizes, dict_size))
114+
115+
116+
def finalize_dict(zstd_dict, samples, dict_size, level):
    """Finalize a zstd dictionary, return a ZstdDict object.

    Given a custom content as a basis for dictionary, and a set of samples,
    finalize dictionary by adding headers and statistics according to the zstd
    dictionary format.

    You may compose an effective dictionary content by hand, which is used as
    basis dictionary, and use some samples to finalize a dictionary. The basis
    dictionary can be a "raw content" dictionary, see is_raw parameter in
    ZstdDict.__init__ method.

    Parameters
    zstd_dict: A ZstdDict object, basis dictionary.
    samples:   An iterable of samples, a sample is a bytes-like object
               represents a file.
    dict_size: The dictionary's maximum size, in bytes.
    level:     The compression level expected to use in production. The
               statistics for each compression level differ, so tuning the
               dictionary for the compression level can help quite a bit.

    Raises TypeError if zstd_dict is not a ZstdDict or if dict_size or level
    is not an int, and ValueError if the samples contain no data at all.
    """
    # Check arguments' type
    if not isinstance(zstd_dict, ZstdDict):
        raise TypeError('zstd_dict argument should be a ZstdDict object.')
    if not isinstance(dict_size, int):
        raise TypeError('dict_size argument should be an int object.')
    if not isinstance(level, int):
        raise TypeError('level argument should be an int object.')

    # Prepare data: keep every sample together with its size in bytes.
    chunks = []
    chunk_sizes = []
    for chunk in samples:
        chunks.append(chunk)
        chunk_sizes.append(_nbytes(chunk))

    chunks = b''.join(chunks)
    if not chunks:
        raise ValueError("The samples are empty content, can't finalize dictionary.")

    # Arguments of _finalize_dict(), in order:
    #   custom_dict_bytes: existing dictionary.
    #   samples_bytes: samples stored concatenated in a single flat buffer.
    #   samples_size_list: a list of each sample's size.
    #   dict_size: maximal size of the dictionary, in bytes.
    #   compression_level: compression level expected to use in production.
    dict_content = _finalize_dict(zstd_dict.dict_content,
                                  chunks, chunk_sizes,
                                  dict_size, level)

    # Use the same ZstdDict name as the isinstance() check above and as
    # train_dict(), rather than reaching through the _zstd module.
    return ZstdDict(dict_content)
167+
168+
def compress(data, level=None, options=None, zstd_dict=None):
    """Compress *data* in one shot, return a bytes object of zstd compressed data.

    The optional arguments *level*, *options*, and *zstd_dict* are handed to
    ZstdCompressor; refer to its docstring for their description.

    For incremental compression, use a ZstdCompressor instead.
    """
    # A throw-away compressor is enough: all of the data goes through a
    # single compress() call using the FLUSH_FRAME mode.
    return ZstdCompressor(
        level=level, options=options, zstd_dict=zstd_dict
    ).compress(data, ZstdCompressor.FLUSH_FRAME)
178+
179+
def decompress(data, zstd_dict=None, options=None):
    """Decompress one or more frames of data and return the joined result.

    The optional arguments *zstd_dict* and *options* are handed to
    ZstdDecompressor; refer to its docstring for their description.

    Raises ZstdError if the first frame cannot be decoded, or if the data
    ends before a frame is complete.

    For incremental decompression, use a ZstdDecompressor instead.
    """
    frames = []
    remaining = data
    # Each pass uses a fresh decompressor and continues with whatever input
    # the previous pass left unused. The loop always runs at least once.
    while True:
        decompressor = ZstdDecompressor(options=options, zstd_dict=zstd_dict)
        try:
            frame = decompressor.decompress(remaining)
        except ZstdError:
            if not frames:
                raise  # Error on the first iteration; bail out.
            # Leftover data is not a valid zstd frame; ignore it.
            return b"".join(frames)
        frames.append(frame)
        if not decompressor.eof:
            raise ZstdError("Compressed data ended before the "
                            "end-of-stream marker was reached")
        remaining = decompressor.unused_data
        if not remaining:
            return b"".join(frames)
205+
206+
class _UnsupportedCParameter:
    """Descriptor standing in for a CParameter that is not available.

    Reading the attribute always raises NotImplementedError; the message
    names the parameter and the zstd version.
    """

    def __set_name__(self, _, name):
        # Called when the owning class is created; remember the attribute
        # name so the error message can mention it.
        self.name = name

    def __get__(self, *_, **__):
        details = (self.name, zstd_version)
        raise NotImplementedError(
            "%s CParameter not available, zstd version is %s." % details
        )
216+
217+
218+
class CParameter(IntEnum):
    """Compression parameters"""

    # General compression parameters
    compressionLevel = _zstd._ZSTD_c_compressionLevel
    windowLog = _zstd._ZSTD_c_windowLog
    hashLog = _zstd._ZSTD_c_hashLog
    chainLog = _zstd._ZSTD_c_chainLog
    searchLog = _zstd._ZSTD_c_searchLog
    minMatch = _zstd._ZSTD_c_minMatch
    targetLength = _zstd._ZSTD_c_targetLength
    strategy = _zstd._ZSTD_c_strategy

    # Not an enum member: this is a descriptor, and reading
    # CParameter.targetCBlockSize raises NotImplementedError.
    targetCBlockSize = _UnsupportedCParameter()

    # Long distance matching (LDM) parameters
    enableLongDistanceMatching = _zstd._ZSTD_c_enableLongDistanceMatching
    ldmHashLog = _zstd._ZSTD_c_ldmHashLog
    ldmMinMatch = _zstd._ZSTD_c_ldmMinMatch
    ldmBucketSizeLog = _zstd._ZSTD_c_ldmBucketSizeLog
    ldmHashRateLog = _zstd._ZSTD_c_ldmHashRateLog

    # Frame flags
    contentSizeFlag = _zstd._ZSTD_c_contentSizeFlag
    checksumFlag = _zstd._ZSTD_c_checksumFlag
    dictIDFlag = _zstd._ZSTD_c_dictIDFlag

    # Worker/job parameters
    nbWorkers = _zstd._ZSTD_c_nbWorkers
    jobSize = _zstd._ZSTD_c_jobSize
    overlapLog = _zstd._ZSTD_c_overlapLog

    @lru_cache(maxsize=None)
    def bounds(self):
        """Return lower and upper bounds of a compression parameter, both inclusive."""
        # _get_param_bounds() takes 1 to mean "compression parameter".
        is_compression = 1
        return _zstd._get_param_bounds(is_compression, self.value)
251+
252+
253+
class DParameter(IntEnum):
    """Decompression parameters"""

    windowLogMax = _zstd._ZSTD_d_windowLogMax

    @lru_cache(maxsize=None)
    def bounds(self):
        """Return lower and upper bounds of a decompression parameter, both inclusive."""
        # _get_param_bounds() takes 0 to mean "decompression parameter".
        is_compression = 0
        return _zstd._get_param_bounds(is_compression, self.value)
263+
264+
265+
class Strategy(IntEnum):
    """Compression strategies, listed from fastest to strongest.

    Note: new strategies might be added in the future; only the order
    (from fast to strong) is guaranteed.
    """

    # Values come from the _zstd module. The members are declared in the
    # fast-to-strong order that the docstring promises.
    fast = _zstd._ZSTD_fast
    dfast = _zstd._ZSTD_dfast
    greedy = _zstd._ZSTD_greedy
    lazy = _zstd._ZSTD_lazy
    lazy2 = _zstd._ZSTD_lazy2
    btlazy2 = _zstd._ZSTD_btlazy2
    btopt = _zstd._ZSTD_btopt
    btultra = _zstd._ZSTD_btultra
    btultra2 = _zstd._ZSTD_btultra2
281+
282+
283+
# Set CParameter/DParameter types for validity check
# (hands the two enum classes defined above to the _zstd module).
_zstd._set_parameter_types(CParameter, DParameter)

# True unless the bounds reported for nbWorkers are exactly (0, 0).
# NOTE(review): (0, 0) presumably means the underlying zstd library was built
# without multi-threaded compression support -- confirm against _zstd.
zstd_support_multithread = CParameter.nbWorkers.bounds() != (0, 0)

0 commit comments

Comments
 (0)