Skip to content

Commit 9f66389

Browse files
authored
Merge branch 'main' into chore/abc_type_hints
2 parents 806aa61 + 725cf25 commit 9f66389

File tree

11 files changed

+383
-254
lines changed

11 files changed

+383
-254
lines changed

docs/release.rst

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,22 @@ Improvements
2828
~~~~~~~~~~~~
2929
* Raise a custom `UnknownCodecError` when trying to retrieve an unavailable codec.
3030
By :user:`Cas Wognum <cwognum>`.
31+
* Add ``typesize`` argument to ``Blosc`` so that buffers passed to ``encode``
32+
can use that information. Zarr v3 currently treats its Blosc codec as bytes-to-bytes but does retain
33+
the size information, so using it here allows for massive compression ratio gains.
34+
By :user:`Ilan Gold <ilan-gold>`
3135

3236
Fixes
3337
~~~~~
3438
* Remove redundant ``id`` from codec metadata serialization in Zarr3 codecs.
3539
By :user:`Norman Rzepka <normanrz>`, :issue:`685`
40+
* Preallocate output buffers and resize directly as needed.
41+
By :user:`John Kirkham <jakirkham>`, :issue:`656`
42+
43+
Maintenance
44+
~~~~~~~~~~~
45+
* Replace internal ``Buffer`` usage with ``memoryview``\ s.
46+
By :user:`John Kirkham <jakirkham>`, :issue:`656`
3647

3748
.. _release_0.15.0:
3849

numcodecs/blosc.pyx

Lines changed: 94 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,11 @@ import os
99
from deprecated import deprecated
1010

1111

12-
from cpython.buffer cimport PyBUF_ANY_CONTIGUOUS, PyBUF_WRITEABLE
13-
from cpython.bytes cimport PyBytes_FromStringAndSize, PyBytes_AS_STRING
12+
from cpython.bytes cimport PyBytes_AS_STRING, PyBytes_FromStringAndSize
13+
from cpython.memoryview cimport PyMemoryView_GET_BUFFER
1414

15+
from .compat_ext cimport PyBytes_RESIZE, ensure_continguous_memoryview
1516

16-
from .compat_ext cimport Buffer
17-
from .compat_ext import Buffer
1817
from .compat import ensure_contiguous_ndarray
1918
from .abc import Codec
2019

@@ -154,17 +153,16 @@ def _cbuffer_sizes(source):
154153
155154
"""
156155
cdef:
157-
Buffer buffer
156+
memoryview source_mv
157+
const Py_buffer* source_pb
158158
size_t nbytes, cbytes, blocksize
159159

160-
# obtain buffer
161-
buffer = Buffer(source, PyBUF_ANY_CONTIGUOUS)
160+
# obtain source memoryview
161+
source_mv = ensure_continguous_memoryview(source)
162+
source_pb = PyMemoryView_GET_BUFFER(source_mv)
162163

163164
# determine buffer size
164-
blosc_cbuffer_sizes(buffer.ptr, &nbytes, &cbytes, &blocksize)
165-
166-
# release buffers
167-
buffer.release()
165+
blosc_cbuffer_sizes(source_pb.buf, &nbytes, &cbytes, &blocksize)
168166

169167
return nbytes, cbytes, blocksize
170168

@@ -173,16 +171,15 @@ cbuffer_sizes = deprecated(_cbuffer_sizes)
173171
def cbuffer_complib(source):
174172
"""Return the name of the compression library used to compress `source`."""
175173
cdef:
176-
Buffer buffer
174+
memoryview source_mv
175+
const Py_buffer* source_pb
177176

178-
# obtain buffer
179-
buffer = Buffer(source, PyBUF_ANY_CONTIGUOUS)
177+
# obtain source memoryview
178+
source_mv = ensure_continguous_memoryview(source)
179+
source_pb = PyMemoryView_GET_BUFFER(source_mv)
180180

181181
# determine buffer size
182-
complib = blosc_cbuffer_complib(buffer.ptr)
183-
184-
# release buffers
185-
buffer.release()
182+
complib = blosc_cbuffer_complib(source_pb.buf)
186183

187184
complib = complib.decode('ascii')
188185

@@ -202,18 +199,17 @@ def _cbuffer_metainfo(source):
202199
203200
"""
204201
cdef:
205-
Buffer buffer
202+
memoryview source_mv
203+
const Py_buffer* source_pb
206204
size_t typesize
207205
int flags
208206

209-
# obtain buffer
210-
buffer = Buffer(source, PyBUF_ANY_CONTIGUOUS)
207+
# obtain source memoryview
208+
source_mv = ensure_continguous_memoryview(source)
209+
source_pb = PyMemoryView_GET_BUFFER(source_mv)
211210

212211
# determine buffer size
213-
blosc_cbuffer_metainfo(buffer.ptr, &typesize, &flags)
214-
215-
# release buffers
216-
buffer.release()
212+
blosc_cbuffer_metainfo(source_pb.buf, &typesize, &flags)
217213

218214
# decompose flags
219215
if flags & BLOSC_DOSHUFFLE:
@@ -235,7 +231,7 @@ def _err_bad_cname(cname):
235231
err_bad_cname = deprecated(_err_bad_cname)
236232

237233
def compress(source, char* cname, int clevel, int shuffle=SHUFFLE,
238-
int blocksize=AUTOBLOCKS):
234+
int blocksize=AUTOBLOCKS, typesize=None):
239235
"""Compress data.
240236
241237
Parameters
@@ -263,23 +259,34 @@ def compress(source, char* cname, int clevel, int shuffle=SHUFFLE,
263259
"""
264260

265261
cdef:
266-
char *source_ptr
267-
char *dest_ptr
268-
Buffer source_buffer
262+
memoryview source_mv
263+
const Py_buffer* source_pb
264+
const char* source_ptr
269265
size_t nbytes, itemsize
270266
int cbytes
271267
bytes dest
268+
char* dest_ptr
272269

273270
# check valid cname early
274271
cname_str = cname.decode('ascii')
275272
if cname_str not in list_compressors():
276273
_err_bad_cname(cname_str)
277274

278-
# setup source buffer
279-
source_buffer = Buffer(source, PyBUF_ANY_CONTIGUOUS)
280-
source_ptr = source_buffer.ptr
281-
nbytes = source_buffer.nbytes
282-
itemsize = source_buffer.itemsize
275+
# obtain source memoryview
276+
source_mv = ensure_continguous_memoryview(source)
277+
source_pb = PyMemoryView_GET_BUFFER(source_mv)
278+
279+
# extract metadata
280+
source_ptr = <const char*>source_pb.buf
281+
nbytes = source_pb.len
282+
283+
# validate typesize
284+
if isinstance(typesize, int):
285+
if typesize < 1:
286+
raise ValueError(f"Cannot use typesize {typesize} less than 1.")
287+
itemsize = typesize
288+
else:
289+
itemsize = source_pb.itemsize
283290

284291
# determine shuffle
285292
if shuffle == AUTOSHUFFLE:
@@ -328,16 +335,14 @@ def compress(source, char* cname, int clevel, int shuffle=SHUFFLE,
328335
cname, blocksize, 1)
329336

330337
finally:
331-
332-
# release buffers
333-
source_buffer.release()
338+
pass
334339

335340
# check compression was successful
336341
if cbytes <= 0:
337342
raise RuntimeError('error during blosc compression: %d' % cbytes)
338343

339344
# resize after compression
340-
dest = dest[:cbytes]
345+
PyBytes_RESIZE(dest, cbytes)
341346

342347
return dest
343348

@@ -361,30 +366,36 @@ def decompress(source, dest=None):
361366
"""
362367
cdef:
363368
int ret
364-
char *source_ptr
365-
char *dest_ptr
366-
Buffer source_buffer
367-
Buffer dest_buffer = None
369+
memoryview source_mv
370+
const Py_buffer* source_pb
371+
const char* source_ptr
372+
memoryview dest_mv
373+
Py_buffer* dest_pb
374+
char* dest_ptr
368375
size_t nbytes, cbytes, blocksize
369376

370-
# setup source buffer
371-
source_buffer = Buffer(source, PyBUF_ANY_CONTIGUOUS)
372-
source_ptr = source_buffer.ptr
377+
# obtain source memoryview
378+
source_mv = ensure_continguous_memoryview(source)
379+
source_pb = PyMemoryView_GET_BUFFER(source_mv)
380+
381+
# get source pointer
382+
source_ptr = <const char*>source_pb.buf
373383

374384
# determine buffer size
375385
blosc_cbuffer_sizes(source_ptr, &nbytes, &cbytes, &blocksize)
376386

377387
# setup destination buffer
378388
if dest is None:
379389
# allocate memory
380-
dest = PyBytes_FromStringAndSize(NULL, nbytes)
381-
dest_ptr = PyBytes_AS_STRING(dest)
382-
dest_nbytes = nbytes
390+
dest_1d = dest = PyBytes_FromStringAndSize(NULL, nbytes)
383391
else:
384-
arr = ensure_contiguous_ndarray(dest)
385-
dest_buffer = Buffer(arr, PyBUF_ANY_CONTIGUOUS | PyBUF_WRITEABLE)
386-
dest_ptr = dest_buffer.ptr
387-
dest_nbytes = dest_buffer.nbytes
392+
dest_1d = ensure_contiguous_ndarray(dest)
393+
394+
# obtain dest memoryview
395+
dest_mv = memoryview(dest_1d)
396+
dest_pb = PyMemoryView_GET_BUFFER(dest_mv)
397+
dest_ptr = <char*>dest_pb.buf
398+
dest_nbytes = dest_pb.len
388399

389400
try:
390401

@@ -403,11 +414,7 @@ def decompress(source, dest=None):
403414
ret = blosc_decompress_ctx(source_ptr, dest_ptr, nbytes, 1)
404415

405416
finally:
406-
407-
# release buffers
408-
source_buffer.release()
409-
if dest_buffer is not None:
410-
dest_buffer.release()
417+
pass
411418

412419
# handle errors
413420
if ret <= 0:
@@ -444,14 +451,20 @@ def _decompress_partial(source, start, nitems, dest=None):
444451
int encoding_size
445452
int nitems_bytes
446453
int start_bytes
447-
char *source_ptr
448-
char *dest_ptr
449-
Buffer source_buffer
450-
Buffer dest_buffer = None
454+
memoryview source_mv
455+
const Py_buffer* source_pb
456+
const char* source_ptr
457+
memoryview dest_mv
458+
Py_buffer* dest_pb
459+
char* dest_ptr
460+
size_t dest_nbytes
461+
462+
# obtain source memoryview
463+
source_mv = ensure_continguous_memoryview(source)
464+
source_pb = PyMemoryView_GET_BUFFER(source_mv)
451465

452-
# setup source buffer
453-
source_buffer = Buffer(source, PyBUF_ANY_CONTIGUOUS)
454-
source_ptr = source_buffer.ptr
466+
# setup source pointer
467+
source_ptr = <const char*>source_pb.buf
455468

456469
# get encoding size from source buffer header
457470
encoding_size = source[3]
@@ -462,26 +475,25 @@ def _decompress_partial(source, start, nitems, dest=None):
462475

463476
# setup destination buffer
464477
if dest is None:
465-
dest = PyBytes_FromStringAndSize(NULL, nitems_bytes)
466-
dest_ptr = PyBytes_AS_STRING(dest)
467-
dest_nbytes = nitems_bytes
478+
# allocate memory
479+
dest_1d = dest = PyBytes_FromStringAndSize(NULL, nitems_bytes)
468480
else:
469-
arr = ensure_contiguous_ndarray(dest)
470-
dest_buffer = Buffer(arr, PyBUF_ANY_CONTIGUOUS | PyBUF_WRITEABLE)
471-
dest_ptr = dest_buffer.ptr
472-
dest_nbytes = dest_buffer.nbytes
481+
dest_1d = ensure_contiguous_ndarray(dest)
482+
483+
# obtain dest memoryview
484+
dest_mv = memoryview(dest_1d)
485+
dest_pb = PyMemoryView_GET_BUFFER(dest_mv)
486+
dest_ptr = <char*>dest_pb.buf
487+
dest_nbytes = dest_pb.len
473488

474489
# try decompression
475490
try:
476491
if dest_nbytes < nitems_bytes:
477492
raise ValueError('destination buffer too small; expected at least %s, '
478493
'got %s' % (nitems_bytes, dest_nbytes))
479494
ret = blosc_getitem(source_ptr, start, nitems, dest_ptr)
480-
481495
finally:
482-
source_buffer.release()
483-
if dest_buffer is not None:
484-
dest_buffer.release()
496+
pass
485497

486498
# ret refers to the number of bytes returned from blosc_getitem.
487499
if ret <= 0:
@@ -552,6 +564,8 @@ class Blosc(Codec):
552564
blocksize : int
553565
The requested size of the compressed blocks. If 0 (default), an automatic
554566
blocksize will be used.
567+
typesize : int, optional
568+
The size in bytes of uncompressed array elements.
555569
556570
See Also
557571
--------
@@ -566,7 +580,9 @@ class Blosc(Codec):
566580
AUTOSHUFFLE = AUTOSHUFFLE
567581
max_buffer_size = 2**31 - 1
568582

569-
def __init__(self, cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=AUTOBLOCKS):
583+
def __init__(self, cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=AUTOBLOCKS, typesize=None):
584+
if isinstance(typesize, int) and typesize < 1:
585+
raise ValueError(f"Cannot use typesize {typesize} less than 1.")
570586
self.cname = cname
571587
if isinstance(cname, str):
572588
self._cname_bytes = cname.encode('ascii')
@@ -575,10 +591,11 @@ class Blosc(Codec):
575591
self.clevel = clevel
576592
self.shuffle = shuffle
577593
self.blocksize = blocksize
594+
self.typesize = typesize
578595

579596
def encode(self, buf):
580597
buf = ensure_contiguous_ndarray(buf, self.max_buffer_size)
581-
return compress(buf, self._cname_bytes, self.clevel, self.shuffle, self.blocksize)
598+
return compress(buf, self._cname_bytes, self.clevel, self.shuffle, self.blocksize, self.typesize)
582599

583600
def decode(self, buf, out=None):
584601
buf = ensure_contiguous_ndarray(buf, self.max_buffer_size)

0 commit comments

Comments
 (0)