diff --git a/numcodecs/tests/test_vlen_bytes.py b/numcodecs/tests/test_vlen_bytes.py index 467c9a85..3546dbaa 100644 --- a/numcodecs/tests/test_vlen_bytes.py +++ b/numcodecs/tests/test_vlen_bytes.py @@ -1,4 +1,3 @@ -import sys import unittest import numpy as np @@ -85,9 +84,6 @@ def test_decode_errors(): codec.decode(enc, out=np.zeros(10, dtype='i4')) -# TODO: fix this test on GitHub actions somehow... -# See https://github.com/zarr-developers/numcodecs/issues/683 -@pytest.mark.skipif(sys.platform == "darwin", reason="Test is failing on macOS on GitHub actions.") def test_encode_none(): a = np.array([b'foo', None, b'bar'], dtype=object) codec = VLenBytes() diff --git a/numcodecs/vlen.pyx b/numcodecs/vlen.pyx index ba8d1622..14530ecf 100644 --- a/numcodecs/vlen.pyx +++ b/numcodecs/vlen.pyx @@ -10,23 +10,10 @@ cimport cython from libc.stdint cimport uint8_t, uint32_t from libc.string cimport memcpy -from cpython.buffer cimport PyBuffer_IsContiguous -from cpython.bytearray cimport ( - PyByteArray_AS_STRING, - PyByteArray_FromStringAndSize, -) -from cpython.bytes cimport ( - PyBytes_AS_STRING, - PyBytes_GET_SIZE, - PyBytes_Check, - PyBytes_FromStringAndSize, -) +from cpython.bytearray cimport PyByteArray_FromStringAndSize +from cpython.bytes cimport PyBytes_FromStringAndSize from cpython.memoryview cimport PyMemoryView_GET_BUFFER -from cpython.unicode cimport ( - PyUnicode_AsUTF8String, - PyUnicode_Check, - PyUnicode_FromStringAndSize, -) +from cpython.unicode cimport PyUnicode_FromStringAndSize from numpy cimport ndarray @@ -89,15 +76,15 @@ class VLenUTF8(Codec): @cython.boundscheck(False) def encode(self, buf): cdef: - Py_ssize_t i, l, n_items, data_length, total_length + Py_ssize_t i, L, n_items, data_length ndarray[object, ndim=1] input_values object[:] encoded_values int[:] encoded_lengths - char* encv bytes b bytearray out char* data - object u + object o + unicode u # normalise input input_values = np.asarray(buf, dtype=object).reshape(-1, order='A') @@ -110,36 +97,33 @@ class VLenUTF8(Codec): encoded_lengths = np.empty(n_items, dtype=np.intc) # first iteration to convert to bytes - data_length = 0 + data_length = HEADER_LENGTH for i in range(n_items): - u = input_values[i] - if u is None or u == 0: # treat these as missing value, normalize - u = '' - elif not PyUnicode_Check(u): - raise TypeError('expected unicode string, found %r' % u) - b = PyUnicode_AsUTF8String(u) - l = PyBytes_GET_SIZE(b) + o = input_values[i] + # replace missing value and coerce to typed data + u = "" if o is None or o == 0 else o + b = u.encode("utf-8") + L = len(b) encoded_values[i] = b - data_length += l + HEADER_LENGTH - encoded_lengths[i] = l + data_length += L + HEADER_LENGTH + encoded_lengths[i] = L # setup output - total_length = HEADER_LENGTH + data_length - out = PyByteArray_FromStringAndSize(NULL, total_length) + out = PyByteArray_FromStringAndSize(NULL, data_length) # write header - data = PyByteArray_AS_STRING(out) + data = out store_le32(data, n_items) # second iteration, store data data += HEADER_LENGTH for i in range(n_items): - l = encoded_lengths[i] - store_le32(data, l) + L = encoded_lengths[i] + store_le32(data, L) data += HEADER_LENGTH - encv = PyBytes_AS_STRING(encoded_values[i]) - memcpy(data, encv, l) - data += l + b = encoded_values[i] + memcpy(data, b, L) + data += L return out @@ -151,16 +135,14 @@ class VLenUTF8(Codec): const Py_buffer* buf_pb const char* data const char* data_end - Py_ssize_t i, l, n_items, data_length + Py_ssize_t i, L, n_items, data_length # obtain memoryview buf = ensure_contiguous_ndarray(buf) - buf_mv = memoryview(buf) + buf_mv = ensure_continguous_memoryview(buf) buf_pb = PyMemoryView_GET_BUFFER(buf_mv) # sanity checks - if not PyBuffer_IsContiguous(buf_pb, b'A'): - raise BufferError("`buf` must contain contiguous memory") if buf_pb.len < HEADER_LENGTH: raise ValueError('corrupt buffer, missing or truncated header') @@ -184,12 +166,12 @@ class VLenUTF8(Codec): for i in range(n_items): if data + HEADER_LENGTH > data_end: raise ValueError('corrupt buffer, data seem truncated') - l = load_le32(data) + L = load_le32(data) data += HEADER_LENGTH - if data + l > data_end: + if data + L > data_end: raise ValueError('corrupt buffer, data seem truncated') - out[i] = PyUnicode_FromStringAndSize(data, l) - data += l + out[i] = PyUnicode_FromStringAndSize(data, L) + data += L return out @@ -225,11 +207,12 @@ class VLenBytes(Codec): @cython.boundscheck(False) def encode(self, buf): cdef: - Py_ssize_t i, l, n_items, data_length, total_length + Py_ssize_t i, L, n_items, data_length object[:] values + object[:] normed_values int[:] lengths - char* encv - object b + object o + bytes b bytearray out char* data @@ -240,37 +223,36 @@ class VLenBytes(Codec): n_items = values.shape[0] # setup intermediates + normed_values = np.empty(n_items, dtype=object) lengths = np.empty(n_items, dtype=np.intc) # first iteration to find lengths - data_length = 0 + data_length = HEADER_LENGTH for i in range(n_items): - b = values[i] - if b is None or b == 0: # treat these as missing value, normalize - b = b'' - elif not PyBytes_Check(b): - raise TypeError('expected byte string, found %r' % b) - l = PyBytes_GET_SIZE(b) - data_length += l + HEADER_LENGTH - lengths[i] = l + o = values[i] + # replace missing value and coerce to typed data + b = b"" if o is None or o == 0 else o + normed_values[i] = b + L = len(b) + data_length += HEADER_LENGTH + L + lengths[i] = L # setup output - total_length = HEADER_LENGTH + data_length - out = PyByteArray_FromStringAndSize(NULL, total_length) + out = PyByteArray_FromStringAndSize(NULL, data_length) # write header - data = PyByteArray_AS_STRING(out) + data = out store_le32(data, n_items) # second iteration, store data data += HEADER_LENGTH for i in range(n_items): - l = lengths[i] - store_le32(data, l) + L = lengths[i] + store_le32(data, L) data += HEADER_LENGTH - encv = PyBytes_AS_STRING(values[i]) - memcpy(data, encv, l) - data += l + b = normed_values[i] + memcpy(data, b, L) + data += L return out @@ -282,16 +264,14 @@ class VLenBytes(Codec): const Py_buffer* buf_pb const char* data const char* data_end - Py_ssize_t i, l, n_items, data_length + Py_ssize_t i, L, n_items, data_length # obtain memoryview buf = ensure_contiguous_ndarray(buf) - buf_mv = memoryview(buf) + buf_mv = ensure_continguous_memoryview(buf) buf_pb = PyMemoryView_GET_BUFFER(buf_mv) # sanity checks - if not PyBuffer_IsContiguous(buf_pb, b'A'): - raise BufferError("`buf` must contain contiguous memory") if buf_pb.len < HEADER_LENGTH: raise ValueError('corrupt buffer, missing or truncated header') @@ -315,12 +295,12 @@ class VLenBytes(Codec): for i in range(n_items): if data + HEADER_LENGTH > data_end: raise ValueError('corrupt buffer, data seem truncated') - l = load_le32(data) + L = load_le32(data) data += HEADER_LENGTH - if data + l > data_end: + if data + L > data_end: raise ValueError('corrupt buffer, data seem truncated') - out[i] = PyBytes_FromStringAndSize(data, l) - data += l + out[i] = PyBytes_FromStringAndSize(data, L) + data += L return out @@ -369,17 +349,16 @@ class VLenArray(Codec): @cython.boundscheck(False) def encode(self, buf): cdef: - Py_ssize_t i, l, n_items, data_length, total_length + Py_ssize_t i, L, n_items, data_length object[:] values object[:] normed_values int[:] lengths - const char* encv bytes b bytearray out char* data memoryview value_mv const Py_buffer* value_pb - object v + object o # normalise input values = np.asarray(buf, dtype=object).reshape(-1, order='A') @@ -392,41 +371,41 @@ class VLenArray(Codec): lengths = np.empty(n_items, dtype=np.intc) # first iteration to convert to bytes - data_length = 0 + data_length = HEADER_LENGTH for i in range(n_items): - v = values[i] - if v is None: - v = np.array([], dtype=self.dtype) - else: - v = np.ascontiguousarray(v, self.dtype) - if v.ndim != 1: - raise ValueError('only 1-dimensional arrays are supported') - l = v.nbytes - normed_values[i] = v - data_length += l + HEADER_LENGTH - lengths[i] = l + o = values[i] + # replace missing value and coerce to typed data + value_mv = ensure_continguous_memoryview( + np.array([], dtype=self.dtype) if o is None + else np.ascontiguousarray(o, self.dtype) + ) + value_pb = PyMemoryView_GET_BUFFER(value_mv) + if value_pb.ndim != 1: + raise ValueError("only 1-dimensional arrays are supported") + L = value_pb.len + normed_values[i] = value_mv + data_length += HEADER_LENGTH + L + lengths[i] = L # setup output - total_length = HEADER_LENGTH + data_length - out = PyByteArray_FromStringAndSize(NULL, total_length) + out = PyByteArray_FromStringAndSize(NULL, data_length) # write header - data = PyByteArray_AS_STRING(out) + data = out store_le32(data, n_items) # second iteration, store data data += HEADER_LENGTH for i in range(n_items): - l = lengths[i] - store_le32(data, l) + L = lengths[i] + store_le32(data, L) data += HEADER_LENGTH - value_mv = ensure_continguous_memoryview(normed_values[i]) + value_mv = normed_values[i] value_pb = PyMemoryView_GET_BUFFER(value_mv) - encv = value_pb.buf - memcpy(data, encv, l) - data += l + memcpy(data, value_pb.buf, L) + data += L return out @@ -441,16 +420,14 @@ class VLenArray(Codec): object v memoryview v_mv Py_buffer* v_pb - Py_ssize_t i, l, n_items, data_length + Py_ssize_t i, L, n_items, data_length # obtain memoryview buf = ensure_contiguous_ndarray(buf) - buf_mv = memoryview(buf) + buf_mv = ensure_continguous_memoryview(buf) buf_pb = PyMemoryView_GET_BUFFER(buf_mv) # sanity checks - if not PyBuffer_IsContiguous(buf_pb, b'A'): - raise BufferError("`buf` must contain contiguous memory") if buf_pb.len < HEADER_LENGTH: raise ValueError('corrupt buffer, missing or truncated header') @@ -474,18 +451,18 @@ class VLenArray(Codec): for i in range(n_items): if data + HEADER_LENGTH > data_end: raise ValueError('corrupt buffer, data seem truncated') - l = load_le32(data) + L = load_le32(data) data += HEADER_LENGTH - if data + l > data_end: + if data + L > data_end: raise ValueError('corrupt buffer, data seem truncated') # Create & fill array value - v = np.empty((l,), dtype="uint8").view(self.dtype) + v = np.empty((L,), dtype="uint8").view(self.dtype) v_mv = memoryview(v) v_pb = PyMemoryView_GET_BUFFER(v_mv) - memcpy(v_pb.buf, data, l) + memcpy(v_pb.buf, data, L) out[i] = v - data += l + data += L return out