From 07571e548d85ce02f144df4ec5ee27d176e2cc13 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Wed, 9 Apr 2025 15:45:40 -0700 Subject: [PATCH 01/11] Re-enable `VLenBytes` round-trip `None` test --- numcodecs/tests/test_vlen_bytes.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/numcodecs/tests/test_vlen_bytes.py b/numcodecs/tests/test_vlen_bytes.py index 467c9a85..3546dbaa 100644 --- a/numcodecs/tests/test_vlen_bytes.py +++ b/numcodecs/tests/test_vlen_bytes.py @@ -1,4 +1,3 @@ -import sys import unittest import numpy as np @@ -85,9 +84,6 @@ def test_decode_errors(): codec.decode(enc, out=np.zeros(10, dtype='i4')) -# TODO: fix this test on GitHub actions somehow... -# See https://github.com/zarr-developers/numcodecs/issues/683 -@pytest.mark.skipif(sys.platform == "darwin", reason="Test is failing on macOS on GitHub actions.") def test_encode_none(): a = np.array([b'foo', None, b'bar'], dtype=object) codec = VLenBytes() From b97fc62d50f56442d95e0d9a1f20ed872e931d21 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Wed, 9 Apr 2025 16:52:18 -0700 Subject: [PATCH 02/11] Store normalized values in `VLenBytes.encode` `VLenBytes.encode` normalizes some values during its first pass. However, it was not actually keeping the normalized values. So when it later comes time to get properties from these items, the code assumes they are of the right type, yet it is still grabbing the original unnormalized values. Fix this by storing the normalized values and using them for the rest of the processing. 
--- numcodecs/vlen.pyx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/numcodecs/vlen.pyx b/numcodecs/vlen.pyx index ba8d1622..2f1f39d8 100644 --- a/numcodecs/vlen.pyx +++ b/numcodecs/vlen.pyx @@ -227,6 +227,7 @@ class VLenBytes(Codec): cdef: Py_ssize_t i, l, n_items, data_length, total_length object[:] values + object[:] normed_values int[:] lengths char* encv object b @@ -240,6 +241,7 @@ class VLenBytes(Codec): n_items = values.shape[0] # setup intermediates + normed_values = np.empty(n_items, dtype=object) lengths = np.empty(n_items, dtype=np.intc) # first iteration to find lengths @@ -250,6 +252,7 @@ class VLenBytes(Codec): b = b'' elif not PyBytes_Check(b): raise TypeError('expected byte string, found %r' % b) + normed_values[i] = b l = PyBytes_GET_SIZE(b) data_length += l + HEADER_LENGTH lengths[i] = l @@ -268,7 +271,7 @@ class VLenBytes(Codec): l = lengths[i] store_le32(data, l) data += HEADER_LENGTH - encv = PyBytes_AS_STRING(values[i]) + encv = PyBytes_AS_STRING(normed_values[i]) memcpy(data, encv, l) data += l From e7b6b4b5d3d5d24ce5b9e811729cb6f9ccc41866 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Wed, 9 Apr 2025 17:15:16 -0700 Subject: [PATCH 03/11] Use double quotes in encoding normalization steps --- numcodecs/vlen.pyx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/numcodecs/vlen.pyx b/numcodecs/vlen.pyx index 2f1f39d8..edba035f 100644 --- a/numcodecs/vlen.pyx +++ b/numcodecs/vlen.pyx @@ -114,9 +114,9 @@ class VLenUTF8(Codec): for i in range(n_items): u = input_values[i] if u is None or u == 0: # treat these as missing value, normalize - u = '' + u = "" elif not PyUnicode_Check(u): - raise TypeError('expected unicode string, found %r' % u) + raise TypeError("expected unicode string, found %r" % u) b = PyUnicode_AsUTF8String(u) l = PyBytes_GET_SIZE(b) encoded_values[i] = b @@ -249,9 +249,9 @@ class VLenBytes(Codec): for i in range(n_items): b = values[i] if b is None or b == 0: # treat these as 
missing value, normalize - b = b'' + b = b"" elif not PyBytes_Check(b): - raise TypeError('expected byte string, found %r' % b) + raise TypeError("expected byte string, found %r" % b) normed_values[i] = b l = PyBytes_GET_SIZE(b) data_length += l + HEADER_LENGTH @@ -403,7 +403,7 @@ class VLenArray(Codec): else: v = np.ascontiguousarray(v, self.dtype) if v.ndim != 1: - raise ValueError('only 1-dimensional arrays are supported') + raise ValueError("only 1-dimensional arrays are supported") l = v.nbytes normed_values[i] = v data_length += l + HEADER_LENGTH From 9bcd15c5fd054c43d1aec10bf79a9ac9d3b7ff81 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Wed, 9 Apr 2025 17:15:17 -0700 Subject: [PATCH 04/11] Use `ensure_contiguous_memoryview` more in `vlen` --- numcodecs/vlen.pyx | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/numcodecs/vlen.pyx b/numcodecs/vlen.pyx index edba035f..f7ee2f5b 100644 --- a/numcodecs/vlen.pyx +++ b/numcodecs/vlen.pyx @@ -10,7 +10,6 @@ cimport cython from libc.stdint cimport uint8_t, uint32_t from libc.string cimport memcpy -from cpython.buffer cimport PyBuffer_IsContiguous from cpython.bytearray cimport ( PyByteArray_AS_STRING, PyByteArray_FromStringAndSize, @@ -155,12 +154,10 @@ class VLenUTF8(Codec): # obtain memoryview buf = ensure_contiguous_ndarray(buf) - buf_mv = memoryview(buf) + buf_mv = ensure_continguous_memoryview(buf) buf_pb = PyMemoryView_GET_BUFFER(buf_mv) # sanity checks - if not PyBuffer_IsContiguous(buf_pb, b'A'): - raise BufferError("`buf` must contain contiguous memory") if buf_pb.len < HEADER_LENGTH: raise ValueError('corrupt buffer, missing or truncated header') @@ -289,12 +286,10 @@ class VLenBytes(Codec): # obtain memoryview buf = ensure_contiguous_ndarray(buf) - buf_mv = memoryview(buf) + buf_mv = ensure_continguous_memoryview(buf) buf_pb = PyMemoryView_GET_BUFFER(buf_mv) # sanity checks - if not PyBuffer_IsContiguous(buf_pb, b'A'): - raise BufferError("`buf` must contain contiguous memory") 
if buf_pb.len < HEADER_LENGTH: raise ValueError('corrupt buffer, missing or truncated header') @@ -448,12 +443,10 @@ class VLenArray(Codec): # obtain memoryview buf = ensure_contiguous_ndarray(buf) - buf_mv = memoryview(buf) + buf_mv = ensure_continguous_memoryview(buf) buf_pb = PyMemoryView_GET_BUFFER(buf_mv) # sanity checks - if not PyBuffer_IsContiguous(buf_pb, b'A'): - raise BufferError("`buf` must contain contiguous memory") if buf_pb.len < HEADER_LENGTH: raise ValueError('corrupt buffer, missing or truncated header') From 8e24cce1315253b176609abace0c4884a5a4cd67 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Wed, 9 Apr 2025 17:15:17 -0700 Subject: [PATCH 05/11] Assign normalized values to encode to typed vars Drop our own type checking in favor of assigning to Cython typed-variables. This should do the same kind of type checking, but may be more robust than what we are doing. --- numcodecs/vlen.pyx | 47 +++++++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/numcodecs/vlen.pyx b/numcodecs/vlen.pyx index f7ee2f5b..3995ba21 100644 --- a/numcodecs/vlen.pyx +++ b/numcodecs/vlen.pyx @@ -17,13 +17,11 @@ from cpython.bytearray cimport ( from cpython.bytes cimport ( PyBytes_AS_STRING, PyBytes_GET_SIZE, - PyBytes_Check, PyBytes_FromStringAndSize, ) from cpython.memoryview cimport PyMemoryView_GET_BUFFER from cpython.unicode cimport ( PyUnicode_AsUTF8String, - PyUnicode_Check, PyUnicode_FromStringAndSize, ) @@ -96,7 +94,8 @@ class VLenUTF8(Codec): bytes b bytearray out char* data - object u + object o + unicode u # normalise input input_values = np.asarray(buf, dtype=object).reshape(-1, order='A') @@ -111,11 +110,11 @@ class VLenUTF8(Codec): # first iteration to convert to bytes data_length = 0 for i in range(n_items): - u = input_values[i] - if u is None or u == 0: # treat these as missing value, normalize + o = input_values[i] + if o is None or o == 0: # treat these as missing value, normalize u = "" - elif not 
PyUnicode_Check(u): - raise TypeError("expected unicode string, found %r" % u) + else: + u = o b = PyUnicode_AsUTF8String(u) l = PyBytes_GET_SIZE(b) encoded_values[i] = b @@ -227,7 +226,8 @@ class VLenBytes(Codec): object[:] normed_values int[:] lengths char* encv - object b + object o + bytes b bytearray out char* data @@ -244,11 +244,11 @@ class VLenBytes(Codec): # first iteration to find lengths data_length = 0 for i in range(n_items): - b = values[i] - if b is None or b == 0: # treat these as missing value, normalize + o = values[i] + if o is None or o == 0: # treat these as missing value, normalize b = b"" - elif not PyBytes_Check(b): - raise TypeError("expected byte string, found %r" % b) + else: + b = o normed_values[i] = b l = PyBytes_GET_SIZE(b) data_length += l + HEADER_LENGTH @@ -377,7 +377,7 @@ class VLenArray(Codec): char* data memoryview value_mv const Py_buffer* value_pb - object v + object o # normalise input values = np.asarray(buf, dtype=object).reshape(-1, order='A') @@ -392,15 +392,20 @@ class VLenArray(Codec): # first iteration to convert to bytes data_length = 0 for i in range(n_items): - v = values[i] - if v is None: - v = np.array([], dtype=self.dtype) + o = values[i] + if o is None: + value_mv = ensure_continguous_memoryview( + np.array([], dtype=self.dtype) + ) else: - v = np.ascontiguousarray(v, self.dtype) - if v.ndim != 1: + value_mv = ensure_continguous_memoryview( + np.ascontiguousarray(o, self.dtype) + ) + value_pb = PyMemoryView_GET_BUFFER(value_mv) + if value_pb.ndim != 1: raise ValueError("only 1-dimensional arrays are supported") - l = v.nbytes - normed_values[i] = v + l = value_pb.len + normed_values[i] = value_mv data_length += l + HEADER_LENGTH lengths[i] = l @@ -419,7 +424,7 @@ class VLenArray(Codec): store_le32(data, l) data += HEADER_LENGTH - value_mv = ensure_continguous_memoryview(normed_values[i]) + value_mv = normed_values[i] value_pb = PyMemoryView_GET_BUFFER(value_mv) encv = value_pb.buf From 
d6085a8ceab593af48519a4074600621ed96de65 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Wed, 9 Apr 2025 17:15:18 -0700 Subject: [PATCH 06/11] Use ternary expression to normalize values Simplify normalization code using the ternary expression. --- numcodecs/vlen.pyx | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/numcodecs/vlen.pyx b/numcodecs/vlen.pyx index 3995ba21..c9a3b94a 100644 --- a/numcodecs/vlen.pyx +++ b/numcodecs/vlen.pyx @@ -111,10 +111,8 @@ class VLenUTF8(Codec): data_length = 0 for i in range(n_items): o = input_values[i] - if o is None or o == 0: # treat these as missing value, normalize - u = "" - else: - u = o + # replace missing value and coerce to typed data + u = "" if o is None or o == 0 else o b = PyUnicode_AsUTF8String(u) l = PyBytes_GET_SIZE(b) encoded_values[i] = b @@ -245,10 +243,8 @@ class VLenBytes(Codec): data_length = 0 for i in range(n_items): o = values[i] - if o is None or o == 0: # treat these as missing value, normalize - b = b"" - else: - b = o + # replace missing value and coerce to typed data + b = b"" if o is None or o == 0 else o normed_values[i] = b l = PyBytes_GET_SIZE(b) data_length += l + HEADER_LENGTH @@ -393,14 +389,11 @@ class VLenArray(Codec): data_length = 0 for i in range(n_items): o = values[i] - if o is None: - value_mv = ensure_continguous_memoryview( - np.array([], dtype=self.dtype) - ) - else: - value_mv = ensure_continguous_memoryview( - np.ascontiguousarray(o, self.dtype) - ) + # replace missing value and coerce to typed data + value_mv = ensure_continguous_memoryview( + np.array([], dtype=self.dtype) if o is None + else np.ascontiguousarray(o, self.dtype) + ) value_pb = PyMemoryView_GET_BUFFER(value_mv) if value_pb.ndim != 1: raise ValueError("only 1-dimensional arrays are supported") From c05d33bab60723a64485f9a85861792503483dfe Mon Sep 17 00:00:00 2001 From: jakirkham Date: Wed, 9 Apr 2025 17:15:18 -0700 Subject: [PATCH 07/11] Use Cython's tight binding in `vlen` 
Instead of making so many explicit calls to CPython C-API functions, rely on the Cython types of variables and Cython's tight binding to pick the right function to apply in each instance. This makes the code more readable to the typical Python developer. It also makes it less likely that an issue like the one encountered in this bug report would sneak in. --- numcodecs/vlen.pyx | 34 +++++++++++++--------------------- 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/numcodecs/vlen.pyx b/numcodecs/vlen.pyx index c9a3b94a..1b71f364 100644 --- a/numcodecs/vlen.pyx +++ b/numcodecs/vlen.pyx @@ -10,20 +10,10 @@ cimport cython from libc.stdint cimport uint8_t, uint32_t from libc.string cimport memcpy -from cpython.bytearray cimport ( - PyByteArray_AS_STRING, - PyByteArray_FromStringAndSize, -) -from cpython.bytes cimport ( - PyBytes_AS_STRING, - PyBytes_GET_SIZE, - PyBytes_FromStringAndSize, -) +from cpython.bytearray cimport PyByteArray_FromStringAndSize +from cpython.bytes cimport PyBytes_FromStringAndSize from cpython.memoryview cimport PyMemoryView_GET_BUFFER -from cpython.unicode cimport ( - PyUnicode_AsUTF8String, - PyUnicode_FromStringAndSize, -) +from cpython.unicode cimport PyUnicode_FromStringAndSize from numpy cimport ndarray @@ -113,8 +103,8 @@ class VLenUTF8(Codec): o = input_values[i] # replace missing value and coerce to typed data u = "" if o is None or o == 0 else o - b = PyUnicode_AsUTF8String(u) - l = PyBytes_GET_SIZE(b) + b = u.encode("utf-8") + l = len(b) encoded_values[i] = b data_length += l + HEADER_LENGTH encoded_lengths[i] = l @@ -124,7 +114,7 @@ class VLenUTF8(Codec): out = PyByteArray_FromStringAndSize(NULL, total_length) # write header - data = PyByteArray_AS_STRING(out) + data = out store_le32(data, n_items) # second iteration, store data @@ -133,7 +123,8 @@ class VLenUTF8(Codec): l = encoded_lengths[i] store_le32(data, l) data += HEADER_LENGTH - encv = PyBytes_AS_STRING(encoded_values[i]) + b = encoded_values[i] + encv = b memcpy(data, encv, l) data 
+= l @@ -246,7 +237,7 @@ class VLenBytes(Codec): # replace missing value and coerce to typed data b = b"" if o is None or o == 0 else o normed_values[i] = b - l = PyBytes_GET_SIZE(b) + l = len(b) data_length += l + HEADER_LENGTH lengths[i] = l @@ -255,7 +246,7 @@ class VLenBytes(Codec): out = PyByteArray_FromStringAndSize(NULL, total_length) # write header - data = PyByteArray_AS_STRING(out) + data = out store_le32(data, n_items) # second iteration, store data @@ -264,7 +255,8 @@ class VLenBytes(Codec): l = lengths[i] store_le32(data, l) data += HEADER_LENGTH - encv = PyBytes_AS_STRING(normed_values[i]) + b = normed_values[i] + encv = b memcpy(data, encv, l) data += l @@ -407,7 +399,7 @@ class VLenArray(Codec): out = PyByteArray_FromStringAndSize(NULL, total_length) # write header - data = PyByteArray_AS_STRING(out) + data = out store_le32(data, n_items) # second iteration, store data From 5b550aec24b467f0d930b2fc992a906cdf1954c0 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Wed, 9 Apr 2025 17:28:50 -0700 Subject: [PATCH 08/11] Use `const char*` with binary encoded values When encoding values, there is no need to modify the encoded data when writing it out. So mark the pointers used to reference the encoded data as `const`. While there is nothing happening here that should cause issues, this will help further safeguard developers making changes here and clarify the intent. 
--- numcodecs/vlen.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numcodecs/vlen.pyx b/numcodecs/vlen.pyx index 1b71f364..c8aa7b17 100644 --- a/numcodecs/vlen.pyx +++ b/numcodecs/vlen.pyx @@ -80,7 +80,7 @@ class VLenUTF8(Codec): ndarray[object, ndim=1] input_values object[:] encoded_values int[:] encoded_lengths - char* encv + const char* encv bytes b bytearray out char* data @@ -214,7 +214,7 @@ class VLenBytes(Codec): object[:] values object[:] normed_values int[:] lengths - char* encv + const char* encv object o bytes b bytearray out From 739dd8850ee25596348c6a73f946ee873da278dc Mon Sep 17 00:00:00 2001 From: jakirkham Date: Wed, 9 Apr 2025 17:36:35 -0700 Subject: [PATCH 09/11] Inline pointer usage in `memcpy` calls --- numcodecs/vlen.pyx | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/numcodecs/vlen.pyx b/numcodecs/vlen.pyx index c8aa7b17..6c04260c 100644 --- a/numcodecs/vlen.pyx +++ b/numcodecs/vlen.pyx @@ -80,7 +80,6 @@ class VLenUTF8(Codec): ndarray[object, ndim=1] input_values object[:] encoded_values int[:] encoded_lengths - const char* encv bytes b bytearray out char* data @@ -124,8 +123,7 @@ class VLenUTF8(Codec): store_le32(data, l) data += HEADER_LENGTH b = encoded_values[i] - encv = b - memcpy(data, encv, l) + memcpy(data, b, l) data += l return out @@ -214,7 +212,6 @@ class VLenBytes(Codec): object[:] values object[:] normed_values int[:] lengths - const char* encv object o bytes b bytearray out @@ -256,8 +253,7 @@ class VLenBytes(Codec): store_le32(data, l) data += HEADER_LENGTH b = normed_values[i] - encv = b - memcpy(data, encv, l) + memcpy(data, b, l) data += l return out @@ -359,7 +355,6 @@ class VLenArray(Codec): object[:] values object[:] normed_values int[:] lengths - const char* encv bytes b bytearray out char* data @@ -411,9 +406,8 @@ class VLenArray(Codec): value_mv = normed_values[i] value_pb = PyMemoryView_GET_BUFFER(value_mv) - encv = value_pb.buf - memcpy(data, encv, l) + 
memcpy(data, value_pb.buf, l) data += l return out From a37b8f9024576d418d0acdfde148174e6f33b483 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Wed, 9 Apr 2025 20:28:02 -0700 Subject: [PATCH 10/11] Consolidate `total_length` into `data_length` These variables are nearly identical and only the total length is used. As `data` is used elsewhere, change `data_length` to capture the value of `total_length` and just use `data_length` throughout. --- numcodecs/vlen.pyx | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/numcodecs/vlen.pyx b/numcodecs/vlen.pyx index 6c04260c..36846f6e 100644 --- a/numcodecs/vlen.pyx +++ b/numcodecs/vlen.pyx @@ -76,7 +76,7 @@ class VLenUTF8(Codec): @cython.boundscheck(False) def encode(self, buf): cdef: - Py_ssize_t i, l, n_items, data_length, total_length + Py_ssize_t i, l, n_items, data_length ndarray[object, ndim=1] input_values object[:] encoded_values int[:] encoded_lengths @@ -97,7 +97,7 @@ class VLenUTF8(Codec): encoded_lengths = np.empty(n_items, dtype=np.intc) # first iteration to convert to bytes - data_length = 0 + data_length = HEADER_LENGTH for i in range(n_items): o = input_values[i] # replace missing value and coerce to typed data @@ -109,8 +109,7 @@ class VLenUTF8(Codec): encoded_lengths[i] = l # setup output - total_length = HEADER_LENGTH + data_length - out = PyByteArray_FromStringAndSize(NULL, total_length) + out = PyByteArray_FromStringAndSize(NULL, data_length) # write header data = out @@ -208,7 +207,7 @@ class VLenBytes(Codec): @cython.boundscheck(False) def encode(self, buf): cdef: - Py_ssize_t i, l, n_items, data_length, total_length + Py_ssize_t i, l, n_items, data_length object[:] values object[:] normed_values int[:] lengths @@ -228,7 +227,7 @@ class VLenBytes(Codec): lengths = np.empty(n_items, dtype=np.intc) # first iteration to find lengths - data_length = 0 + data_length = HEADER_LENGTH for i in range(n_items): o = values[i] # replace missing value and coerce to typed data 
@@ -239,8 +238,7 @@ class VLenBytes(Codec): lengths[i] = l # setup output - total_length = HEADER_LENGTH + data_length - out = PyByteArray_FromStringAndSize(NULL, total_length) + out = PyByteArray_FromStringAndSize(NULL, data_length) # write header data = out @@ -351,7 +349,7 @@ class VLenArray(Codec): @cython.boundscheck(False) def encode(self, buf): cdef: - Py_ssize_t i, l, n_items, data_length, total_length + Py_ssize_t i, l, n_items, data_length object[:] values object[:] normed_values int[:] lengths @@ -373,7 +371,7 @@ class VLenArray(Codec): lengths = np.empty(n_items, dtype=np.intc) # first iteration to convert to bytes - data_length = 0 + data_length = HEADER_LENGTH for i in range(n_items): o = values[i] # replace missing value and coerce to typed data @@ -390,8 +388,7 @@ class VLenArray(Codec): lengths[i] = l # setup output - total_length = HEADER_LENGTH + data_length - out = PyByteArray_FromStringAndSize(NULL, total_length) + out = PyByteArray_FromStringAndSize(NULL, data_length) # write header data = out From 644f147c200b921caed2240f2868ee076acd6089 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Wed, 9 Apr 2025 20:38:35 -0700 Subject: [PATCH 11/11] Use upper case `L` to avoid confusion --- numcodecs/vlen.pyx | 80 +++++++++++++++++++++++----------------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/numcodecs/vlen.pyx b/numcodecs/vlen.pyx index 36846f6e..14530ecf 100644 --- a/numcodecs/vlen.pyx +++ b/numcodecs/vlen.pyx @@ -76,7 +76,7 @@ class VLenUTF8(Codec): @cython.boundscheck(False) def encode(self, buf): cdef: - Py_ssize_t i, l, n_items, data_length + Py_ssize_t i, L, n_items, data_length ndarray[object, ndim=1] input_values object[:] encoded_values int[:] encoded_lengths @@ -103,10 +103,10 @@ class VLenUTF8(Codec): # replace missing value and coerce to typed data u = "" if o is None or o == 0 else o b = u.encode("utf-8") - l = len(b) + L = len(b) encoded_values[i] = b - data_length += l + HEADER_LENGTH - encoded_lengths[i] = l 
+ data_length += L + HEADER_LENGTH + encoded_lengths[i] = L # setup output out = PyByteArray_FromStringAndSize(NULL, data_length) @@ -118,12 +118,12 @@ class VLenUTF8(Codec): # second iteration, store data data += HEADER_LENGTH for i in range(n_items): - l = encoded_lengths[i] - store_le32(data, l) + L = encoded_lengths[i] + store_le32(data, L) data += HEADER_LENGTH b = encoded_values[i] - memcpy(data, b, l) - data += l + memcpy(data, b, L) + data += L return out @@ -135,7 +135,7 @@ class VLenUTF8(Codec): const Py_buffer* buf_pb const char* data const char* data_end - Py_ssize_t i, l, n_items, data_length + Py_ssize_t i, L, n_items, data_length # obtain memoryview buf = ensure_contiguous_ndarray(buf) @@ -166,12 +166,12 @@ class VLenUTF8(Codec): for i in range(n_items): if data + HEADER_LENGTH > data_end: raise ValueError('corrupt buffer, data seem truncated') - l = load_le32(data) + L = load_le32(data) data += HEADER_LENGTH - if data + l > data_end: + if data + L > data_end: raise ValueError('corrupt buffer, data seem truncated') - out[i] = PyUnicode_FromStringAndSize(data, l) - data += l + out[i] = PyUnicode_FromStringAndSize(data, L) + data += L return out @@ -207,7 +207,7 @@ class VLenBytes(Codec): @cython.boundscheck(False) def encode(self, buf): cdef: - Py_ssize_t i, l, n_items, data_length + Py_ssize_t i, L, n_items, data_length object[:] values object[:] normed_values int[:] lengths @@ -233,9 +233,9 @@ class VLenBytes(Codec): # replace missing value and coerce to typed data b = b"" if o is None or o == 0 else o normed_values[i] = b - l = len(b) - data_length += l + HEADER_LENGTH - lengths[i] = l + L = len(b) + data_length += HEADER_LENGTH + L + lengths[i] = L # setup output out = PyByteArray_FromStringAndSize(NULL, data_length) @@ -247,12 +247,12 @@ class VLenBytes(Codec): # second iteration, store data data += HEADER_LENGTH for i in range(n_items): - l = lengths[i] - store_le32(data, l) + L = lengths[i] + store_le32(data, L) data += HEADER_LENGTH b = 
normed_values[i] - memcpy(data, b, l) - data += l + memcpy(data, b, L) + data += L return out @@ -264,7 +264,7 @@ class VLenBytes(Codec): const Py_buffer* buf_pb const char* data const char* data_end - Py_ssize_t i, l, n_items, data_length + Py_ssize_t i, L, n_items, data_length # obtain memoryview buf = ensure_contiguous_ndarray(buf) @@ -295,12 +295,12 @@ class VLenBytes(Codec): for i in range(n_items): if data + HEADER_LENGTH > data_end: raise ValueError('corrupt buffer, data seem truncated') - l = load_le32(data) + L = load_le32(data) data += HEADER_LENGTH - if data + l > data_end: + if data + L > data_end: raise ValueError('corrupt buffer, data seem truncated') - out[i] = PyBytes_FromStringAndSize(data, l) - data += l + out[i] = PyBytes_FromStringAndSize(data, L) + data += L return out @@ -349,7 +349,7 @@ class VLenArray(Codec): @cython.boundscheck(False) def encode(self, buf): cdef: - Py_ssize_t i, l, n_items, data_length + Py_ssize_t i, L, n_items, data_length object[:] values object[:] normed_values int[:] lengths @@ -382,10 +382,10 @@ class VLenArray(Codec): value_pb = PyMemoryView_GET_BUFFER(value_mv) if value_pb.ndim != 1: raise ValueError("only 1-dimensional arrays are supported") - l = value_pb.len + L = value_pb.len normed_values[i] = value_mv - data_length += l + HEADER_LENGTH - lengths[i] = l + data_length += HEADER_LENGTH + L + lengths[i] = L # setup output out = PyByteArray_FromStringAndSize(NULL, data_length) @@ -397,15 +397,15 @@ class VLenArray(Codec): # second iteration, store data data += HEADER_LENGTH for i in range(n_items): - l = lengths[i] - store_le32(data, l) + L = lengths[i] + store_le32(data, L) data += HEADER_LENGTH value_mv = normed_values[i] value_pb = PyMemoryView_GET_BUFFER(value_mv) - memcpy(data, value_pb.buf, l) - data += l + memcpy(data, value_pb.buf, L) + data += L return out @@ -420,7 +420,7 @@ class VLenArray(Codec): object v memoryview v_mv Py_buffer* v_pb - Py_ssize_t i, l, n_items, data_length + Py_ssize_t i, L, n_items, 
data_length # obtain memoryview buf = ensure_contiguous_ndarray(buf) @@ -451,18 +451,18 @@ class VLenArray(Codec): for i in range(n_items): if data + HEADER_LENGTH > data_end: raise ValueError('corrupt buffer, data seem truncated') - l = load_le32(data) + L = load_le32(data) data += HEADER_LENGTH - if data + l > data_end: + if data + L > data_end: raise ValueError('corrupt buffer, data seem truncated') # Create & fill array value - v = np.empty((l,), dtype="uint8").view(self.dtype) + v = np.empty((L,), dtype="uint8").view(self.dtype) v_mv = memoryview(v) v_pb = PyMemoryView_GET_BUFFER(v_mv) - memcpy(v_pb.buf, data, l) + memcpy(v_pb.buf, data, L) out[i] = v - data += l + data += L return out