From 07571e548d85ce02f144df4ec5ee27d176e2cc13 Mon Sep 17 00:00:00 2001
From: jakirkham <jakirkham@gmail.com>
Date: Wed, 9 Apr 2025 15:45:40 -0700
Subject: [PATCH 01/11] Re-enable `VLenBytes` round-trip `None` test

---
 numcodecs/tests/test_vlen_bytes.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/numcodecs/tests/test_vlen_bytes.py b/numcodecs/tests/test_vlen_bytes.py
index 467c9a85..3546dbaa 100644
--- a/numcodecs/tests/test_vlen_bytes.py
+++ b/numcodecs/tests/test_vlen_bytes.py
@@ -1,4 +1,3 @@
-import sys
 import unittest
 
 import numpy as np
@@ -85,9 +84,6 @@ def test_decode_errors():
         codec.decode(enc, out=np.zeros(10, dtype='i4'))
 
 
-# TODO: fix this test on GitHub actions somehow...
-# See https://github.com/zarr-developers/numcodecs/issues/683
-@pytest.mark.skipif(sys.platform == "darwin", reason="Test is failing on macOS on GitHub actions.")
 def test_encode_none():
     a = np.array([b'foo', None, b'bar'], dtype=object)
     codec = VLenBytes()

From b97fc62d50f56442d95e0d9a1f20ed872e931d21 Mon Sep 17 00:00:00 2001
From: jakirkham <jakirkham@gmail.com>
Date: Wed, 9 Apr 2025 16:52:18 -0700
Subject: [PATCH 02/11] Store normalized values in `VLenBytes.encode`

`VLenBytes.encode` normalizes missing values (`None` or `0`) to `b""`,
but it was not keeping the normalized values. When the second loop
later reads each item's data, it assumes the item is a `bytes` object,
yet it was still fetching the original, unnormalized value from
`values`. Fix this by storing the normalized values in `normed_values`
and reading from that array in the second loop.
---
 numcodecs/vlen.pyx | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/numcodecs/vlen.pyx b/numcodecs/vlen.pyx
index ba8d1622..2f1f39d8 100644
--- a/numcodecs/vlen.pyx
+++ b/numcodecs/vlen.pyx
@@ -227,6 +227,7 @@ class VLenBytes(Codec):
         cdef:
             Py_ssize_t i, l, n_items, data_length, total_length
             object[:] values
+            object[:] normed_values
             int[:] lengths
             char* encv
             object b
@@ -240,6 +241,7 @@ class VLenBytes(Codec):
         n_items = values.shape[0]
 
         # setup intermediates
+        normed_values = np.empty(n_items, dtype=object)
         lengths = np.empty(n_items, dtype=np.intc)
 
         # first iteration to find lengths
@@ -250,6 +252,7 @@ class VLenBytes(Codec):
                 b = b''
             elif not PyBytes_Check(b):
                 raise TypeError('expected byte string, found %r' % b)
+            normed_values[i] = b
             l = PyBytes_GET_SIZE(b)
             data_length += l + HEADER_LENGTH
             lengths[i] = l
@@ -268,7 +271,7 @@ class VLenBytes(Codec):
             l = lengths[i]
             store_le32(<uint8_t*>data, l)
             data += HEADER_LENGTH
-            encv = PyBytes_AS_STRING(values[i])
+            encv = PyBytes_AS_STRING(normed_values[i])
             memcpy(data, encv, l)
             data += l
 

From e7b6b4b5d3d5d24ce5b9e811729cb6f9ccc41866 Mon Sep 17 00:00:00 2001
From: jakirkham <jakirkham@gmail.com>
Date: Wed, 9 Apr 2025 17:15:16 -0700
Subject: [PATCH 03/11] Use double quotes in encoding normalization steps

---
 numcodecs/vlen.pyx | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/numcodecs/vlen.pyx b/numcodecs/vlen.pyx
index 2f1f39d8..edba035f 100644
--- a/numcodecs/vlen.pyx
+++ b/numcodecs/vlen.pyx
@@ -114,9 +114,9 @@ class VLenUTF8(Codec):
         for i in range(n_items):
             u = input_values[i]
             if u is None or u == 0:  # treat these as missing value, normalize
-                u = ''
+                u = ""
             elif not PyUnicode_Check(u):
-                raise TypeError('expected unicode string, found %r' % u)
+                raise TypeError("expected unicode string, found %r" % u)
             b = PyUnicode_AsUTF8String(u)
             l = PyBytes_GET_SIZE(b)
             encoded_values[i] = b
@@ -249,9 +249,9 @@ class VLenBytes(Codec):
         for i in range(n_items):
             b = values[i]
             if b is None or b == 0:  # treat these as missing value, normalize
-                b = b''
+                b = b""
             elif not PyBytes_Check(b):
-                raise TypeError('expected byte string, found %r' % b)
+                raise TypeError("expected byte string, found %r" % b)
             normed_values[i] = b
             l = PyBytes_GET_SIZE(b)
             data_length += l + HEADER_LENGTH
@@ -403,7 +403,7 @@ class VLenArray(Codec):
             else:
                 v = np.ascontiguousarray(v, self.dtype)
             if v.ndim != 1:
-                raise ValueError('only 1-dimensional arrays are supported')
+                raise ValueError("only 1-dimensional arrays are supported")
             l = v.nbytes
             normed_values[i] = v
             data_length += l + HEADER_LENGTH

From 9bcd15c5fd054c43d1aec10bf79a9ac9d3b7ff81 Mon Sep 17 00:00:00 2001
From: jakirkham <jakirkham@gmail.com>
Date: Wed, 9 Apr 2025 17:15:17 -0700
Subject: [PATCH 04/11] Use `ensure_contiguous_memoryview` more in `vlen`

---
 numcodecs/vlen.pyx | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/numcodecs/vlen.pyx b/numcodecs/vlen.pyx
index edba035f..f7ee2f5b 100644
--- a/numcodecs/vlen.pyx
+++ b/numcodecs/vlen.pyx
@@ -10,7 +10,6 @@ cimport cython
 from libc.stdint cimport uint8_t, uint32_t
 from libc.string cimport memcpy
 
-from cpython.buffer cimport PyBuffer_IsContiguous
 from cpython.bytearray cimport (
     PyByteArray_AS_STRING,
     PyByteArray_FromStringAndSize,
@@ -155,12 +154,10 @@ class VLenUTF8(Codec):
 
         # obtain memoryview
         buf = ensure_contiguous_ndarray(buf)
-        buf_mv = memoryview(buf)
+        buf_mv = ensure_continguous_memoryview(buf)
         buf_pb = PyMemoryView_GET_BUFFER(buf_mv)
 
         # sanity checks
-        if not PyBuffer_IsContiguous(buf_pb, b'A'):
-            raise BufferError("`buf` must contain contiguous memory")
         if buf_pb.len < HEADER_LENGTH:
             raise ValueError('corrupt buffer, missing or truncated header')
 
@@ -289,12 +286,10 @@ class VLenBytes(Codec):
 
         # obtain memoryview
         buf = ensure_contiguous_ndarray(buf)
-        buf_mv = memoryview(buf)
+        buf_mv = ensure_continguous_memoryview(buf)
         buf_pb = PyMemoryView_GET_BUFFER(buf_mv)
 
         # sanity checks
-        if not PyBuffer_IsContiguous(buf_pb, b'A'):
-            raise BufferError("`buf` must contain contiguous memory")
         if buf_pb.len < HEADER_LENGTH:
             raise ValueError('corrupt buffer, missing or truncated header')
 
@@ -448,12 +443,10 @@ class VLenArray(Codec):
 
         # obtain memoryview
         buf = ensure_contiguous_ndarray(buf)
-        buf_mv = memoryview(buf)
+        buf_mv = ensure_continguous_memoryview(buf)
         buf_pb = PyMemoryView_GET_BUFFER(buf_mv)
 
         # sanity checks
-        if not PyBuffer_IsContiguous(buf_pb, b'A'):
-            raise BufferError("`buf` must contain contiguous memory")
         if buf_pb.len < HEADER_LENGTH:
             raise ValueError('corrupt buffer, missing or truncated header')
 

From 8e24cce1315253b176609abace0c4884a5a4cd67 Mon Sep 17 00:00:00 2001
From: jakirkham <jakirkham@gmail.com>
Date: Wed, 9 Apr 2025 17:15:17 -0700
Subject: [PATCH 05/11] Assign normalized values to encode to typed vars

Drop our own type checking in favor of assigning to Cython typed
variables. This should do the same kind of type checking, but may be
more robust than what we are doing.
---
 numcodecs/vlen.pyx | 47 +++++++++++++++++++++++++---------------------
 1 file changed, 26 insertions(+), 21 deletions(-)

diff --git a/numcodecs/vlen.pyx b/numcodecs/vlen.pyx
index f7ee2f5b..3995ba21 100644
--- a/numcodecs/vlen.pyx
+++ b/numcodecs/vlen.pyx
@@ -17,13 +17,11 @@ from cpython.bytearray cimport (
 from cpython.bytes cimport (
     PyBytes_AS_STRING,
     PyBytes_GET_SIZE,
-    PyBytes_Check,
     PyBytes_FromStringAndSize,
 )
 from cpython.memoryview cimport PyMemoryView_GET_BUFFER
 from cpython.unicode cimport (
     PyUnicode_AsUTF8String,
-    PyUnicode_Check,
     PyUnicode_FromStringAndSize,
 )
 
@@ -96,7 +94,8 @@ class VLenUTF8(Codec):
             bytes b
             bytearray out
             char* data
-            object u
+            object o
+            unicode u
 
         # normalise input
         input_values = np.asarray(buf, dtype=object).reshape(-1, order='A')
@@ -111,11 +110,11 @@ class VLenUTF8(Codec):
         # first iteration to convert to bytes
         data_length = 0
         for i in range(n_items):
-            u = input_values[i]
-            if u is None or u == 0:  # treat these as missing value, normalize
+            o = input_values[i]
+            if o is None or o == 0:  # treat these as missing value, normalize
                 u = ""
-            elif not PyUnicode_Check(u):
-                raise TypeError("expected unicode string, found %r" % u)
+            else:
+                u = o
             b = PyUnicode_AsUTF8String(u)
             l = PyBytes_GET_SIZE(b)
             encoded_values[i] = b
@@ -227,7 +226,8 @@ class VLenBytes(Codec):
             object[:] normed_values
             int[:] lengths
             char* encv
-            object b
+            object o
+            bytes b
             bytearray out
             char* data
 
@@ -244,11 +244,11 @@ class VLenBytes(Codec):
         # first iteration to find lengths
         data_length = 0
         for i in range(n_items):
-            b = values[i]
-            if b is None or b == 0:  # treat these as missing value, normalize
+            o = values[i]
+            if o is None or o == 0:  # treat these as missing value, normalize
                 b = b""
-            elif not PyBytes_Check(b):
-                raise TypeError("expected byte string, found %r" % b)
+            else:
+                b = o
             normed_values[i] = b
             l = PyBytes_GET_SIZE(b)
             data_length += l + HEADER_LENGTH
@@ -377,7 +377,7 @@ class VLenArray(Codec):
             char* data
             memoryview value_mv
             const Py_buffer* value_pb
-            object v
+            object o
 
         # normalise input
         values = np.asarray(buf, dtype=object).reshape(-1, order='A')
@@ -392,15 +392,20 @@ class VLenArray(Codec):
         # first iteration to convert to bytes
         data_length = 0
         for i in range(n_items):
-            v = values[i]
-            if v is None:
-                v = np.array([], dtype=self.dtype)
+            o = values[i]
+            if o is None:
+                value_mv = ensure_continguous_memoryview(
+                    np.array([], dtype=self.dtype)
+                )
             else:
-                v = np.ascontiguousarray(v, self.dtype)
-            if v.ndim != 1:
+                value_mv = ensure_continguous_memoryview(
+                    np.ascontiguousarray(o, self.dtype)
+                )
+            value_pb = PyMemoryView_GET_BUFFER(value_mv)
+            if value_pb.ndim != 1:
                 raise ValueError("only 1-dimensional arrays are supported")
-            l = v.nbytes
-            normed_values[i] = v
+            l = value_pb.len
+            normed_values[i] = value_mv
             data_length += l + HEADER_LENGTH
             lengths[i] = l
 
@@ -419,7 +424,7 @@ class VLenArray(Codec):
             store_le32(<uint8_t*>data, l)
             data += HEADER_LENGTH
 
-            value_mv = ensure_continguous_memoryview(normed_values[i])
+            value_mv = normed_values[i]
             value_pb = PyMemoryView_GET_BUFFER(value_mv)
             encv = <const char*>value_pb.buf
 

From d6085a8ceab593af48519a4074600621ed96de65 Mon Sep 17 00:00:00 2001
From: jakirkham <jakirkham@gmail.com>
Date: Wed, 9 Apr 2025 17:15:18 -0700
Subject: [PATCH 06/11] Use ternary expression to normalize values

Simplify normalization code using the ternary expression.
---
 numcodecs/vlen.pyx | 25 +++++++++----------------
 1 file changed, 9 insertions(+), 16 deletions(-)

diff --git a/numcodecs/vlen.pyx b/numcodecs/vlen.pyx
index 3995ba21..c9a3b94a 100644
--- a/numcodecs/vlen.pyx
+++ b/numcodecs/vlen.pyx
@@ -111,10 +111,8 @@ class VLenUTF8(Codec):
         data_length = 0
         for i in range(n_items):
             o = input_values[i]
-            if o is None or o == 0:  # treat these as missing value, normalize
-                u = ""
-            else:
-                u = o
+            # replace missing value and coerce to typed data
+            u = "" if o is None or o == 0 else o
             b = PyUnicode_AsUTF8String(u)
             l = PyBytes_GET_SIZE(b)
             encoded_values[i] = b
@@ -245,10 +243,8 @@ class VLenBytes(Codec):
         data_length = 0
         for i in range(n_items):
             o = values[i]
-            if o is None or o == 0:  # treat these as missing value, normalize
-                b = b""
-            else:
-                b = o
+            # replace missing value and coerce to typed data
+            b = b"" if o is None or o == 0 else o
             normed_values[i] = b
             l = PyBytes_GET_SIZE(b)
             data_length += l + HEADER_LENGTH
@@ -393,14 +389,11 @@ class VLenArray(Codec):
         data_length = 0
         for i in range(n_items):
             o = values[i]
-            if o is None:
-                value_mv = ensure_continguous_memoryview(
-                    np.array([], dtype=self.dtype)
-                )
-            else:
-                value_mv = ensure_continguous_memoryview(
-                    np.ascontiguousarray(o, self.dtype)
-                )
+            # replace missing value and coerce to typed data
+            value_mv = ensure_continguous_memoryview(
+                np.array([], dtype=self.dtype) if o is None
+                else np.ascontiguousarray(o, self.dtype)
+            )
             value_pb = PyMemoryView_GET_BUFFER(value_mv)
             if value_pb.ndim != 1:
                 raise ValueError("only 1-dimensional arrays are supported")

From c05d33bab60723a64485f9a85861792503483dfe Mon Sep 17 00:00:00 2001
From: jakirkham <jakirkham@gmail.com>
Date: Wed, 9 Apr 2025 17:15:18 -0700
Subject: [PATCH 07/11] Use Cython's tight binding in `vlen`

Instead of making so many direct calls to the CPython C API, rely on the
Cython types of the variables and Cython's tight binding to pick the
right function to apply in each instance.

This makes the code more readable to the typical Python developer. It
also makes it less likely that an issue like the one encountered in
this bug report would sneak in.
---
 numcodecs/vlen.pyx | 34 +++++++++++++---------------------
 1 file changed, 13 insertions(+), 21 deletions(-)

diff --git a/numcodecs/vlen.pyx b/numcodecs/vlen.pyx
index c9a3b94a..1b71f364 100644
--- a/numcodecs/vlen.pyx
+++ b/numcodecs/vlen.pyx
@@ -10,20 +10,10 @@ cimport cython
 from libc.stdint cimport uint8_t, uint32_t
 from libc.string cimport memcpy
 
-from cpython.bytearray cimport (
-    PyByteArray_AS_STRING,
-    PyByteArray_FromStringAndSize,
-)
-from cpython.bytes cimport (
-    PyBytes_AS_STRING,
-    PyBytes_GET_SIZE,
-    PyBytes_FromStringAndSize,
-)
+from cpython.bytearray cimport PyByteArray_FromStringAndSize
+from cpython.bytes cimport PyBytes_FromStringAndSize
 from cpython.memoryview cimport PyMemoryView_GET_BUFFER
-from cpython.unicode cimport (
-    PyUnicode_AsUTF8String,
-    PyUnicode_FromStringAndSize,
-)
+from cpython.unicode cimport PyUnicode_FromStringAndSize
 
 from numpy cimport ndarray
 
@@ -113,8 +103,8 @@ class VLenUTF8(Codec):
             o = input_values[i]
             # replace missing value and coerce to typed data
             u = "" if o is None or o == 0 else o
-            b = PyUnicode_AsUTF8String(u)
-            l = PyBytes_GET_SIZE(b)
+            b = u.encode("utf-8")
+            l = len(b)
             encoded_values[i] = b
             data_length += l + HEADER_LENGTH
             encoded_lengths[i] = l
@@ -124,7 +114,7 @@ class VLenUTF8(Codec):
         out = PyByteArray_FromStringAndSize(NULL, total_length)
 
         # write header
-        data = PyByteArray_AS_STRING(out)
+        data = out
         store_le32(<uint8_t*>data, n_items)
 
         # second iteration, store data
@@ -133,7 +123,8 @@ class VLenUTF8(Codec):
             l = encoded_lengths[i]
             store_le32(<uint8_t*>data, l)
             data += HEADER_LENGTH
-            encv = PyBytes_AS_STRING(encoded_values[i])
+            b = encoded_values[i]
+            encv = b
             memcpy(data, encv, l)
             data += l
 
@@ -246,7 +237,7 @@ class VLenBytes(Codec):
             # replace missing value and coerce to typed data
             b = b"" if o is None or o == 0 else o
             normed_values[i] = b
-            l = PyBytes_GET_SIZE(b)
+            l = len(b)
             data_length += l + HEADER_LENGTH
             lengths[i] = l
 
@@ -255,7 +246,7 @@ class VLenBytes(Codec):
         out = PyByteArray_FromStringAndSize(NULL, total_length)
 
         # write header
-        data = PyByteArray_AS_STRING(out)
+        data = out
         store_le32(<uint8_t*>data, n_items)
 
         # second iteration, store data
@@ -264,7 +255,8 @@ class VLenBytes(Codec):
             l = lengths[i]
             store_le32(<uint8_t*>data, l)
             data += HEADER_LENGTH
-            encv = PyBytes_AS_STRING(normed_values[i])
+            b = normed_values[i]
+            encv = b
             memcpy(data, encv, l)
             data += l
 
@@ -407,7 +399,7 @@ class VLenArray(Codec):
         out = PyByteArray_FromStringAndSize(NULL, total_length)
 
         # write header
-        data = PyByteArray_AS_STRING(out)
+        data = out
         store_le32(<uint8_t*>data, n_items)
 
         # second iteration, store data

From 5b550aec24b467f0d930b2fc992a906cdf1954c0 Mon Sep 17 00:00:00 2001
From: jakirkham <jakirkham@gmail.com>
Date: Wed, 9 Apr 2025 17:28:50 -0700
Subject: [PATCH 08/11] Use `const char*` with binary encoded values

When encoding values, there is no need to modify the encoded data when
writing it out. So mark the pointers used to reference the encoded data
as `const`. While there is nothing happening here that should cause
issues, this will help further safeguard developers making changes here
and clarify the intent.
---
 numcodecs/vlen.pyx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/numcodecs/vlen.pyx b/numcodecs/vlen.pyx
index 1b71f364..c8aa7b17 100644
--- a/numcodecs/vlen.pyx
+++ b/numcodecs/vlen.pyx
@@ -80,7 +80,7 @@ class VLenUTF8(Codec):
             ndarray[object, ndim=1] input_values
             object[:] encoded_values
             int[:] encoded_lengths
-            char* encv
+            const char* encv
             bytes b
             bytearray out
             char* data
@@ -214,7 +214,7 @@ class VLenBytes(Codec):
             object[:] values
             object[:] normed_values
             int[:] lengths
-            char* encv
+            const char* encv
             object o
             bytes b
             bytearray out

From 739dd8850ee25596348c6a73f946ee873da278dc Mon Sep 17 00:00:00 2001
From: jakirkham <jakirkham@gmail.com>
Date: Wed, 9 Apr 2025 17:36:35 -0700
Subject: [PATCH 09/11] Inline pointer usage in `memcpy` calls

---
 numcodecs/vlen.pyx | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/numcodecs/vlen.pyx b/numcodecs/vlen.pyx
index c8aa7b17..6c04260c 100644
--- a/numcodecs/vlen.pyx
+++ b/numcodecs/vlen.pyx
@@ -80,7 +80,6 @@ class VLenUTF8(Codec):
             ndarray[object, ndim=1] input_values
             object[:] encoded_values
             int[:] encoded_lengths
-            const char* encv
             bytes b
             bytearray out
             char* data
@@ -124,8 +123,7 @@ class VLenUTF8(Codec):
             store_le32(<uint8_t*>data, l)
             data += HEADER_LENGTH
             b = encoded_values[i]
-            encv = b
-            memcpy(data, encv, l)
+            memcpy(data, <const char*>b, l)
             data += l
 
         return out
@@ -214,7 +212,6 @@ class VLenBytes(Codec):
             object[:] values
             object[:] normed_values
             int[:] lengths
-            const char* encv
             object o
             bytes b
             bytearray out
@@ -256,8 +253,7 @@ class VLenBytes(Codec):
             store_le32(<uint8_t*>data, l)
             data += HEADER_LENGTH
             b = normed_values[i]
-            encv = b
-            memcpy(data, encv, l)
+            memcpy(data, <const char*>b, l)
             data += l
 
         return out
@@ -359,7 +355,6 @@ class VLenArray(Codec):
             object[:] values
             object[:] normed_values
             int[:] lengths
-            const char* encv
             bytes b
             bytearray out
             char* data
@@ -411,9 +406,8 @@ class VLenArray(Codec):
 
             value_mv = normed_values[i]
             value_pb = PyMemoryView_GET_BUFFER(value_mv)
-            encv = <const char*>value_pb.buf
 
-            memcpy(data, encv, l)
+            memcpy(data, value_pb.buf, l)
             data += l
 
         return out

From a37b8f9024576d418d0acdfde148174e6f33b483 Mon Sep 17 00:00:00 2001
From: jakirkham <jakirkham@gmail.com>
Date: Wed, 9 Apr 2025 20:28:02 -0700
Subject: [PATCH 10/11] Consolidate `total_length` into `data_length`

These variables are nearly identical and only the total length is used.
As `data` is used elsewhere, keep the `data_length` name: initialize it
to `HEADER_LENGTH` so that it captures the value of `total_length`, and
just use `data_length` throughout.
---
 numcodecs/vlen.pyx | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/numcodecs/vlen.pyx b/numcodecs/vlen.pyx
index 6c04260c..36846f6e 100644
--- a/numcodecs/vlen.pyx
+++ b/numcodecs/vlen.pyx
@@ -76,7 +76,7 @@ class VLenUTF8(Codec):
     @cython.boundscheck(False)
     def encode(self, buf):
         cdef:
-            Py_ssize_t i, l, n_items, data_length, total_length
+            Py_ssize_t i, l, n_items, data_length
             ndarray[object, ndim=1] input_values
             object[:] encoded_values
             int[:] encoded_lengths
@@ -97,7 +97,7 @@ class VLenUTF8(Codec):
         encoded_lengths = np.empty(n_items, dtype=np.intc)
 
         # first iteration to convert to bytes
-        data_length = 0
+        data_length = HEADER_LENGTH
         for i in range(n_items):
             o = input_values[i]
             # replace missing value and coerce to typed data
@@ -109,8 +109,7 @@ class VLenUTF8(Codec):
             encoded_lengths[i] = l
 
         # setup output
-        total_length = HEADER_LENGTH + data_length
-        out = PyByteArray_FromStringAndSize(NULL, total_length)
+        out = PyByteArray_FromStringAndSize(NULL, data_length)
 
         # write header
         data = out
@@ -208,7 +207,7 @@ class VLenBytes(Codec):
     @cython.boundscheck(False)
     def encode(self, buf):
         cdef:
-            Py_ssize_t i, l, n_items, data_length, total_length
+            Py_ssize_t i, l, n_items, data_length
             object[:] values
             object[:] normed_values
             int[:] lengths
@@ -228,7 +227,7 @@ class VLenBytes(Codec):
         lengths = np.empty(n_items, dtype=np.intc)
 
         # first iteration to find lengths
-        data_length = 0
+        data_length = HEADER_LENGTH
         for i in range(n_items):
             o = values[i]
             # replace missing value and coerce to typed data
@@ -239,8 +238,7 @@ class VLenBytes(Codec):
             lengths[i] = l
 
         # setup output
-        total_length = HEADER_LENGTH + data_length
-        out = PyByteArray_FromStringAndSize(NULL, total_length)
+        out = PyByteArray_FromStringAndSize(NULL, data_length)
 
         # write header
         data = out
@@ -351,7 +349,7 @@ class VLenArray(Codec):
     @cython.boundscheck(False)
     def encode(self, buf):
         cdef:
-            Py_ssize_t i, l, n_items, data_length, total_length
+            Py_ssize_t i, l, n_items, data_length
             object[:] values
             object[:] normed_values
             int[:] lengths
@@ -373,7 +371,7 @@ class VLenArray(Codec):
         lengths = np.empty(n_items, dtype=np.intc)
 
         # first iteration to convert to bytes
-        data_length = 0
+        data_length = HEADER_LENGTH
         for i in range(n_items):
             o = values[i]
             # replace missing value and coerce to typed data
@@ -390,8 +388,7 @@ class VLenArray(Codec):
             lengths[i] = l
 
         # setup output
-        total_length = HEADER_LENGTH + data_length
-        out = PyByteArray_FromStringAndSize(NULL, total_length)
+        out = PyByteArray_FromStringAndSize(NULL, data_length)
 
         # write header
         data = out

From 644f147c200b921caed2240f2868ee076acd6089 Mon Sep 17 00:00:00 2001
From: jakirkham <jakirkham@gmail.com>
Date: Wed, 9 Apr 2025 20:38:35 -0700
Subject: [PATCH 11/11] Use upper case `L` to avoid confusion

---
 numcodecs/vlen.pyx | 80 +++++++++++++++++++++++-----------------------
 1 file changed, 40 insertions(+), 40 deletions(-)

diff --git a/numcodecs/vlen.pyx b/numcodecs/vlen.pyx
index 36846f6e..14530ecf 100644
--- a/numcodecs/vlen.pyx
+++ b/numcodecs/vlen.pyx
@@ -76,7 +76,7 @@ class VLenUTF8(Codec):
     @cython.boundscheck(False)
     def encode(self, buf):
         cdef:
-            Py_ssize_t i, l, n_items, data_length
+            Py_ssize_t i, L, n_items, data_length
             ndarray[object, ndim=1] input_values
             object[:] encoded_values
             int[:] encoded_lengths
@@ -103,10 +103,10 @@ class VLenUTF8(Codec):
             # replace missing value and coerce to typed data
             u = "" if o is None or o == 0 else o
             b = u.encode("utf-8")
-            l = len(b)
+            L = len(b)
             encoded_values[i] = b
-            data_length += l + HEADER_LENGTH
-            encoded_lengths[i] = l
+            data_length += L + HEADER_LENGTH
+            encoded_lengths[i] = L
 
         # setup output
         out = PyByteArray_FromStringAndSize(NULL, data_length)
@@ -118,12 +118,12 @@ class VLenUTF8(Codec):
         # second iteration, store data
         data += HEADER_LENGTH
         for i in range(n_items):
-            l = encoded_lengths[i]
-            store_le32(<uint8_t*>data, l)
+            L = encoded_lengths[i]
+            store_le32(<uint8_t*>data, L)
             data += HEADER_LENGTH
             b = encoded_values[i]
-            memcpy(data, <const char*>b, l)
-            data += l
+            memcpy(data, <const char*>b, L)
+            data += L
 
         return out
 
@@ -135,7 +135,7 @@ class VLenUTF8(Codec):
             const Py_buffer* buf_pb
             const char* data
             const char* data_end
-            Py_ssize_t i, l, n_items, data_length
+            Py_ssize_t i, L, n_items, data_length
 
         # obtain memoryview
         buf = ensure_contiguous_ndarray(buf)
@@ -166,12 +166,12 @@ class VLenUTF8(Codec):
         for i in range(n_items):
             if data + HEADER_LENGTH > data_end:
                 raise ValueError('corrupt buffer, data seem truncated')
-            l = load_le32(<uint8_t*>data)
+            L = load_le32(<uint8_t*>data)
             data += HEADER_LENGTH
-            if data + l > data_end:
+            if data + L > data_end:
                 raise ValueError('corrupt buffer, data seem truncated')
-            out[i] = PyUnicode_FromStringAndSize(data, l)
-            data += l
+            out[i] = PyUnicode_FromStringAndSize(data, L)
+            data += L
 
         return out
 
@@ -207,7 +207,7 @@ class VLenBytes(Codec):
     @cython.boundscheck(False)
     def encode(self, buf):
         cdef:
-            Py_ssize_t i, l, n_items, data_length
+            Py_ssize_t i, L, n_items, data_length
             object[:] values
             object[:] normed_values
             int[:] lengths
@@ -233,9 +233,9 @@ class VLenBytes(Codec):
             # replace missing value and coerce to typed data
             b = b"" if o is None or o == 0 else o
             normed_values[i] = b
-            l = len(b)
-            data_length += l + HEADER_LENGTH
-            lengths[i] = l
+            L = len(b)
+            data_length += HEADER_LENGTH + L
+            lengths[i] = L
 
         # setup output
         out = PyByteArray_FromStringAndSize(NULL, data_length)
@@ -247,12 +247,12 @@ class VLenBytes(Codec):
         # second iteration, store data
         data += HEADER_LENGTH
         for i in range(n_items):
-            l = lengths[i]
-            store_le32(<uint8_t*>data, l)
+            L = lengths[i]
+            store_le32(<uint8_t*>data, L)
             data += HEADER_LENGTH
             b = normed_values[i]
-            memcpy(data, <const char*>b, l)
-            data += l
+            memcpy(data, <const char*>b, L)
+            data += L
 
         return out
 
@@ -264,7 +264,7 @@ class VLenBytes(Codec):
             const Py_buffer* buf_pb
             const char* data
             const char* data_end
-            Py_ssize_t i, l, n_items, data_length
+            Py_ssize_t i, L, n_items, data_length
 
         # obtain memoryview
         buf = ensure_contiguous_ndarray(buf)
@@ -295,12 +295,12 @@ class VLenBytes(Codec):
         for i in range(n_items):
             if data + HEADER_LENGTH > data_end:
                 raise ValueError('corrupt buffer, data seem truncated')
-            l = load_le32(<uint8_t*>data)
+            L = load_le32(<uint8_t*>data)
             data += HEADER_LENGTH
-            if data + l > data_end:
+            if data + L > data_end:
                 raise ValueError('corrupt buffer, data seem truncated')
-            out[i] = PyBytes_FromStringAndSize(data, l)
-            data += l
+            out[i] = PyBytes_FromStringAndSize(data, L)
+            data += L
 
         return out
 
@@ -349,7 +349,7 @@ class VLenArray(Codec):
     @cython.boundscheck(False)
     def encode(self, buf):
         cdef:
-            Py_ssize_t i, l, n_items, data_length
+            Py_ssize_t i, L, n_items, data_length
             object[:] values
             object[:] normed_values
             int[:] lengths
@@ -382,10 +382,10 @@ class VLenArray(Codec):
             value_pb = PyMemoryView_GET_BUFFER(value_mv)
             if value_pb.ndim != 1:
                 raise ValueError("only 1-dimensional arrays are supported")
-            l = value_pb.len
+            L = value_pb.len
             normed_values[i] = value_mv
-            data_length += l + HEADER_LENGTH
-            lengths[i] = l
+            data_length += HEADER_LENGTH + L
+            lengths[i] = L
 
         # setup output
         out = PyByteArray_FromStringAndSize(NULL, data_length)
@@ -397,15 +397,15 @@ class VLenArray(Codec):
         # second iteration, store data
         data += HEADER_LENGTH
         for i in range(n_items):
-            l = lengths[i]
-            store_le32(<uint8_t*>data, l)
+            L = lengths[i]
+            store_le32(<uint8_t*>data, L)
             data += HEADER_LENGTH
 
             value_mv = normed_values[i]
             value_pb = PyMemoryView_GET_BUFFER(value_mv)
 
-            memcpy(data, value_pb.buf, l)
-            data += l
+            memcpy(data, value_pb.buf, L)
+            data += L
 
         return out
 
@@ -420,7 +420,7 @@ class VLenArray(Codec):
             object v
             memoryview v_mv
             Py_buffer* v_pb
-            Py_ssize_t i, l, n_items, data_length
+            Py_ssize_t i, L, n_items, data_length
 
         # obtain memoryview
         buf = ensure_contiguous_ndarray(buf)
@@ -451,18 +451,18 @@ class VLenArray(Codec):
         for i in range(n_items):
             if data + HEADER_LENGTH > data_end:
                 raise ValueError('corrupt buffer, data seem truncated')
-            l = load_le32(<uint8_t*>data)
+            L = load_le32(<uint8_t*>data)
             data += HEADER_LENGTH
-            if data + l > data_end:
+            if data + L > data_end:
                 raise ValueError('corrupt buffer, data seem truncated')
 
             # Create & fill array value
-            v = np.empty((l,), dtype="uint8").view(self.dtype)
+            v = np.empty((L,), dtype="uint8").view(self.dtype)
             v_mv = memoryview(v)
             v_pb = PyMemoryView_GET_BUFFER(v_mv)
-            memcpy(v_pb.buf, data, l)
+            memcpy(v_pb.buf, data, L)
 
             out[i] = v
-            data += l
+            data += L
 
         return out