@@ -10,23 +10,10 @@ cimport cython
1010from libc.stdint cimport uint8_t, uint32_t
1111from libc.string cimport memcpy
1212
13- from cpython.buffer cimport PyBuffer_IsContiguous
14- from cpython.bytearray cimport (
15- PyByteArray_AS_STRING,
16- PyByteArray_FromStringAndSize,
17- )
18- from cpython.bytes cimport (
19- PyBytes_AS_STRING,
20- PyBytes_GET_SIZE,
21- PyBytes_Check,
22- PyBytes_FromStringAndSize,
23- )
13+ from cpython.bytearray cimport PyByteArray_FromStringAndSize
14+ from cpython.bytes cimport PyBytes_FromStringAndSize
2415from cpython.memoryview cimport PyMemoryView_GET_BUFFER
25- from cpython.unicode cimport (
26- PyUnicode_AsUTF8String,
27- PyUnicode_Check,
28- PyUnicode_FromStringAndSize,
29- )
16+ from cpython.unicode cimport PyUnicode_FromStringAndSize
3017
3118from numpy cimport ndarray
3219
@@ -89,15 +76,15 @@ class VLenUTF8(Codec):
8976 @ cython.boundscheck (False )
9077 def encode (self , buf ):
9178 cdef:
92- Py_ssize_t i, l , n_items, data_length, total_length
79+ Py_ssize_t i, L , n_items, data_length
9380 ndarray[object , ndim= 1 ] input_values
9481 object [:] encoded_values
9582 int [:] encoded_lengths
96- char * encv
9783 bytes b
9884 bytearray out
9985 char * data
100- object u
86+ object o
87+ unicode u
10188
10289 # normalise input
10390 input_values = np.asarray(buf, dtype = object ).reshape(- 1 , order = ' A' )
@@ -110,36 +97,33 @@ class VLenUTF8(Codec):
11097 encoded_lengths = np.empty(n_items, dtype = np.intc)
11198
11299 # first iteration to convert to bytes
113- data_length = 0
100+ data_length = HEADER_LENGTH
114101 for i in range (n_items):
115- u = input_values[i]
116- if u is None or u == 0 : # treat these as missing value, normalize
117- u = ' '
118- elif not PyUnicode_Check(u):
119- raise TypeError (' expected unicode string, found %r ' % u)
120- b = PyUnicode_AsUTF8String(u)
121- l = PyBytes_GET_SIZE(b)
102+ o = input_values[i]
103+ # replace missing value and coerce to typed data
104+ u = " " if o is None or o == 0 else o
105+ b = u.encode(" utf-8" )
106+ L = len (b)
122107 encoded_values[i] = b
123- data_length += l + HEADER_LENGTH
124- encoded_lengths[i] = l
108+ data_length += L + HEADER_LENGTH
109+ encoded_lengths[i] = L
125110
126111 # setup output
127- total_length = HEADER_LENGTH + data_length
128- out = PyByteArray_FromStringAndSize(NULL , total_length)
112+ out = PyByteArray_FromStringAndSize(NULL , data_length)
129113
130114 # write header
131- data = PyByteArray_AS_STRING( out)
115+ data = out
132116 store_le32(< uint8_t* > data, n_items)
133117
134118 # second iteration, store data
135119 data += HEADER_LENGTH
136120 for i in range (n_items):
137- l = encoded_lengths[i]
138- store_le32(< uint8_t* > data, l )
121+ L = encoded_lengths[i]
122+ store_le32(< uint8_t* > data, L )
139123 data += HEADER_LENGTH
140- encv = PyBytes_AS_STRING( encoded_values[i])
141- memcpy(data, encv, l )
142- data += l
124+ b = encoded_values[i]
125+ memcpy(data, < const char * > b, L )
126+ data += L
143127
144128 return out
145129
@@ -151,16 +135,14 @@ class VLenUTF8(Codec):
151135 const Py_buffer* buf_pb
152136 const char * data
153137 const char * data_end
154- Py_ssize_t i, l , n_items, data_length
138+ Py_ssize_t i, L , n_items, data_length
155139
156140 # obtain memoryview
157141 buf = ensure_contiguous_ndarray(buf)
158- buf_mv = memoryview (buf)
142+ buf_mv = ensure_continguous_memoryview (buf)
159143 buf_pb = PyMemoryView_GET_BUFFER(buf_mv)
160144
161145 # sanity checks
162- if not PyBuffer_IsContiguous(buf_pb, b' A' ):
163- raise BufferError(" `buf` must contain contiguous memory" )
164146 if buf_pb.len < HEADER_LENGTH:
165147 raise ValueError (' corrupt buffer, missing or truncated header' )
166148
@@ -184,12 +166,12 @@ class VLenUTF8(Codec):
184166 for i in range (n_items):
185167 if data + HEADER_LENGTH > data_end:
186168 raise ValueError (' corrupt buffer, data seem truncated' )
187- l = load_le32(< uint8_t* > data)
169+ L = load_le32(< uint8_t* > data)
188170 data += HEADER_LENGTH
189- if data + l > data_end:
171+ if data + L > data_end:
190172 raise ValueError (' corrupt buffer, data seem truncated' )
191- out[i] = PyUnicode_FromStringAndSize(data, l )
192- data += l
173+ out[i] = PyUnicode_FromStringAndSize(data, L )
174+ data += L
193175
194176 return out
195177
@@ -225,11 +207,12 @@ class VLenBytes(Codec):
225207 @ cython.boundscheck (False )
226208 def encode (self , buf ):
227209 cdef:
228- Py_ssize_t i, l , n_items, data_length, total_length
210+ Py_ssize_t i, L , n_items, data_length
229211 object [:] values
212+ object [:] normed_values
230213 int [:] lengths
231- char * encv
232- object b
214+ object o
215+ bytes b
233216 bytearray out
234217 char * data
235218
@@ -240,37 +223,36 @@ class VLenBytes(Codec):
240223 n_items = values.shape[0 ]
241224
242225 # setup intermediates
226+ normed_values = np.empty(n_items, dtype = object )
243227 lengths = np.empty(n_items, dtype = np.intc)
244228
245229 # first iteration to find lengths
246- data_length = 0
230+ data_length = HEADER_LENGTH
247231 for i in range (n_items):
248- b = values[i]
249- if b is None or b == 0 : # treat these as missing value, normalize
250- b = b' '
251- elif not PyBytes_Check(b):
252- raise TypeError (' expected byte string, found %r ' % b)
253- l = PyBytes_GET_SIZE(b)
254- data_length += l + HEADER_LENGTH
255- lengths[i] = l
232+ o = values[i]
233+ # replace missing value and coerce to typed data
234+ b = b" " if o is None or o == 0 else o
235+ normed_values[i] = b
236+ L = len (b)
237+ data_length += HEADER_LENGTH + L
238+ lengths[i] = L
256239
257240 # setup output
258- total_length = HEADER_LENGTH + data_length
259- out = PyByteArray_FromStringAndSize(NULL , total_length)
241+ out = PyByteArray_FromStringAndSize(NULL , data_length)
260242
261243 # write header
262- data = PyByteArray_AS_STRING( out)
244+ data = out
263245 store_le32(< uint8_t* > data, n_items)
264246
265247 # second iteration, store data
266248 data += HEADER_LENGTH
267249 for i in range (n_items):
268- l = lengths[i]
269- store_le32(< uint8_t* > data, l )
250+ L = lengths[i]
251+ store_le32(< uint8_t* > data, L )
270252 data += HEADER_LENGTH
271- encv = PyBytes_AS_STRING(values [i])
272- memcpy(data, encv, l )
273- data += l
253+ b = normed_values [i]
254+ memcpy(data, < const char * > b, L )
255+ data += L
274256
275257 return out
276258
@@ -282,16 +264,14 @@ class VLenBytes(Codec):
282264 const Py_buffer* buf_pb
283265 const char * data
284266 const char * data_end
285- Py_ssize_t i, l , n_items, data_length
267+ Py_ssize_t i, L , n_items, data_length
286268
287269 # obtain memoryview
288270 buf = ensure_contiguous_ndarray(buf)
289- buf_mv = memoryview (buf)
271+ buf_mv = ensure_continguous_memoryview (buf)
290272 buf_pb = PyMemoryView_GET_BUFFER(buf_mv)
291273
292274 # sanity checks
293- if not PyBuffer_IsContiguous(buf_pb, b' A' ):
294- raise BufferError(" `buf` must contain contiguous memory" )
295275 if buf_pb.len < HEADER_LENGTH:
296276 raise ValueError (' corrupt buffer, missing or truncated header' )
297277
@@ -315,12 +295,12 @@ class VLenBytes(Codec):
315295 for i in range (n_items):
316296 if data + HEADER_LENGTH > data_end:
317297 raise ValueError (' corrupt buffer, data seem truncated' )
318- l = load_le32(< uint8_t* > data)
298+ L = load_le32(< uint8_t* > data)
319299 data += HEADER_LENGTH
320- if data + l > data_end:
300+ if data + L > data_end:
321301 raise ValueError (' corrupt buffer, data seem truncated' )
322- out[i] = PyBytes_FromStringAndSize(data, l )
323- data += l
302+ out[i] = PyBytes_FromStringAndSize(data, L )
303+ data += L
324304
325305 return out
326306
@@ -369,17 +349,16 @@ class VLenArray(Codec):
369349 @ cython.boundscheck (False )
370350 def encode (self , buf ):
371351 cdef:
372- Py_ssize_t i, l , n_items, data_length, total_length
352+ Py_ssize_t i, L , n_items, data_length
373353 object [:] values
374354 object [:] normed_values
375355 int [:] lengths
376- const char * encv
377356 bytes b
378357 bytearray out
379358 char * data
380359 memoryview value_mv
381360 const Py_buffer* value_pb
382- object v
361+ object o
383362
384363 # normalise input
385364 values = np.asarray(buf, dtype = object ).reshape(- 1 , order = ' A' )
@@ -392,41 +371,41 @@ class VLenArray(Codec):
392371 lengths = np.empty(n_items, dtype = np.intc)
393372
394373 # first iteration to convert to bytes
395- data_length = 0
374+ data_length = HEADER_LENGTH
396375 for i in range (n_items):
397- v = values[i]
398- if v is None :
399- v = np.array([], dtype = self .dtype)
400- else :
401- v = np.ascontiguousarray(v, self .dtype)
402- if v.ndim != 1 :
403- raise ValueError (' only 1-dimensional arrays are supported' )
404- l = v.nbytes
405- normed_values[i] = v
406- data_length += l + HEADER_LENGTH
407- lengths[i] = l
376+ o = values[i]
377+ # replace missing value and coerce to typed data
378+ value_mv = ensure_continguous_memoryview(
379+ np.array([], dtype = self .dtype) if o is None
380+ else np.ascontiguousarray(o, self .dtype)
381+ )
382+ value_pb = PyMemoryView_GET_BUFFER(value_mv)
383+ if value_pb.ndim != 1 :
384+ raise ValueError (" only 1-dimensional arrays are supported" )
385+ L = value_pb.len
386+ normed_values[i] = value_mv
387+ data_length += HEADER_LENGTH + L
388+ lengths[i] = L
408389
409390 # setup output
410- total_length = HEADER_LENGTH + data_length
411- out = PyByteArray_FromStringAndSize(NULL , total_length)
391+ out = PyByteArray_FromStringAndSize(NULL , data_length)
412392
413393 # write header
414- data = PyByteArray_AS_STRING( out)
394+ data = out
415395 store_le32(< uint8_t* > data, n_items)
416396
417397 # second iteration, store data
418398 data += HEADER_LENGTH
419399 for i in range (n_items):
420- l = lengths[i]
421- store_le32(< uint8_t* > data, l )
400+ L = lengths[i]
401+ store_le32(< uint8_t* > data, L )
422402 data += HEADER_LENGTH
423403
424- value_mv = ensure_continguous_memoryview( normed_values[i])
404+ value_mv = normed_values[i]
425405 value_pb = PyMemoryView_GET_BUFFER(value_mv)
426- encv = < const char * > value_pb.buf
427406
428- memcpy(data, encv, l )
429- data += l
407+ memcpy(data, value_pb.buf, L )
408+ data += L
430409
431410 return out
432411
@@ -441,16 +420,14 @@ class VLenArray(Codec):
441420 object v
442421 memoryview v_mv
443422 Py_buffer* v_pb
444- Py_ssize_t i, l , n_items, data_length
423+ Py_ssize_t i, L , n_items, data_length
445424
446425 # obtain memoryview
447426 buf = ensure_contiguous_ndarray(buf)
448- buf_mv = memoryview (buf)
427+ buf_mv = ensure_continguous_memoryview (buf)
449428 buf_pb = PyMemoryView_GET_BUFFER(buf_mv)
450429
451430 # sanity checks
452- if not PyBuffer_IsContiguous(buf_pb, b' A' ):
453- raise BufferError(" `buf` must contain contiguous memory" )
454431 if buf_pb.len < HEADER_LENGTH:
455432 raise ValueError (' corrupt buffer, missing or truncated header' )
456433
@@ -474,18 +451,18 @@ class VLenArray(Codec):
474451 for i in range (n_items):
475452 if data + HEADER_LENGTH > data_end:
476453 raise ValueError (' corrupt buffer, data seem truncated' )
477- l = load_le32(< uint8_t* > data)
454+ L = load_le32(< uint8_t* > data)
478455 data += HEADER_LENGTH
479- if data + l > data_end:
456+ if data + L > data_end:
480457 raise ValueError (' corrupt buffer, data seem truncated' )
481458
482459 # Create & fill array value
483- v = np.empty((l ,), dtype = " uint8" ).view(self .dtype)
460+ v = np.empty((L ,), dtype = " uint8" ).view(self .dtype)
484461 v_mv = memoryview(v)
485462 v_pb = PyMemoryView_GET_BUFFER(v_mv)
486- memcpy(v_pb.buf, data, l )
463+ memcpy(v_pb.buf, data, L )
487464
488465 out[i] = v
489- data += l
466+ data += L
490467
491468 return out
0 commit comments