88#include "librt_internal.h"
99
#define START_SIZE 512

// Value ranges for the fixed-size int encodings. Each range is shifted by a
// small negative offset so that a few negative values still fit; see the
// "int format" comment above read_int_internal() for the motivation.
// Negative constants are parenthesized so they expand safely in any expression.
#define MIN_ONE_BYTE_INT (-10)
#define MAX_ONE_BYTE_INT 117  // 2 ** 7 - 1 - 10
#define MIN_TWO_BYTES_INT (-100)
#define MAX_TWO_BYTES_INT 16283  // 2 ** (8 + 6) - 1 - 100
#define MIN_FOUR_BYTES_INT (-10000)
#define MAX_FOUR_BYTES_INT 536860911  // 2 ** (3 * 8 + 5) - 1 - 10000

// Marker bits in the lowest bits of the first encoded byte.
#define TWO_BYTES_INT_BIT 1
#define FOUR_BYTES_INT_BIT 2
#define LONG_INT_BIT 4

// Low three bits (011) of the first byte of a four-byte int.
#define FOUR_BYTES_INT_TRAILER 3
// We add one reserved bit here so that we can potentially support
// 8 bytes format in the future.
#define LONG_INT_TRAILER 15

#define CPY_BOOL_ERROR 2
#define CPY_NONE_ERROR 2
// Store value `v` of type `type` at the current position of the buffer object
// `data` and advance `pos` by sizeof(type).
//
// The store goes through memcpy because `pos` is an arbitrary byte offset:
// writing through a casted `type *` would be a misaligned access for
// multi-byte types. The expansion is a single braced block, so call sites
// keep using it without a trailing semicolon and it stays one statement.
// Note: `data` is evaluated more than once; pass a plain variable.
#define _WRITE(data, type, v) \
    { \
        type _write_tmp = (v); \
        memcpy(((BufferObject *)(data))->buf + ((BufferObject *)(data))->pos, \
               &_write_tmp, sizeof(type)); \
        ((BufferObject *)(data))->pos += sizeof(type); \
    }
3745
#if PY_BIG_ENDIAN
// Swap the two bytes of a 16-bit value.
uint16_t reverse_16(uint16_t number) {
    // The operands promote to int; the cast makes the narrowing explicit.
    return (uint16_t)((number << 8) | (number >> 8));
}

// Reverse the byte order of a 32-bit value.
uint32_t reverse_32(uint32_t number) {
    uint32_t lowest = (number & 0xFF) << 24;
    uint32_t lower = (number & 0xFF00) << 8;
    uint32_t higher = (number & 0xFF0000) >> 8;
    uint32_t highest = number >> 24;
    return lowest | lower | higher | highest;
}
#endif
55+
3856typedef struct {
3957 PyObject_HEAD
4058 Py_ssize_t pos ;
4159 Py_ssize_t end ;
4260 Py_ssize_t size ;
4361 char * buf ;
44- PyObject * source ;
4562} BufferObject ;
4663
4764static PyTypeObject BufferType ;
@@ -259,26 +276,50 @@ write_bool(PyObject *self, PyObject *const *args, size_t nargs, PyObject *kwname
259276}
260277
/*
str format: size as int (see the "int format" comment below) followed by UTF-8 bytes
*/
266281
282+ static inline CPyTagged
283+ _read_short_int (PyObject * data , uint8_t first ) {
284+ uint8_t second ;
285+ uint16_t two_more ;
286+ if ((first & TWO_BYTES_INT_BIT ) == 0 ) {
287+ // Note we use tagged ints since this function can return an error.
288+ return ((Py_ssize_t )(first >> 1 ) + MIN_ONE_BYTE_INT ) << 1 ;
289+ }
290+ if ((first & FOUR_BYTES_INT_BIT ) == 0 ) {
291+ _CHECK_READ (data , 1 , CPY_INT_TAG )
292+ second = _READ (data , uint8_t )
293+ return ((((Py_ssize_t )second ) << 6 ) + (Py_ssize_t )(first >> 2 ) + MIN_TWO_BYTES_INT ) << 1 ;
294+ }
295+ // The caller is responsible to verify this is called only for short ints.
296+ _CHECK_READ (data , 3 , CPY_INT_TAG )
297+ // TODO: check if compilers emit optimal code for these two reads, and tweak if needed.
298+ second = _READ (data , uint8_t )
299+ two_more = _READ (data , uint16_t )
300+ #if PY_BIG_ENDIAN
301+ two_more = reverse_16 (two_more );
302+ #endif
303+ Py_ssize_t higher = (((Py_ssize_t )two_more ) << 13 ) + (((Py_ssize_t )second ) << 5 );
304+ return (higher + (Py_ssize_t )(first >> 3 ) + MIN_FOUR_BYTES_INT ) << 1 ;
305+ }
306+
267307static PyObject *
268308read_str_internal (PyObject * data ) {
269309 _CHECK_BUFFER (data , NULL )
270310
271311 // Read string length.
272- Py_ssize_t size ;
273312 _CHECK_READ (data , 1 , NULL )
274313 uint8_t first = _READ (data , uint8_t )
275- if (likely (first != LONG_STR_TAG )) {
276- // Common case: short string (len <= 127).
277- size = (Py_ssize_t )(first >> 1 );
278- } else {
279- _CHECK_READ (data , sizeof (CPyTagged ), NULL )
280- size = _READ (data , Py_ssize_t )
314+ if (unlikely (first == LONG_INT_TRAILER )) {
315+ // Fail fast for invalid/tampered data.
316+ PyErr_SetString (PyExc_ValueError , "invalid str size" );
317+ return NULL ;
281318 }
319+ CPyTagged tagged_size = _read_short_int (data , first );
320+ if (tagged_size == CPY_INT_TAG )
321+ return NULL ;
322+ Py_ssize_t size = tagged_size >> 1 ;
282323 // Read string content.
283324 char * buf = ((BufferObject * )data )-> buf ;
284325 _CHECK_READ (data , size , NULL )
@@ -302,6 +343,35 @@ read_str(PyObject *self, PyObject *const *args, size_t nargs, PyObject *kwnames)
302343 return read_str_internal (data );
303344}
304345
346+ // The caller *must* check that real_value is within allowed range (29 bits).
347+ static inline char
348+ _write_short_int (PyObject * data , Py_ssize_t real_value ) {
349+ if (real_value >= MIN_ONE_BYTE_INT && real_value <= MAX_ONE_BYTE_INT ) {
350+ _CHECK_SIZE (data , 1 )
351+ _WRITE (data , uint8_t , (uint8_t )(real_value - MIN_ONE_BYTE_INT ) << 1 )
352+ ((BufferObject * )data )-> end += 1 ;
353+ } else if (real_value >= MIN_TWO_BYTES_INT && real_value <= MAX_TWO_BYTES_INT ) {
354+ _CHECK_SIZE (data , 2 )
355+ #if PY_BIG_ENDIAN
356+ uint16_t to_write = ((uint16_t )(real_value - MIN_TWO_BYTES_INT ) << 2 ) | TWO_BYTES_INT_BIT ;
357+ _WRITE (data , uint16_t , reverse_16 (to_write ))
358+ #else
359+ _WRITE (data , uint16_t , ((uint16_t )(real_value - MIN_TWO_BYTES_INT ) << 2 ) | TWO_BYTES_INT_BIT )
360+ #endif
361+ ((BufferObject * )data )-> end += 2 ;
362+ } else {
363+ _CHECK_SIZE (data , 4 )
364+ #if PY_BIG_ENDIAN
365+ uint32_t to_write = ((uint32_t )(real_value - MIN_FOUR_BYTES_INT ) << 3 ) | FOUR_BYTES_INT_TRAILER ;
366+ _WRITE (data , uint32_t , reverse_32 (to_write ))
367+ #else
368+ _WRITE (data , uint32_t , ((uint32_t )(real_value - MIN_FOUR_BYTES_INT ) << 3 ) | FOUR_BYTES_INT_TRAILER )
369+ #endif
370+ ((BufferObject * )data )-> end += 4 ;
371+ }
372+ return CPY_NONE ;
373+ }
374+
305375static char
306376write_str_internal (PyObject * data , PyObject * value ) {
307377 _CHECK_BUFFER (data , CPY_NONE_ERROR )
@@ -311,24 +381,20 @@ write_str_internal(PyObject *data, PyObject *value) {
311381 if (unlikely (chunk == NULL ))
312382 return CPY_NONE_ERROR ;
313383
314- Py_ssize_t need ;
315384 // Write string length.
316- if (likely (size <= MAX_SHORT_LEN )) {
317- // Common case: short string (len <= 127) store as single byte.
318- need = size + 1 ;
319- _CHECK_SIZE (data , need )
320- _WRITE (data , uint8_t , (uint8_t )size << 1 )
385+ if (likely (size >= MIN_FOUR_BYTES_INT && size <= MAX_FOUR_BYTES_INT )) {
386+ if (_write_short_int (data , size ) == CPY_NONE_ERROR )
387+ return CPY_NONE_ERROR ;
321388 } else {
322- need = size + sizeof (Py_ssize_t ) + 1 ;
323- _CHECK_SIZE (data , need )
324- _WRITE (data , uint8_t , LONG_STR_TAG )
325- _WRITE (data , Py_ssize_t , size )
389+ PyErr_SetString (PyExc_ValueError , "str too long to serialize" );
390+ return CPY_NONE_ERROR ;
326391 }
327392 // Write string content.
393+ _CHECK_SIZE (data , size )
328394 char * buf = ((BufferObject * )data )-> buf ;
329395 memcpy (buf + ((BufferObject * )data )-> pos , chunk , size );
330396 ((BufferObject * )data )-> pos += size ;
331- ((BufferObject * )data )-> end += need ;
397+ ((BufferObject * )data )-> end += size ;
332398 return CPY_NONE ;
333399}
334400
@@ -353,26 +419,25 @@ write_str(PyObject *self, PyObject *const *args, size_t nargs, PyObject *kwnames
353419}
354420
/*
bytes format: size as int (see the "int format" comment below) followed by bytes
*/
360424
361425static PyObject *
362426read_bytes_internal (PyObject * data ) {
363427 _CHECK_BUFFER (data , NULL )
364428
365429 // Read length.
366- Py_ssize_t size ;
367430 _CHECK_READ (data , 1 , NULL )
368431 uint8_t first = _READ (data , uint8_t )
369- if (likely (first != LONG_STR_TAG )) {
370- // Common case: short bytes (len <= 127).
371- size = (Py_ssize_t )(first >> 1 );
372- } else {
373- _CHECK_READ (data , sizeof (CPyTagged ), NULL )
374- size = _READ (data , Py_ssize_t )
432+ if (unlikely (first == LONG_INT_TRAILER )) {
433+ // Fail fast for invalid/tampered data.
434+ PyErr_SetString (PyExc_ValueError , "invalid bytes size" );
435+ return NULL ;
375436 }
437+ CPyTagged tagged_size = _read_short_int (data , first );
438+ if (tagged_size == CPY_INT_TAG )
439+ return NULL ;
440+ Py_ssize_t size = tagged_size >> 1 ;
376441 // Read bytes content.
377442 char * buf = ((BufferObject * )data )-> buf ;
378443 _CHECK_READ (data , size , NULL )
@@ -405,24 +470,20 @@ write_bytes_internal(PyObject *data, PyObject *value) {
405470 return CPY_NONE_ERROR ;
406471 Py_ssize_t size = PyBytes_GET_SIZE (value );
407472
408- Py_ssize_t need ;
409473 // Write length.
410- if (likely (size <= MAX_SHORT_LEN )) {
411- // Common case: short bytes (len <= 127) store as single byte.
412- need = size + 1 ;
413- _CHECK_SIZE (data , need )
414- _WRITE (data , uint8_t , (uint8_t )size << 1 )
474+ if (likely (size >= MIN_FOUR_BYTES_INT && size <= MAX_FOUR_BYTES_INT )) {
475+ if (_write_short_int (data , size ) == CPY_NONE_ERROR )
476+ return CPY_NONE_ERROR ;
415477 } else {
416- need = size + sizeof (Py_ssize_t ) + 1 ;
417- _CHECK_SIZE (data , need )
418- _WRITE (data , uint8_t , LONG_STR_TAG )
419- _WRITE (data , Py_ssize_t , size )
478+ PyErr_SetString (PyExc_ValueError , "bytes too long to serialize" );
479+ return CPY_NONE_ERROR ;
420480 }
421481 // Write bytes content.
482+ _CHECK_SIZE (data , size )
422483 char * buf = ((BufferObject * )data )-> buf ;
423484 memcpy (buf + ((BufferObject * )data )-> pos , chunk , size );
424485 ((BufferObject * )data )-> pos += size ;
425- ((BufferObject * )data )-> end += need ;
486+ ((BufferObject * )data )-> end += size ;
426487 return CPY_NONE ;
427488}
428489
@@ -455,7 +516,7 @@ static double
455516read_float_internal (PyObject * data ) {
456517 _CHECK_BUFFER (data , CPY_FLOAT_ERROR )
457518 _CHECK_READ (data , sizeof (double ), CPY_FLOAT_ERROR )
458- double res = _READ (data , double );
519+ double res = _READ (data , double )
459520 return res ;
460521}
461522
@@ -505,9 +566,13 @@ write_float(PyObject *self, PyObject *const *args, size_t nargs, PyObject *kwnam
505566
/*
int format:
    one byte: last bit 0, 7 bits used
    two bytes: last two bits 01, 14 bits used
    four bytes: last three bits 011, 29 bits used
    everything else: 00001111 followed by serialized string representation

Note: for fixed size formats we skew ranges towards more positive values,
since negative integers are much rarer.
*/
513578static CPyTagged
@@ -516,22 +581,17 @@ read_int_internal(PyObject *data) {
516581 _CHECK_READ (data , 1 , CPY_INT_TAG )
517582
518583 uint8_t first = _READ (data , uint8_t )
519- if ((first & MEDIUM_INT_TAG ) == 0 ) {
520- // Most common case: int that is small in absolute value.
521- return ((Py_ssize_t )(first >> 1 ) + MIN_SHORT_INT ) << 1 ;
522- }
523- if (first == MEDIUM_INT_TAG ) {
524- _CHECK_READ (data , sizeof (CPyTagged ), CPY_INT_TAG )
525- CPyTagged ret = _READ (data , CPyTagged )
526- return ret ;
584+ if (likely (first != LONG_INT_TRAILER )) {
585+ return _read_short_int (data , first );
527586 }
528- // People who have literal ints not fitting in size_t should be punished :-)
529587 PyObject * str_ret = read_str_internal (data );
530588 if (unlikely (str_ret == NULL ))
531589 return CPY_INT_TAG ;
532590 PyObject * ret_long = PyLong_FromUnicodeObject (str_ret , 10 );
533591 Py_DECREF (str_ret );
534- return ((CPyTagged )ret_long ) | CPY_INT_TAG ;
592+ if (ret_long == NULL )
593+ return CPY_INT_TAG ;
594+ return CPyTagged_StealFromObject (ret_long );
535595}
536596
537597static PyObject *
@@ -549,36 +609,38 @@ read_int(PyObject *self, PyObject *const *args, size_t nargs, PyObject *kwnames)
549609 return CPyTagged_StealAsObject (retval );
550610}
551611
612+ static inline char
613+ _write_long_int (PyObject * data , CPyTagged value ) {
614+ // TODO(jukka): write a more compact/optimal format for arbitrary length ints.
615+ _CHECK_SIZE (data , 1 )
616+ _WRITE (data , uint8_t , LONG_INT_TRAILER )
617+ ((BufferObject * )data )-> end += 1 ;
618+ PyObject * int_value = CPyTagged_AsObject (value );
619+ if (unlikely (int_value == NULL ))
620+ return CPY_NONE_ERROR ;
621+ PyObject * str_value = PyObject_Str (int_value );
622+ Py_DECREF (int_value );
623+ if (unlikely (str_value == NULL ))
624+ return CPY_NONE_ERROR ;
625+ char res = write_str_internal (data , str_value );
626+ Py_DECREF (str_value );
627+ return res ;
628+ }
629+
552630static char
553631write_int_internal (PyObject * data , CPyTagged value ) {
554632 _CHECK_BUFFER (data , CPY_NONE_ERROR )
555633
556634 if (likely ((value & CPY_INT_TAG ) == 0 )) {
557635 Py_ssize_t real_value = CPyTagged_ShortAsSsize_t (value );
558- if (real_value >= MIN_SHORT_INT && real_value <= MAX_SHORT_INT ) {
559- // Most common case: int that is small in absolute value.
560- _CHECK_SIZE (data , 1 )
561- _WRITE (data , uint8_t , (uint8_t )(real_value - MIN_SHORT_INT ) << 1 )
562- ((BufferObject * )data )-> end += 1 ;
636+ if (likely (real_value >= MIN_FOUR_BYTES_INT && real_value <= MAX_FOUR_BYTES_INT )) {
637+ return _write_short_int (data , real_value );
563638 } else {
564- _CHECK_SIZE (data , sizeof (CPyTagged ) + 1 )
565- _WRITE (data , uint8_t , MEDIUM_INT_TAG )
566- _WRITE (data , CPyTagged , value )
567- ((BufferObject * )data )-> end += sizeof (CPyTagged ) + 1 ;
639+ return _write_long_int (data , value );
568640 }
569641 } else {
570- _CHECK_SIZE (data , 1 )
571- _WRITE (data , uint8_t , LONG_INT_TAG )
572- ((BufferObject * )data )-> end += 1 ;
573- PyObject * str_value = PyObject_Str (CPyTagged_LongAsObject (value ));
574- if (unlikely (str_value == NULL ))
575- return CPY_NONE_ERROR ;
576- char res = write_str_internal (data , str_value );
577- Py_DECREF (str_value );
578- if (unlikely (res == CPY_NONE_ERROR ))
579- return CPY_NONE_ERROR ;
642+ return _write_long_int (data , value );
580643 }
581- return CPY_NONE ;
582644}
583645
584646static PyObject *
0 commit comments