66#include "librt_internal.h"
77
#define START_SIZE 512

// Inclusive value ranges for the fixed-size int encodings. See the
// "int format" comment above read_int_internal() for the motivation.
// Stored payloads are biased by the MIN_* value of their width, so each
// range is skewed towards positive numbers.
// Negative values are parenthesized so the macros stay safe in any expression.
#define MIN_ONE_BYTE_INT (-10)
#define MAX_ONE_BYTE_INT 117  // 2 ** 7 - 1 - 10
#define MIN_TWO_BYTES_INT (-100)
#define MAX_TWO_BYTES_INT 16283  // 2 ** (8 + 6) - 1 - 100
#define MIN_FOUR_BYTES_INT (-10000)
#define MAX_FOUR_BYTES_INT 536860911  // 2 ** (3 * 8 + 5) - 1 - 10000

// Tag bits kept in the low bits of the first encoded byte.
#define TWO_BYTES_INT_BIT 1
#define FOUR_BYTES_INT_BIT 2
#define LONG_INT_BIT 4

// Complete low-bit patterns: 011 marks a four-byte int, and a first byte
// equal to 00000111 marks an int serialized as a string.
#define FOUR_BYTES_INT_TRAILER 3
#define LONG_INT_TRAILER 7

// Error sentinels for functions whose result is returned as a char.
#define CPY_BOOL_ERROR 2
#define CPY_NONE_ERROR 2
@@ -39,7 +45,6 @@ typedef struct {
3945 Py_ssize_t end ;
4046 Py_ssize_t size ;
4147 char * buf ;
42- PyObject * source ;
4348} BufferObject ;
4449
4550static PyTypeObject BufferType ;
@@ -253,26 +258,47 @@ write_bool(PyObject *self, PyObject *const *args, size_t nargs, PyObject *kwname
253258}
254259
255260/*
256- str format: size followed by UTF-8 bytes
257- short strings (len <= 127): single byte for size as `(uint8_t)size << 1`
258- long strings: \x01 followed by size as Py_ssize_t
261+ str format: size as int (see below) followed by UTF-8 bytes
259262*/
260263
264+ static inline CPyTagged
265+ _read_short_int (PyObject * data , uint8_t first ) {
266+ uint8_t second ;
267+ uint16_t two_more ;
268+ if ((first & TWO_BYTES_INT_BIT ) == 0 ) {
269+ // Note we use tagged ints since this function can return an error.
270+ return ((Py_ssize_t )(first >> 1 ) + MIN_ONE_BYTE_INT ) << 1 ;
271+ }
272+ if ((first & FOUR_BYTES_INT_BIT ) == 0 ) {
273+ _CHECK_READ (data , 1 , CPY_INT_TAG )
274+ second = _READ (data , uint8_t )
275+ return ((((Py_ssize_t )second ) << 6 ) + (Py_ssize_t )(first >> 2 ) + MIN_TWO_BYTES_INT ) << 1 ;
276+ }
277+ // The caller is responsible to verify this is called only for short ints.
278+ _CHECK_READ (data , 3 , CPY_INT_TAG )
279+ // TODO: check if compilers emit optimal code for these two reads, and tweak if needed.
280+ second = _READ (data , uint8_t )
281+ two_more = _READ (data , uint16_t )
282+ Py_ssize_t higher = (((Py_ssize_t )two_more ) << 13 ) + (((Py_ssize_t )second ) << 5 );
283+ return (higher + (Py_ssize_t )(first >> 3 ) + MIN_FOUR_BYTES_INT ) << 1 ;
284+ }
285+
261286static PyObject *
262287read_str_internal (PyObject * data ) {
263288 _CHECK_BUFFER (data , NULL )
264289
265290 // Read string length.
266- Py_ssize_t size ;
267291 _CHECK_READ (data , 1 , NULL )
268292 uint8_t first = _READ (data , uint8_t )
269- if (likely (first != LONG_STR_TAG )) {
270- // Common case: short string (len <= 127).
271- size = (Py_ssize_t )(first >> 1 );
272- } else {
273- _CHECK_READ (data , sizeof (CPyTagged ), NULL )
274- size = _READ (data , Py_ssize_t )
293+ if (unlikely (first == LONG_INT_TRAILER )) {
294+ // Fail fast for invalid/tampered data.
295+ PyErr_SetString (PyExc_ValueError , "invalid str size" );
296+ return NULL ;
275297 }
298+ CPyTagged tagged_size = _read_short_int (data , first );
299+ if (tagged_size == CPY_INT_TAG )
300+ return NULL ;
301+ Py_ssize_t size = tagged_size >> 1 ;
276302 // Read string content.
277303 char * buf = ((BufferObject * )data )-> buf ;
278304 _CHECK_READ (data , size , NULL )
@@ -296,6 +322,24 @@ read_str(PyObject *self, PyObject *const *args, size_t nargs, PyObject *kwnames)
296322 return read_str_internal (data );
297323}
298324
325+ static inline char
326+ _write_short_int (PyObject * data , Py_ssize_t real_value ) {
327+ if (real_value >= MIN_ONE_BYTE_INT && real_value <= MAX_ONE_BYTE_INT ) {
328+ _CHECK_SIZE (data , 1 )
329+ _WRITE (data , uint8_t , (uint8_t )(real_value - MIN_ONE_BYTE_INT ) << 1 )
330+ ((BufferObject * )data )-> end += 1 ;
331+ } else if (real_value >= MIN_TWO_BYTES_INT && real_value <= MAX_TWO_BYTES_INT ) {
332+ _CHECK_SIZE (data , 2 )
333+ _WRITE (data , uint16_t , ((uint16_t )(real_value - MIN_TWO_BYTES_INT ) << 2 ) | TWO_BYTES_INT_BIT )
334+ ((BufferObject * )data )-> end += 2 ;
335+ } else {
336+ _CHECK_SIZE (data , 4 )
337+ _WRITE (data , uint32_t , ((uint32_t )(real_value - MIN_FOUR_BYTES_INT ) << 3 ) | FOUR_BYTES_INT_TRAILER )
338+ ((BufferObject * )data )-> end += 4 ;
339+ }
340+ return CPY_NONE ;
341+ }
342+
299343static char
300344write_str_internal (PyObject * data , PyObject * value ) {
301345 _CHECK_BUFFER (data , CPY_NONE_ERROR )
@@ -305,24 +349,20 @@ write_str_internal(PyObject *data, PyObject *value) {
305349 if (unlikely (chunk == NULL ))
306350 return CPY_NONE_ERROR ;
307351
308- Py_ssize_t need ;
309352 // Write string length.
310- if (likely (size <= MAX_SHORT_LEN )) {
311- // Common case: short string (len <= 127) store as single byte.
312- need = size + 1 ;
313- _CHECK_SIZE (data , need )
314- _WRITE (data , uint8_t , (uint8_t )size << 1 )
353+ if (likely (size >= MIN_FOUR_BYTES_INT && size <= MAX_FOUR_BYTES_INT )) {
354+ if (_write_short_int (data , size ) == CPY_NONE_ERROR )
355+ return CPY_NONE_ERROR ;
315356 } else {
316- need = size + sizeof (Py_ssize_t ) + 1 ;
317- _CHECK_SIZE (data , need )
318- _WRITE (data , uint8_t , LONG_STR_TAG )
319- _WRITE (data , Py_ssize_t , size )
357+ PyErr_SetString (PyExc_ValueError , "str too long to serialize" );
358+ return CPY_NONE_ERROR ;
320359 }
321360 // Write string content.
361+ _CHECK_SIZE (data , size )
322362 char * buf = ((BufferObject * )data )-> buf ;
323363 memcpy (buf + ((BufferObject * )data )-> pos , chunk , size );
324364 ((BufferObject * )data )-> pos += size ;
325- ((BufferObject * )data )-> end += need ;
365+ ((BufferObject * )data )-> end += size ;
326366 return CPY_NONE ;
327367}
328368
@@ -347,26 +387,25 @@ write_str(PyObject *self, PyObject *const *args, size_t nargs, PyObject *kwnames
347387}
348388
349389/*
350- bytes format: size followed by bytes
351- short bytes (len <= 127): single byte for size as `(uint8_t)size << 1`
352- long bytes: \x01 followed by size as Py_ssize_t
390+ bytes format: size as int (see below) followed by bytes
353391*/
354392
355393static PyObject *
356394read_bytes_internal (PyObject * data ) {
357395 _CHECK_BUFFER (data , NULL )
358396
359397 // Read length.
360- Py_ssize_t size ;
361398 _CHECK_READ (data , 1 , NULL )
362399 uint8_t first = _READ (data , uint8_t )
363- if (likely (first != LONG_STR_TAG )) {
364- // Common case: short bytes (len <= 127).
365- size = (Py_ssize_t )(first >> 1 );
366- } else {
367- _CHECK_READ (data , sizeof (CPyTagged ), NULL )
368- size = _READ (data , Py_ssize_t )
400+ if (unlikely (first == LONG_INT_TRAILER )) {
401+ // Fail fast for invalid/tampered data.
402+ PyErr_SetString (PyExc_ValueError , "invalid bytes size" );
403+ return NULL ;
369404 }
405+ CPyTagged tagged_size = _read_short_int (data , first );
406+ if (tagged_size == CPY_INT_TAG )
407+ return NULL ;
408+ Py_ssize_t size = tagged_size >> 1 ;
370409 // Read bytes content.
371410 char * buf = ((BufferObject * )data )-> buf ;
372411 _CHECK_READ (data , size , NULL )
@@ -399,24 +438,20 @@ write_bytes_internal(PyObject *data, PyObject *value) {
399438 return CPY_NONE_ERROR ;
400439 Py_ssize_t size = PyBytes_GET_SIZE (value );
401440
402- Py_ssize_t need ;
403441 // Write length.
404- if (likely (size <= MAX_SHORT_LEN )) {
405- // Common case: short bytes (len <= 127) store as single byte.
406- need = size + 1 ;
407- _CHECK_SIZE (data , need )
408- _WRITE (data , uint8_t , (uint8_t )size << 1 )
442+ if (likely (size >= MIN_FOUR_BYTES_INT && size <= MAX_FOUR_BYTES_INT )) {
443+ if (_write_short_int (data , size ) == CPY_NONE_ERROR )
444+ return CPY_NONE_ERROR ;
409445 } else {
410- need = size + sizeof (Py_ssize_t ) + 1 ;
411- _CHECK_SIZE (data , need )
412- _WRITE (data , uint8_t , LONG_STR_TAG )
413- _WRITE (data , Py_ssize_t , size )
446+ PyErr_SetString (PyExc_ValueError , "bytes too long to serialize" );
447+ return CPY_NONE_ERROR ;
414448 }
415449 // Write bytes content.
450+ _CHECK_SIZE (data , size )
416451 char * buf = ((BufferObject * )data )-> buf ;
417452 memcpy (buf + ((BufferObject * )data )-> pos , chunk , size );
418453 ((BufferObject * )data )-> pos += size ;
419- ((BufferObject * )data )-> end += need ;
454+ ((BufferObject * )data )-> end += size ;
420455 return CPY_NONE ;
421456}
422457
@@ -449,7 +484,7 @@ static double
449484read_float_internal (PyObject * data ) {
450485 _CHECK_BUFFER (data , CPY_FLOAT_ERROR )
451486 _CHECK_READ (data , sizeof (double ), CPY_FLOAT_ERROR )
452- double res = _READ (data , double );
487+ double res = _READ (data , double )
453488 return res ;
454489}
455490
@@ -499,9 +534,13 @@ write_float(PyObject *self, PyObject *const *args, size_t nargs, PyObject *kwnam
499534
500535/*
501536int format:
502- most common values (-10 <= value <= 117): single byte as `(uint8_t)(value + 10) << 1`
503- medium values (fit in CPyTagged): \x01 followed by CPyTagged value
504- long values (very rare): \x03 followed by decimal string (see str format)
537+ one byte: last bit 0, 7 bits used
538+ two bytes: last two bits 01, 14 bits used
539+ four bytes: last three bits 011, 29 bits used
540+ everything else: 00000111 followed by serialized string representation
541+
542+ Note: for fixed size formats we skew ranges towards more positive values,
543+ since negative integers are much more rare.
505544*/
506545
507546static CPyTagged
@@ -510,22 +549,18 @@ read_int_internal(PyObject *data) {
510549 _CHECK_READ (data , 1 , CPY_INT_TAG )
511550
512551 uint8_t first = _READ (data , uint8_t )
513- if ((first & MEDIUM_INT_TAG ) == 0 ) {
514- // Most common case: int that is small in absolute value.
515- return ((Py_ssize_t )(first >> 1 ) + MIN_SHORT_INT ) << 1 ;
516- }
517- if (first == MEDIUM_INT_TAG ) {
518- _CHECK_READ (data , sizeof (CPyTagged ), CPY_INT_TAG )
519- CPyTagged ret = _READ (data , CPyTagged )
520- return ret ;
552+ if (likely (first != LONG_INT_TRAILER )) {
553+ return _read_short_int (data , first );
521554 }
522555 // People who have literal ints not fitting in size_t should be punished :-)
523556 PyObject * str_ret = read_str_internal (data );
524557 if (unlikely (str_ret == NULL ))
525558 return CPY_INT_TAG ;
526559 PyObject * ret_long = PyLong_FromUnicodeObject (str_ret , 10 );
527560 Py_DECREF (str_ret );
528- return ((CPyTagged )ret_long ) | CPY_INT_TAG ;
561+ if (ret_long == NULL )
562+ return CPY_INT_TAG ;
563+ return CPyTagged_StealFromObject (ret_long );
529564}
530565
531566static PyObject *
@@ -543,36 +578,38 @@ read_int(PyObject *self, PyObject *const *args, size_t nargs, PyObject *kwnames)
543578 return CPyTagged_StealAsObject (retval );
544579}
545580
581+ static inline char
582+ _write_long_int (PyObject * data , CPyTagged value ) {
583+ // TODO(jukka): write a more compact/optimal format for arbitrary length ints.
584+ _CHECK_SIZE (data , 1 )
585+ _WRITE (data , uint8_t , LONG_INT_TRAILER )
586+ ((BufferObject * )data )-> end += 1 ;
587+ PyObject * int_value = CPyTagged_AsObject (value );
588+ if (unlikely (int_value == NULL ))
589+ return CPY_NONE_ERROR ;
590+ PyObject * str_value = PyObject_Str (int_value );
591+ Py_DECREF (int_value );
592+ if (unlikely (str_value == NULL ))
593+ return CPY_NONE_ERROR ;
594+ char res = write_str_internal (data , str_value );
595+ Py_DECREF (str_value );
596+ return res ;
597+ }
598+
546599static char
547600write_int_internal (PyObject * data , CPyTagged value ) {
548601 _CHECK_BUFFER (data , CPY_NONE_ERROR )
549602
550603 if (likely ((value & CPY_INT_TAG ) == 0 )) {
551604 Py_ssize_t real_value = CPyTagged_ShortAsSsize_t (value );
552- if (real_value >= MIN_SHORT_INT && real_value <= MAX_SHORT_INT ) {
553- // Most common case: int that is small in absolute value.
554- _CHECK_SIZE (data , 1 )
555- _WRITE (data , uint8_t , (uint8_t )(real_value - MIN_SHORT_INT ) << 1 )
556- ((BufferObject * )data )-> end += 1 ;
605+ if (likely (real_value >= MIN_FOUR_BYTES_INT && real_value <= MAX_FOUR_BYTES_INT )) {
606+ return _write_short_int (data , real_value );
557607 } else {
558- _CHECK_SIZE (data , sizeof (CPyTagged ) + 1 )
559- _WRITE (data , uint8_t , MEDIUM_INT_TAG )
560- _WRITE (data , CPyTagged , value )
561- ((BufferObject * )data )-> end += sizeof (CPyTagged ) + 1 ;
608+ return _write_long_int (data , value );
562609 }
563610 } else {
564- _CHECK_SIZE (data , 1 )
565- _WRITE (data , uint8_t , LONG_INT_TAG )
566- ((BufferObject * )data )-> end += 1 ;
567- PyObject * str_value = PyObject_Str (CPyTagged_LongAsObject (value ));
568- if (unlikely (str_value == NULL ))
569- return CPY_NONE_ERROR ;
570- char res = write_str_internal (data , str_value );
571- Py_DECREF (str_value );
572- if (unlikely (res == CPY_NONE_ERROR ))
573- return CPY_NONE_ERROR ;
611+ return _write_long_int (data , value );
574612 }
575- return CPY_NONE ;
576613}
577614
578615static PyObject *
0 commit comments