Skip to content

Commit 7213139

Browse files
authored
Use more efficient serialization format for long integers in cache files (#20151)
A long integer (one that doesn't fit in the 4-byte encoding) is now encoded as follows:

* an initial header byte
* a short integer (1-4 bytes) encoding the number of data bytes and the sign
* a variable-length sequence of data bytes holding the absolute value of the integer -- all bits are used

For example, a 32-bit integer can now always be encoded using at most 6 bytes (plus the type tag).

This is optimized for size efficiency, not performance, since large integers are not expected to be a performance bottleneck. Having an efficient format makes it easier to improve performance in the future, however, without changing the encoding.

The header byte has a few unused bits which could be used to slightly improve efficiency, but I decided that it's not worth the extra complexity.
1 parent c52b17a commit 7213139

File tree

2 files changed

+120
-15
lines changed

2 files changed

+120
-15
lines changed

mypyc/lib-rt/librt_internal.c

Lines changed: 94 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -592,14 +592,35 @@ read_int_internal(PyObject *data) {
592592
if (likely(first != LONG_INT_TRAILER)) {
593593
return _read_short_int(data, first);
594594
}
595-
PyObject *str_ret = read_str_internal(data);
596-
if (unlikely(str_ret == NULL))
595+
596+
// Long integer encoding -- byte length and sign, followed by a byte array.
597+
598+
// Read byte length and sign.
599+
_CHECK_READ(data, 1, CPY_INT_TAG)
600+
first = _READ(data, uint8_t)
601+
Py_ssize_t size_and_sign = _read_short_int(data, first);
602+
if (size_and_sign == CPY_INT_TAG)
597603
return CPY_INT_TAG;
598-
PyObject* ret_long = PyLong_FromUnicodeObject(str_ret, 10);
599-
Py_DECREF(str_ret);
600-
if (ret_long == NULL)
604+
bool sign = (size_and_sign >> 1) & 1;
605+
Py_ssize_t size = size_and_sign >> 2;
606+
607+
// Construct an int object from the byte array.
608+
_CHECK_READ(data, size, CPY_INT_TAG)
609+
char *buf = ((BufferObject *)data)->buf;
610+
PyObject *num = _PyLong_FromByteArray(
611+
(unsigned char *)(buf + ((BufferObject *)data)->pos), size, 1, 0);
612+
if (num == NULL)
601613
return CPY_INT_TAG;
602-
return CPyTagged_StealFromObject(ret_long);
614+
((BufferObject *)data)->pos += size;
615+
if (sign) {
616+
PyObject *old = num;
617+
num = PyNumber_Negative(old);
618+
Py_DECREF(old);
619+
if (num == NULL) {
620+
return CPY_INT_TAG;
621+
}
622+
}
623+
return CPyTagged_StealFromObject(num);
603624
}
604625

605626
static PyObject*
@@ -617,22 +638,81 @@ read_int(PyObject *self, PyObject *const *args, size_t nargs, PyObject *kwnames)
617638
return CPyTagged_StealAsObject(retval);
618639
}
619640

641+
642+
// Convert a single hexadecimal digit character to its numeric value.
//
// Accepts '0'-'9', 'a'-'f' and 'A'-'F' and returns a value in the range 0..15.
// Any other character yields -1, so a malformed digit is detectable by the
// caller instead of silently turning into an arbitrary number.
static inline int hex_to_int(char c) {
    if (c >= '0' && c <= '9') {
        return c - '0';
    }
    if (c >= 'a' && c <= 'f') {
        return c - 'a' + 10;
    }
    if (c >= 'A' && c <= 'F') {
        return c - 'A' + 10;
    }
    return -1;  // Not a hex digit
}
650+
620651
// Serialize an integer that is too large for the short int encodings.
//
// Layout written to the buffer: the LONG_INT_TRAILER header byte, then a short
// int holding (byte count << 1) | sign, then the absolute value as bytes, least
// significant byte first.
//
// Returns CPY_NONE on success and CPY_NONE_ERROR on failure.
//
// NOTE(review): the header byte is already in the buffer when a later step
// fails, so a failed call leaves a partial record behind.
static inline char
621652
_write_long_int(PyObject *data, CPyTagged value) {
622-
// TODO(jukka): write a more compact/optimal format for arbitrary length ints.
623653
_CHECK_SIZE(data, 1)
624654
_WRITE(data, uint8_t, LONG_INT_TRAILER)
625655
((BufferObject *)data)->end += 1;
656+
657+
// Starts out NULL so the shared error path can release it unconditionally.
PyObject *hex_str = NULL;
626658
PyObject* int_value = CPyTagged_AsObject(value);
627659
if (unlikely(int_value == NULL))
628-
return CPY_NONE_ERROR;
629-
PyObject *str_value = PyObject_Str(int_value);
660+
goto error;
661+
662+
// Render the value in base 16; the digits are turned into bytes below.
hex_str = PyNumber_ToBase(int_value, 16);
663+
if (hex_str == NULL)
664+
goto error;
630665
Py_DECREF(int_value);
631-
if (unlikely(str_value == NULL))
632-
return CPY_NONE_ERROR;
633-
char res = write_str_internal(data, str_value);
634-
Py_DECREF(str_value);
635-
return res;
666+
// Cleared so the error path does not release the reference a second time.
int_value = NULL;
667+
668+
const char *str = PyUnicode_AsUTF8(hex_str);
669+
if (str == NULL)
670+
goto error;
671+
Py_ssize_t len = strlen(str);
672+
bool neg;
673+
// A leading '-' is recorded as the sign and dropped from the digit string.
if (str[0] == '-') {
674+
str++;
675+
len--;
676+
neg = true;
677+
} else {
678+
neg = false;
679+
}
680+
// Skip the 0x hex prefix.
681+
str += 2;
682+
len -= 2;
683+
684+
// Write bytes encoded length and sign.
685+
// Two hex digits per byte, rounded up when the digit count is odd.
Py_ssize_t size = (len + 1) / 2;
686+
// Bit 0 carries the sign; the remaining bits carry the byte count.
Py_ssize_t encoded_size = (size << 1) | neg;
687+
if (encoded_size <= MAX_FOUR_BYTES_INT) {
688+
if (_write_short_int(data, encoded_size) == CPY_NONE_ERROR)
689+
goto error;
690+
} else {
691+
PyErr_SetString(PyExc_ValueError, "int too long to serialize");
692+
goto error;
693+
}
694+
695+
// Write absolute integer value as byte array in a variable-length little endian format.
696+
// NOTE(review): i is an int while len is a Py_ssize_t; the length check above
// is presumably what keeps len within int range -- confirm.
int i;
697+
// Walk the digit string from its end, combining two hex digits into each byte.
for (i = len; i > 1; i -= 2) {
698+
if (write_tag_internal(
699+
data, hex_to_int(str[i - 1]) | (hex_to_int(str[i - 2]) << 4)) == CPY_NONE_ERROR)
700+
goto error;
701+
}
702+
// The final byte may correspond to only one hex digit.
703+
if (i == 1) {
704+
if (write_tag_internal(data, hex_to_int(str[i - 1])) == CPY_NONE_ERROR)
705+
goto error;
706+
}
707+
708+
Py_DECREF(hex_str);
709+
return CPY_NONE;
710+
711+
// Shared cleanup: release whichever references are still held.
error:
712+
713+
Py_XDECREF(int_value);
714+
Py_XDECREF(hex_str);
715+
return CPY_NONE_ERROR;
636716
}
637717

638718
static char

mypyc/test-data/run-classes.test

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2806,13 +2806,36 @@ def test_buffer_int_size() -> None:
28062806

28072807
def test_buffer_int_powers() -> None:
28082808
# Round-trip 2**p, 2**p - 1, -(2**p) and -(2**p) + 1 through a Buffer for each p.
# 0, 1, 2 are tested above
2809-
for p in range(2, 9):
2809+
for p in range(2, 200):
28102810
b = Buffer()
28112811
write_int(b, 1 << p)
2812+
write_int(b, (1 << p) - 1)
28122813
write_int(b, -1 << p)
2814+
write_int(b, (-1 << p) + 1)
28132815
# Re-open the written bytes and read the values back in the order they were written.
b = Buffer(b.getvalue())
28142816
assert read_int(b) == 1 << p
2817+
assert read_int(b) == (1 << p) - 1
28152818
assert read_int(b) == -1 << p
2819+
assert read_int(b) == (-1 << p) + 1
2820+
2821+
def test_positive_long_int_serialized_bytes() -> None:
2822+
# Pin the exact bytes written for a positive long int, then check it reads back.
b = Buffer()
2823+
n = 0x123456789ab
2824+
write_int(b, n)
2825+
x = b.getvalue()
2826+
# Two prefix bytes, followed by little endian encoded integer in variable-length format
# (six data bytes here; the top byte 0x01 comes from a single hex digit).
2827+
assert x == b"\x0f\x2c\xab\x89\x67\x45\x23\x01"
2828+
b = Buffer(x)
2829+
assert read_int(b) == n
2830+
2831+
def test_negative_long_int_serialized_bytes() -> None:
2832+
# Pin the exact bytes written for a negative long int, then check it reads back.
b = Buffer()
2833+
n = -0x123456789abcde
2834+
write_int(b, n)
2835+
x = b.getvalue()
2836+
# Two prefix bytes, followed by the seven bytes of the absolute value, least
# significant byte first; the sign is not stored in the data bytes.
assert x == b"\x0f\x32\xde\xbc\x9a\x78\x56\x34\x12"
2837+
b = Buffer(x)
2838+
assert read_int(b) == n
28162839

28172840
def test_buffer_str_size() -> None:
28182841
for s in ("", "a", "a" * 117):
@@ -2837,6 +2860,8 @@ test_buffer_roundtrip()
28372860
test_buffer_int_size()
28382861
test_buffer_str_size()
28392862
test_buffer_int_powers()
2863+
test_positive_long_int_serialized_bytes()
2864+
test_negative_long_int_serialized_bytes()
28402865

28412866
def test_buffer_basic_interpreted() -> None:
28422867
b = Buffer(b"foo")

0 commit comments

Comments
 (0)