-
-
Notifications
You must be signed in to change notification settings - Fork 33.2k
gh-101178: refactor base64.b85encode to be memory friendly #112248
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 5 commits
84e20ea
74fc245
60a3ae6
aaa09e1
cb46a5d
c88450b
2fc892c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1239,13 +1239,104 @@ binascii_b2a_qp_impl(PyObject *module, Py_buffer *data, int quotetabs, | |
| return rv; | ||
| } | ||
|
|
||
| /*[clinic input] | ||
| binascii.b2a_base85 | ||
|
|
||
| data: Py_buffer | ||
| chars: Py_buffer | ||
| pad: bool = False | ||
| foldnuls: bool = False | ||
| foldspaces: bool = False | ||
|
|
||
| Utility method used by the base64 module to encode a85/b85 data | ||
|
|
||
| data: bytes | ||
| chars: 85 bytes conversion table | ||
| pad: use NULL-paded input if necessary | ||
| foldnuls: replace NULL chunks by 'z' | ||
| foldspaces: replace space-only chucks by 'y' | ||
|
|
||
| [clinic start generated code]*/ | ||
|
|
||
| static PyObject * | ||
| binascii_b2a_base85_impl(PyObject *module, Py_buffer *data, Py_buffer *chars, | ||
| int pad, int foldnuls, int foldspaces) | ||
| /*[clinic end generated code: output=0a92b3c535580aa0 input=a2d8ae712ed5adba]*/ | ||
| { | ||
| if (chars->len != 85) { | ||
| PyErr_SetString(PyExc_ValueError, | ||
| "chars must be exactly 85 bytes long"); | ||
| return NULL; | ||
| } | ||
|
|
||
| _PyBytesWriter writer; | ||
| _PyBytesWriter_Init(&writer); | ||
|
|
||
| const size_t bin_len = data->len; | ||
|
|
||
| // Allocate up to maxium encoded length, adjusted at end | ||
| const size_t ascii_len = ((bin_len + 3) / 4) * 5; | ||
|
|
||
| unsigned char *ascii_data = _PyBytesWriter_Alloc(&writer, ascii_len); | ||
| if (ascii_data == NULL) { | ||
| PyErr_NoMemory(); | ||
| return NULL; | ||
| } | ||
|
|
||
| const unsigned char *table = chars->buf; | ||
| const unsigned char *bin_data = data->buf; | ||
|
|
||
| size_t i = 0 ; | ||
| int padding = 0; | ||
|
|
||
| while (i < bin_len) { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would also credit the git implementation for this one as it's heavily based on it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Credit added. I don't know if I should phrase it differently? I've also added some other comments to try to explain the logic more |
||
| // translate each 4 byte chunk to 32bit integer | ||
| uint32_t value = 0; | ||
| for (int cnt = 24; cnt >= 0; cnt -= 8) { | ||
| value |= bin_data[i] << cnt; | ||
| if (++i == bin_len) { | ||
| // Number of bytes under the 4 bytes rounded value | ||
| padding = cnt / 8; | ||
| break; | ||
| } | ||
| } | ||
|
|
||
| if (foldnuls && value == 0) { | ||
| *ascii_data++ = 'z'; | ||
| } | ||
| else if (foldspaces && value == 0x20202020) { | ||
| *ascii_data++ = 'y'; | ||
| } | ||
| else { | ||
| for (int j = 4; j >= 0; j--) { | ||
| ascii_data[j] = table[value % 85]; | ||
| value /= 85; | ||
| } | ||
| ascii_data += 5; | ||
| } | ||
| } | ||
|
|
||
| if (padding && !pad && foldnuls && ascii_data[-1] == 'z') { | ||
| ascii_data--; | ||
| memset(ascii_data, table[0], 5); | ||
| ascii_data += 5; | ||
| } | ||
|
|
||
| if (!pad) { | ||
| ascii_data -= padding; | ||
| } | ||
|
|
||
| return _PyBytesWriter_Finish(&writer, ascii_data); | ||
| } | ||
|
|
||
| /* List of functions defined in the module */ | ||
|
|
||
| static struct PyMethodDef binascii_module_methods[] = { | ||
| BINASCII_A2B_UU_METHODDEF | ||
| BINASCII_B2A_UU_METHODDEF | ||
| BINASCII_A2B_BASE64_METHODDEF | ||
| BINASCII_B2A_BASE64_METHODDEF | ||
| BINASCII_B2A_BASE85_METHODDEF | ||
| BINASCII_A2B_HEX_METHODDEF | ||
| BINASCII_B2A_HEX_METHODDEF | ||
| BINASCII_HEXLIFY_METHODDEF | ||
|
|
||
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It would feel weird not to have
binascii.a2b_base85so I would suggest keeping it private for now. Ideally,base64should have its own C accelerator module but maybe it's an overkill.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Renamed to private
That was what delayed me initially because I had no idea how to add a module dedicated to base64, I found out that it did use the binascii one only last week