Skip to content

Commit 96f4a79

Browse files
Hizuru3 and vstinner
authored and committed
gh-129569: The function unicodedata.normalize() always returns built-in str (GH-129570)
(cherry picked from commit c359fcd) Co-authored-by: Hizuru <[email protected]> Co-authored-by: Victor Stinner <[email protected]>
1 parent 6c4de32 commit 96f4a79

File tree

3 files changed

+29
-5
lines changed

3 files changed

+29
-5
lines changed

Lib/test/test_unicodedata.py

Lines changed: 23 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -440,6 +440,29 @@ def test_bug_834676(self):
440440
# Check for bug 834676
441441
unicodedata.normalize('NFC', '\ud55c\uae00')
442442

443+
def test_normalize_return_type(self):
444+
# gh-129569: normalize() return type must always be str
445+
normalize = unicodedata.normalize
446+
447+
class MyStr(str):
448+
pass
449+
450+
normalization_forms = ("NFC", "NFKC", "NFD", "NFKD")
451+
input_strings = (
452+
# normalized strings
453+
"",
454+
"ascii",
455+
# unnormalized strings
456+
"\u1e0b\u0323",
457+
"\u0071\u0307\u0323",
458+
)
459+
460+
for form in normalization_forms:
461+
for input_str in input_strings:
462+
with self.subTest(form=form, input_str=input_str):
463+
self.assertIs(type(normalize(form, input_str)), str)
464+
self.assertIs(type(normalize(form, MyStr(input_str))), str)
465+
443466

444467
if __name__ == "__main__":
445468
unittest.main()
Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
Fix :func:`unicodedata.normalize` to always return a built-in :class:`str` object when given an input of a :class:`str` subclass, regardless of whether the string is already normalized.

Modules/unicodedata.c

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -939,34 +939,34 @@ unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
939939
if (PyUnicode_GET_LENGTH(input) == 0) {
940940
/* Special case empty input strings, since resizing
941941
them later would cause internal errors. */
942-
return Py_NewRef(input);
942+
return PyUnicode_FromObject(input);
943943
}
944944

945945
if (PyUnicode_CompareWithASCIIString(form, "NFC") == 0) {
946946
if (is_normalized_quickcheck(self, input,
947947
true, false, true) == YES) {
948-
return Py_NewRef(input);
948+
return PyUnicode_FromObject(input);
949949
}
950950
return nfc_nfkc(self, input, 0);
951951
}
952952
if (PyUnicode_CompareWithASCIIString(form, "NFKC") == 0) {
953953
if (is_normalized_quickcheck(self, input,
954954
true, true, true) == YES) {
955-
return Py_NewRef(input);
955+
return PyUnicode_FromObject(input);
956956
}
957957
return nfc_nfkc(self, input, 1);
958958
}
959959
if (PyUnicode_CompareWithASCIIString(form, "NFD") == 0) {
960960
if (is_normalized_quickcheck(self, input,
961961
false, false, true) == YES) {
962-
return Py_NewRef(input);
962+
return PyUnicode_FromObject(input);
963963
}
964964
return nfd_nfkd(self, input, 0);
965965
}
966966
if (PyUnicode_CompareWithASCIIString(form, "NFKD") == 0) {
967967
if (is_normalized_quickcheck(self, input,
968968
false, true, true) == YES) {
969-
return Py_NewRef(input);
969+
return PyUnicode_FromObject(input);
970970
}
971971
return nfd_nfkd(self, input, 1);
972972
}

0 commit comments

Comments
 (0)