Skip to content

Commit 298a320

Browse files
gh-130567: Fix locale.strxfrm() failure on FreeBSD
Fix locale.strxfrm() failure on FreeBSD and DragonFlyBSD for strings containing characters 'Å' (U+00C5 LATIN CAPITAL LETTER A WITH RING ABOVE) or 'Å' (U+212B ANGSTROM SIGN).
1 parent 4ed046c commit 298a320

File tree

3 files changed

+49
-2
lines changed

3 files changed

+49
-2
lines changed

Lib/locale.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,23 @@ def setlocale(category, value=None):
9797
if 'strcoll' not in globals():
9898
strcoll = _strcoll
9999

100+
if sys.platform.startswith(('freebsd', 'dragonfly')):
    # On FreeBSD and DragonFly BSD, wcsxfrm() fails with EINVAL for
    # 'Å' (U+00C5 LATIN CAPITAL LETTER A WITH RING ABOVE) and
    # 'Å' (U+212B ANGSTROM SIGN) on non-C locales.
    # As a workaround, replace them with
    # 'å' (U+00E5 LATIN SMALL LETTER A WITH RING ABOVE).
    # To preserve the relative order of these characters according to
    # wcscoll(), add a digit 0-2.
    #
    # NOTE: sys.platform is the lowercased ``uname -s`` name followed by the
    # major release number, so DragonFly BSD reports e.g. 'dragonfly6'.
    # The prefix tested here must therefore be 'dragonfly'; 'dragonflybsd'
    # would never match and the workaround would stay disabled there.
    _strxfrm = strxfrm

    def strxfrm(string, /):
        """Return a string that can be used as a key for locale-aware
        comparisons.

        Thin wrapper around the platform strxfrm(): in a non-C collation
        locale, the characters U+00E5, U+00C5 and U+212B are rewritten to
        U+00E5 followed by '0', '1' or '2' respectively before the string
        is passed to the underlying implementation.
        """
        # The cheap isascii() test comes first so pure-ASCII strings never
        # pay for the setlocale() query or the substring scans.
        if (not string.isascii() and
            _setlocale(LC_COLLATE) not in ('C', 'C.UTF-8', 'POSIX') and
            ('\xe5' in string or '\xc5' in string or '\u212b' in string)):
            # U+00E5 itself must be tagged first: the two replacements that
            # follow introduce new U+00E5 characters that must not be
            # tagged a second time.
            string = string.replace('\xe5', '\xe50')
            string = string.replace('\xc5', '\xe51')
            string = string.replace('\u212b', '\xe52')
        return _strxfrm(string)
100117

101118
_localeconv = localeconv
102119

Lib/test/test_locale.py

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -332,12 +332,14 @@ def test_strcoll(self):
332332
self.assertLess(locale.strcoll('a', 'b'), 0)
333333
self.assertEqual(locale.strcoll('a', 'a'), 0)
334334
self.assertGreater(locale.strcoll('b', 'a'), 0)
335+
self.assertLess(locale.strcoll('A', 'B'), 0)
335336
# embedded null character
336337
self.assertRaises(ValueError, locale.strcoll, 'a\0', 'a')
337338
self.assertRaises(ValueError, locale.strcoll, 'a', 'a\0')
338339

339340
def test_strxfrm(self):
    # Transformed keys must keep plain ASCII letters in order, for both
    # the lowercase and the uppercase pair.
    for smaller, larger in (('a', 'b'), ('A', 'B')):
        self.assertLess(locale.strxfrm(smaller), locale.strxfrm(larger))
    # embedded null character
    with self.assertRaises(ValueError):
        locale.strxfrm('a\0')
343345

@@ -351,8 +353,7 @@ def setUp(self):
351353
enc = codecs.lookup(locale.getencoding() or 'ascii').name
352354
if enc not in ('utf-8', 'iso8859-1', 'cp1252'):
353355
raise unittest.SkipTest('encoding not suitable')
354-
if enc != 'iso8859-1' and (sys.platform == 'darwin' or is_android or
355-
sys.platform.startswith('freebsd')):
356+
if enc != 'iso8859-1' and (sys.platform == 'darwin' or is_android):
356357
raise unittest.SkipTest('wcscoll/wcsxfrm have known bugs')
357358
BaseLocalizedTest.setUp(self)
358359

@@ -363,6 +364,10 @@ def setUp(self):
363364
"gh-124108: NetBSD doesn't support UTF-8 for LC_COLLATE")
364365
def test_strcoll_with_diacritic(self):
    # Every accented letter (including both code points that render as
    # 'Å': U+00C5 and U+212B) is compared with the next base letter of the
    # same case and must collate before it.
    ordered_pairs = (
        ('à', 'b'),
        ('À', 'B'),
        ('å', 'b'),
        ('\xc5', 'B'),
        ('\u212b', 'B'),
    )
    for accented, following in ordered_pairs:
        self.assertLess(locale.strcoll(accented, following), 0)
366371

367372
@unittest.skipIf(sys.platform.startswith('aix'),
368373
'bpo-29972: broken test on AIX')
@@ -371,6 +376,28 @@ def test_strcoll_with_diacritic(self):
371376
"gh-124108: NetBSD doesn't support UTF-8 for LC_COLLATE")
372377
def test_strxfrm_with_diacritic(self):
    # The strxfrm() key of each accented letter must sort before the key
    # of the next base letter of the same case.
    xfrm = locale.strxfrm
    for accented, following in (
        ('à', 'b'),
        ('À', 'B'),
        ('å', 'b'),
        # gh-130567: Should not fail with OSError EINVAL.
        ('\xc5', 'B'),
        ('\u212b', 'B'),
    ):
        self.assertLess(xfrm(accented), xfrm(following))
384+
385+
def test_strxfrm_strcoll_consistency(self):
    # Only meaningful when the locale encoding is UTF-8; skip otherwise.
    encoding = codecs.lookup(locale.getencoding() or 'ascii').name
    if encoding != 'utf-8':
        self.skipTest('strcoll() and strxfrm() can be inconsistent on non-UTF-8 locale')

    pairs = (
        ('à', 'À'),
        ('å', '\xc5'),  # 'Å' U+00C5 LATIN CAPITAL LETTER A WITH RING ABOVE
        ('å', '\u212b'),  # 'Å' U+212B ANGSTROM SIGN
        ('\xc5', '\u212b'),
    )
    for left, right in pairs:
        # Whatever ordering strcoll() reports for the pair, comparing the
        # strxfrm() keys must report the same ordering.
        order = locale.strcoll(left, right)
        if order < 0:
            compare = self.assertLess
        elif order > 0:
            compare = self.assertGreater
        else:
            compare = self.assertEqual
        compare(locale.strxfrm(left), locale.strxfrm(right))
374401

375402

376403
class NormalizeTest(unittest.TestCase):
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Fix :func:`locale.strxfrm` failure on FreeBSD and DragonFlyBSD for strings
2+
containing characters 'Å' (U+00C5 LATIN CAPITAL LETTER A WITH RING ABOVE) or
3+
'Å' (U+212B ANGSTROM SIGN).

0 commit comments

Comments
 (0)