Skip to content

Commit f4ed162

Browse files
fix(sort): Enable locale-aware collation for UTF-8 locales (#9176)
* fix(sort): Enable locale-aware collation for UTF-8 locales. Fixes #9148. The sort implementation had locale support infrastructure (ICU collator), but it was never used because the fast_lexicographic optimization bypassed all locale-aware code.
1 parent 8d17c3f commit f4ed162

File tree

5 files changed

+156
-11
lines changed

5 files changed

+156
-11
lines changed

src/uu/sort/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@ workspace = true
1919
[lib]
2020
path = "src/sort.rs"
2121

22+
[features]
23+
i18n-collator = ["uucore/i18n-collator"]
24+
2225
[dependencies]
2326
bigdecimal = { workspace = true }
2427
binary-heap-plus = { workspace = true }
@@ -39,6 +42,7 @@ uucore = { workspace = true, features = [
3942
"parser-size",
4043
"version-cmp",
4144
"i18n-decimal",
45+
"i18n-collator",
4246
] }
4347
fluent = { workspace = true }
4448

src/uu/sort/src/sort.rs

Lines changed: 51 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ use chunks::LineData;
2323
use clap::builder::ValueParser;
2424
use clap::{Arg, ArgAction, ArgMatches, Command};
2525
use custom_str_cmp::custom_str_cmp;
26+
2627
use ext_sort::ext_sort;
2728
use fnv::FnvHasher;
2829
use numeric_str_cmp::{NumInfo, NumInfoParseSettings, human_numeric_str_cmp, numeric_str_cmp};
@@ -47,6 +48,8 @@ use uucore::error::{FromIo, strip_errno};
4748
use uucore::error::{UError, UResult, USimpleError, UUsageError};
4849
use uucore::extendedbigdecimal::ExtendedBigDecimal;
4950
use uucore::format_usage;
51+
#[cfg(feature = "i18n-collator")]
52+
use uucore::i18n::collator::locale_cmp;
5053
use uucore::i18n::decimal::locale_decimal_separator;
5154
use uucore::line_ending::LineEnding;
5255
use uucore::parser::num_parser::{ExtendedParser, ExtendedParserError};
@@ -318,7 +321,10 @@ impl GlobalSettings {
318321
/// Precompute some data needed for sorting.
319322
/// This function **must** be called before starting to sort, and `GlobalSettings` may not be altered
320323
/// afterwards.
321-
fn init_precomputed(&mut self) {
324+
///
325+
/// When i18n-collator is enabled, `disable_fast_lexicographic` should be set to true if we're
326+
/// in a UTF-8 locale (to force locale-aware collation instead of byte comparison).
327+
fn init_precomputed(&mut self, disable_fast_lexicographic: bool) {
322328
self.precomputed.needs_tokens = self.selectors.iter().any(|s| s.needs_tokens);
323329
self.precomputed.selections_per_line =
324330
self.selectors.iter().filter(|s| s.needs_selection).count();
@@ -333,11 +339,15 @@ impl GlobalSettings {
333339
.filter(|s| matches!(s.settings.mode, SortMode::GeneralNumeric))
334340
.count();
335341

336-
self.precomputed.fast_lexicographic = self.can_use_fast_lexicographic();
342+
self.precomputed.fast_lexicographic =
343+
!disable_fast_lexicographic && self.can_use_fast_lexicographic();
337344
self.precomputed.fast_ascii_insensitive = self.can_use_fast_ascii_insensitive();
338345
}
339346

340347
/// Returns true when the fast lexicographic path can be used safely.
348+
/// Note: When i18n-collator is enabled, the caller must have already determined
349+
/// whether locale-aware collation is needed (via checking if we're in a UTF-8 locale).
350+
/// This check is performed in uumain() before init_precomputed() is called.
341351
fn can_use_fast_lexicographic(&self) -> bool {
342352
self.mode == SortMode::Default
343353
&& !self.ignore_case
@@ -2065,7 +2075,15 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
20652075
emit_debug_warnings(&settings, &global_flags, &legacy_warnings);
20662076
}
20672077

2068-
settings.init_precomputed();
2078+
// Initialize locale collation if needed (UTF-8 locales)
2079+
// This MUST happen before init_precomputed(): its result is passed in to decide whether the fast byte-wise lexicographic path must be disabled
2080+
#[cfg(feature = "i18n-collator")]
2081+
let needs_locale_collation = uucore::i18n::collator::init_locale_collation();
2082+
2083+
#[cfg(not(feature = "i18n-collator"))]
2084+
let needs_locale_collation = false;
2085+
2086+
settings.init_precomputed(needs_locale_collation);
20692087

20702088
let result = exec(&mut files, &settings, output, &mut tmp_dir);
20712089
// Wait here if `SIGINT` was received,
@@ -2446,13 +2464,36 @@ fn compare_by<'a>(
24462464
}
24472465
SortMode::Month => month_compare(a_str, b_str),
24482466
SortMode::Version => version_cmp(a_str, b_str),
2449-
SortMode::Default => custom_str_cmp(
2450-
a_str,
2451-
b_str,
2452-
settings.ignore_non_printing,
2453-
settings.dictionary_order,
2454-
settings.ignore_case,
2455-
),
2467+
SortMode::Default => {
2468+
// Use locale-aware comparison if feature is enabled and no custom flags are set
2469+
#[cfg(feature = "i18n-collator")]
2470+
{
2471+
if settings.ignore_case
2472+
|| settings.dictionary_order
2473+
|| settings.ignore_non_printing
2474+
{
2475+
custom_str_cmp(
2476+
a_str,
2477+
b_str,
2478+
settings.ignore_non_printing,
2479+
settings.dictionary_order,
2480+
settings.ignore_case,
2481+
)
2482+
} else {
2483+
locale_cmp(a_str, b_str)
2484+
}
2485+
}
2486+
#[cfg(not(feature = "i18n-collator"))]
2487+
{
2488+
custom_str_cmp(
2489+
a_str,
2490+
b_str,
2491+
settings.ignore_non_printing,
2492+
settings.dictionary_order,
2493+
settings.ignore_case,
2494+
)
2495+
}
2496+
}
24562497
};
24572498
if cmp != Ordering::Equal {
24582499
return if settings.reverse { cmp.reverse() } else { cmp };

src/uucore/src/lib/features/i18n/collator.rs

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,45 @@ pub fn init_collator(opts: CollatorOptions) {
3030
.expect("Collator already initialized");
3131
}
3232

33+
/// Initialize the collator for locale-aware string comparison if needed.
34+
///
35+
/// This function checks if the current locale requires locale-aware collation
36+
/// (UTF-8 encoding) and initializes the ICU collator with appropriate settings
37+
/// if necessary. For C/POSIX locales, no initialization is needed as byte
38+
/// comparison is sufficient.
39+
///
40+
/// # Returns
41+
///
42+
/// `false` if the locale encoding is not UTF-8 (e.g. C/POSIX; no initialization
43+
/// is needed). Otherwise, the result of `try_init_collator` for the UTF-8 locale.
44+
///
45+
/// # Example
46+
///
47+
/// ```
48+
/// use uucore::i18n::collator::init_locale_collation;
49+
///
50+
/// if init_locale_collation() {
51+
/// // Using locale-aware collation
52+
/// } else {
53+
/// // Using byte comparison (C/POSIX locale)
54+
/// }
55+
/// ```
56+
pub fn init_locale_collation() -> bool {
57+
use crate::i18n::{UEncoding, get_locale_encoding};
58+
59+
// Check if we need locale-aware collation
60+
if get_locale_encoding() != UEncoding::Utf8 {
61+
// C/POSIX locale - no collator needed
62+
return false;
63+
}
64+
65+
// UTF-8 locale - initialize collator with Shifted mode to match GNU behavior
66+
let mut opts = CollatorOptions::default();
67+
opts.alternate_handling = Some(AlternateHandling::Shifted);
68+
69+
try_init_collator(opts)
70+
}
71+
3372
/// Compare both strings with regard to the current locale.
3473
pub fn locale_cmp(left: &[u8], right: &[u8]) -> Ordering {
3574
// If the detected locale is 'C', just do byte-wise comparison

src/uucore/src/lib/features/i18n/mod.rs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@ pub enum UEncoding {
2020
Utf8,
2121
}
2222

23-
const DEFAULT_LOCALE: Locale = locale!("en-US-posix");
23+
// Use "und" (the BCP 47 "undetermined" language tag) as the marker for C/POSIX locale
24+
// This ensures real locales like "en-US" won't match
25+
const DEFAULT_LOCALE: Locale = locale!("und");
2426

2527
/// Look at 3 environment variables in the following order
2628
///
@@ -38,6 +40,11 @@ fn get_locale_from_env(locale_name: &str) -> (Locale, UEncoding) {
3840
let mut split = locale_var_str.split(&['.', '@']);
3941

4042
if let Some(simple) = split.next() {
43+
// Handle explicit C and POSIX locales - these should always use byte comparison
44+
if simple == "C" || simple == "POSIX" {
45+
return (DEFAULT_LOCALE, UEncoding::Ascii);
46+
}
47+
4148
// Naively convert the locale name to BCP47 tag format.
4249
//
4350
// See https://en.wikipedia.org/wiki/IETF_language_tag

tests/by-util/test_sort.rs

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2463,4 +2463,58 @@ fn test_start_buffer() {
24632463
.stdout_only_bytes(&expected);
24642464
}
24652465

2466+
#[test]
2467+
fn test_locale_collation_c_locale() {
2468+
// C locale uses byte order - this is deterministic and tests the fix for #9148
2469+
// Accented characters (UTF-8 multibyte) sort after ASCII letters
2470+
let input = "é\ne\nE\na\nA\nz\n";
2471+
// C locale byte order: A=0x41, E=0x45, a=0x61, e=0x65, z=0x7A, é=0xC3 0xA9
2472+
let expected = "A\nE\na\ne\nz\né\n";
2473+
2474+
new_ucmd!()
2475+
.env("LC_ALL", "C")
2476+
.pipe_in(input)
2477+
.succeeds()
2478+
.stdout_is(expected);
2479+
}
2480+
2481+
#[test]
2482+
fn test_locale_collation_utf8() {
2483+
// Test French UTF-8 locale handling - behavior depends on i18n-collator feature
2484+
// With feature: locale-aware collation (é sorts near e)
2485+
// Without feature: byte order (é after z, since its first UTF-8 byte 0xC3 > 0x7A)
2486+
let input = "z\né\ne\na\n";
2487+
2488+
let result = new_ucmd!()
2489+
.env("LC_ALL", "fr_FR.UTF-8")
2490+
.pipe_in(input)
2491+
.succeeds();
2492+
2493+
let output = result.stdout_str();
2494+
let lines: Vec<&str> = output.lines().collect();
2495+
2496+
assert_eq!(lines.len(), 4, "Expected 4 sorted lines");
2497+
assert_eq!(lines[0], "a", "'a' (0x61) should always sort first");
2498+
2499+
// Validate based on which collation mode is active
2500+
if lines[3] == "é" {
2501+
// Byte order mode: é (0xC3A9) > z (0x7A)
2502+
assert_eq!(
2503+
lines,
2504+
vec!["a", "e", "z", "é"],
2505+
"Byte order mode: expected a < e < z < é"
2506+
);
2507+
} else {
2508+
// Locale collation mode: é sorts with base letter e
2509+
assert_eq!(lines[3], "z", "Locale mode: 'z' should sort last");
2510+
let z_pos = lines.iter().position(|&x| x == "z").unwrap();
2511+
let e_pos = lines.iter().position(|&x| x == "e").unwrap();
2512+
let e_accent_pos = lines.iter().position(|&x| x == "é").unwrap();
2513+
assert!(
2514+
e_pos < z_pos && e_accent_pos < z_pos,
2515+
"Locale mode: 'e' ({e_pos}) and 'é' ({e_accent_pos}) should sort before 'z' ({z_pos})"
2516+
);
2517+
}
2518+
}
2519+
24662520
/* spell-checker: enable */

0 commit comments

Comments
 (0)