diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock index 2b519a989f3..a227d0d3819 100644 --- a/fuzz/Cargo.lock +++ b/fuzz/Cargo.lock @@ -53,7 +53,7 @@ version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" dependencies = [ - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -64,7 +64,7 @@ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -103,9 +103,9 @@ dependencies = [ [[package]] name = "bigdecimal" -version = "0.4.9" +version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "560f42649de9fa436b73517378a147ec21f6c997a546581df4b4b31677828934" +checksum = "4d6867f1565b3aad85681f1015055b087fcfd840d6aeee6eee7f2da317603695" dependencies = [ "autocfg", "libm", @@ -184,9 +184,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.19.0" +version = "3.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" +checksum = "5dd9dc738b7a8311c7ade152424974d8115f2cdad61e8dab8dac9f2362298510" [[package]] name = "bytecount" @@ -196,9 +196,9 @@ checksum = "175812e0be2bccb6abe50bb8d566126198344f707e304f45c648fd8f2cc0365e" [[package]] name = "cc" -version = "1.2.48" +version = "1.2.51" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c481bdbf0ed3b892f6f806287d72acd515b352a4ec27a208489b8c1bc839633a" +checksum = "7a0aeaff4ff1a90589618835a598e545176939b97874f7abc7851caa0618f203" dependencies = [ "find-msvc-tools", "jobserver", @@ -231,18 +231,18 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.53" +version = "4.5.54" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"c9e340e012a1bf4935f5282ed1436d1489548e8f72308207ea5df0e23d2d03f8" +checksum = "c6e6ff9dcd79cff5cd969a17a545d79e84ab086e444102a591e288a8aa3ce394" dependencies = [ "clap_builder", ] [[package]] name = "clap_builder" -version = "4.5.53" +version = "4.5.54" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d76b5d13eaa18c901fd2f7fca939fefe3a0727a953561fefdf3b2922b8569d00" +checksum = "fa42cf4d2b7a41bc8f663a7cab4031ebafa1bf3875705bfaf8466dc60ab52c00" dependencies = [ "anstream", "anstyle", @@ -323,30 +323,13 @@ dependencies = [ "libc", ] -[[package]] -name = "crc" -version = "3.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5eb8a2a1cd12ab0d987a5d5e825195d372001a4094a0376319d5a0ad71c1ba0d" -dependencies = [ - "crc-catalog", -] - -[[package]] -name = "crc-catalog" -version = "2.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" - [[package]] name = "crc-fast" -version = "1.8.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2f7c8d397a6353ef0c1d6217ab91b3ddb5431daf57fd013f506b967dcf44458" +checksum = "e75b2483e97a5a7da73ac68a05b629f9c53cff58d8ed1c77866079e18b00dba5" dependencies = [ - "crc", "digest", - "rustversion", "spin", ] @@ -504,7 +487,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -515,9 +498,9 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "find-msvc-tools" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a3076410a55c90011c298b04d0cfa770b00fa04e1e3c97d3f6c9de105a03844" +checksum = "645cbb3a84e60b7531617d5ae4e57f7e27308f6445f5abf653209ea76dec8dff" 
[[package]] name = "flate2" @@ -753,9 +736,9 @@ checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a" [[package]] name = "icu_properties" -version = "2.1.1" +version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e93fcd3157766c0c8da2f8cff6ce651a31f0810eaa1c51ec363ef790bbb5fb99" +checksum = "020bfc02fe870ec3a66d93e677ccca0562506e5872c650f893269e08615d74ec" dependencies = [ "icu_collections", "icu_locale_core", @@ -767,9 +750,9 @@ dependencies = [ [[package]] name = "icu_properties_data" -version = "2.1.1" +version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02845b3647bb045f1100ecd6480ff52f34c35f82d9880e029d329c21d1054899" +checksum = "616c294cf8d725c6afcd8f55abc17c56464ef6211f9ed59cccffe534129c77af" [[package]] name = "icu_provider" @@ -824,9 +807,9 @@ dependencies = [ [[package]] name = "jiff" -version = "0.2.16" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49cce2b81f2098e7e3efc35bc2e0a6b7abec9d34128283d7a26fa8f32a6dbb35" +checksum = "a87d9b8105c23642f50cbbae03d1f75d8422c5cb98ce7ee9271f7ff7505be6b8" dependencies = [ "jiff-static", "jiff-tzdb-platform", @@ -834,14 +817,14 @@ dependencies = [ "portable-atomic", "portable-atomic-util", "serde_core", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] name = "jiff-static" -version = "0.2.16" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "980af8b43c3ad5d8d349ace167ec8170839f753a42d233ba19e08afe1850fa69" +checksum = "b787bebb543f8969132630c51fd0afab173a86c6abae56ff3b9e5e3e3f9f6e58" dependencies = [ "proc-macro2", "quote", @@ -850,9 +833,9 @@ dependencies = [ [[package]] name = "jiff-tzdb" -version = "0.1.4" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1283705eb0a21404d2bfd6eef2a7593d240bc42a0bdb39db0ad6fa2ec026524" +checksum = 
"68971ebff725b9e2ca27a601c5eb38a4c5d64422c4cbab0c535f248087eda5c2" [[package]] name = "jiff-tzdb-platform" @@ -894,9 +877,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.178" +version = "0.2.179" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37c93d8daa9d8a012fd8ab92f088405fb202ea0b6ab73ee2482ae66af4f42091" +checksum = "c5a2d376baa530d1238d133232d15e239abad80d05838b4b59354e5268af431f" [[package]] name = "libfuzzer-sys" @@ -1119,9 +1102,9 @@ checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" [[package]] name = "portable-atomic" -version = "1.11.1" +version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483" +checksum = "f89776e4d69bb58bc6993e99ffa1d11f228b839984854c7daeb5d37f87cbe950" [[package]] name = "portable-atomic-util" @@ -1154,9 +1137,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.103" +version = "1.0.104" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8" +checksum = "9695f8df41bb4f3d222c95a67532365f569318332d03d5f3f67f37b20e6ebdf0" dependencies = [ "unicode-ident", ] @@ -1273,15 +1256,15 @@ checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" [[package]] name = "rustix" -version = "1.1.2" +version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e" +checksum = "146c9e247ccc180c1f61615433868c99f3de3ae256a30a43b49f67c2d9171f34" dependencies = [ "bitflags", "errno", "libc", "linux-raw-sys", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -1292,9 +1275,9 @@ checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" [[package]] name = "self_cell" -version = "1.2.1" +version = "1.2.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "16c2f82143577edb4921b71ede051dac62ca3c16084e918bf7b40c96ae10eb33" +checksum = "b12e76d157a900eb52e81bc6e9f3069344290341720e9178cde2407113ac8d89" [[package]] name = "serde" @@ -1366,9 +1349,9 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "simd-adler32" -version = "0.3.7" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe" +checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" [[package]] name = "similar" @@ -1417,9 +1400,9 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "syn" -version = "2.0.111" +version = "2.0.113" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "390cc9a294ab71bdb1aa2e99d13be9c753cd2d7bd6560c77118597410c4d2e87" +checksum = "678faa00651c9eb72dd2020cbdf275d92eccb2400d568e419efdd64838145cb4" dependencies = [ "proc-macro2", "quote", @@ -1439,15 +1422,15 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.23.0" +version = "3.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16" +checksum = "655da9c7eb6305c55742045d5a8d2037996d61d8de95806335c7c86ce0f82e9c" dependencies = [ "fastrand", "getrandom 0.3.4", "once_cell", "rustix", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -1902,7 +1885,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] diff --git a/src/uu/sort/Cargo.toml b/src/uu/sort/Cargo.toml index 8a9570eaa30..9b149eeb498 100644 --- a/src/uu/sort/Cargo.toml +++ b/src/uu/sort/Cargo.toml @@ -34,7 +34,7 
@@ self_cell = { workspace = true } tempfile = { workspace = true } thiserror = { workspace = true } unicode-width = { workspace = true } -uucore = { workspace = true, features = ["fs", "parser-size", "version-cmp"] } +uucore = { workspace = true, features = ["fs", "parser-size", "version-cmp", "i18n-collator"] } fluent = { workspace = true } [target.'cfg(unix)'.dependencies] diff --git a/src/uu/sort/locales/en-US.ftl b/src/uu/sort/locales/en-US.ftl index a5c5d01b69b..366290cfa25 100644 --- a/src/uu/sort/locales/en-US.ftl +++ b/src/uu/sort/locales/en-US.ftl @@ -10,6 +10,11 @@ sort-after-help = The key format is FIELD[.CHAR][OPTIONS][,FIELD[.CHAR]][OPTIONS Valid options are: MbdfhnRrV. They override the global options for this key. + Locale-aware sorting: + The LC_ALL, LC_COLLATE, and LANG environment variables affect sorting order. + LC_ALL=C uses fast byte-wise comparison. Other locales use slower but correct Unicode collation. + For performance-critical scenarios with ASCII data, consider using LC_ALL=C. + # Error messages sort-open-failed = open failed: {$path}: {$error} sort-parse-key-error = failed to parse key {$key}: {$msg} diff --git a/src/uu/sort/locales/fr-FR.ftl b/src/uu/sort/locales/fr-FR.ftl index 4dbc05a49aa..f5121b6785c 100644 --- a/src/uu/sort/locales/fr-FR.ftl +++ b/src/uu/sort/locales/fr-FR.ftl @@ -10,6 +10,11 @@ sort-after-help = Le format de clé est CHAMP[.CAR][OPTIONS][,CHAMP[.CAR]][OPTIO Les options valides sont : MbdfhnRrV. Elles remplacent les options globales pour cette clé. + Tri selon la locale : + Les variables d'environnement LC_ALL, LC_COLLATE et LANG affectent l'ordre de tri. + LC_ALL=C utilise une comparaison rapide par octets. D'autres locales utilisent une collation Unicode plus lente mais correcte. + Pour des scénarios critiques en performance avec des données ASCII, considérez l'utilisation de LC_ALL=C. 
+ # Messages d'erreur sort-open-failed = échec d'ouverture : {$path} : {$error} sort-parse-key-error = échec d'analyse de la clé {$key} : {$msg} diff --git a/src/uu/sort/src/custom_str_cmp.rs b/src/uu/sort/src/custom_str_cmp.rs index aa4f73ea7bb..7f6b6b7b136 100644 --- a/src/uu/sort/src/custom_str_cmp.rs +++ b/src/uu/sort/src/custom_str_cmp.rs @@ -8,6 +8,7 @@ //! The goal is to compare strings without transforming them first (i.e. not allocating new strings) use std::cmp::Ordering; +use uucore::i18n::collator::locale_cmp; fn filter_char(c: u8, ignore_non_printing: bool, ignore_non_dictionary: bool) -> bool { if ignore_non_dictionary && !(c.is_ascii_alphanumeric() || c.is_ascii_whitespace()) { @@ -35,8 +36,8 @@ pub fn custom_str_cmp( ignore_case: bool, ) -> Ordering { if !(ignore_case || ignore_non_dictionary || ignore_non_printing) { - // There are no custom settings. Fall back to the default strcmp, which is faster. - return a.cmp(b); + // There are no custom settings. Fall back to locale-aware comparison. + return locale_cmp(a, b); } let mut a_chars = a .iter() diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs index 071163c5aee..7dd78bb750d 100644 --- a/src/uu/sort/src/sort.rs +++ b/src/uu/sort/src/sort.rs @@ -45,6 +45,7 @@ use uucore::error::{FromIo, strip_errno}; use uucore::error::{UError, UResult, USimpleError, UUsageError}; use uucore::extendedbigdecimal::ExtendedBigDecimal; use uucore::format_usage; +use uucore::i18n::collator::CollatorOptions; use uucore::line_ending::LineEnding; use uucore::parser::num_parser::{ExtendedParser, ExtendedParserError}; use uucore::parser::parse_size::{ParseSizeError, Parser}; @@ -354,6 +355,10 @@ impl GlobalSettings { /// Returns true when the fast lexicographic path can be used safely. 
fn can_use_fast_lexicographic(&self) -> bool { + // Fast path can only be used when locale is C (byte-wise comparison is correct) + if uucore::i18n::should_use_locale_collation() { + return false; + } self.mode == SortMode::Default && !self.ignore_case && !self.dictionary_order @@ -1271,6 +1276,28 @@ fn default_merge_batch_size() -> usize { } } +/// Report whether any sort key will be compared through the locale-aware collator. +/// +/// Always `false` when the collating locale is the C/POSIX default; otherwise `true` as +/// soon as one selector uses the default mode with no ignore-case, ignore-non-printing +/// or dictionary-order flag set. +fn will_need_locale_collation(settings: &GlobalSettings) -> bool { + // Reuse the predicate that `can_use_fast_lexicographic` consults, so the two + // checks cannot disagree about what counts as the C/POSIX locale. + if !uucore::i18n::should_use_locale_collation() { + return false; + } + + // True as soon as one selector would use the plain locale comparison. + settings.selectors.iter().any(|selector| { + let key_settings = &selector.settings; + key_settings.mode == SortMode::Default + && !key_settings.ignore_case + && !key_settings.ignore_non_printing + && !key_settings.dictionary_order + }) +} + #[uucore::main] #[allow(clippy::cognitive_complexity)] pub fn uumain(args: impl uucore::Args) -> UResult<()> { @@ -1571,6 +1598,11 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { settings.init_precomputed(); + // Initialize locale-aware collator only if needed for string comparisons + if will_need_locale_collation(&settings) { + uucore::i18n::collator::try_init_collator(CollatorOptions::default()); + } + let result = exec(&mut files, &settings, output, &mut tmp_dir); // Wait here if `SIGINT` was received, // for signal handler to do its work and terminate the program. 
diff --git a/src/uucore/src/lib/features/i18n/mod.rs b/src/uucore/src/lib/features/i18n/mod.rs index d47f2df9835..b8c0dad2c9b 100644 --- a/src/uucore/src/lib/features/i18n/mod.rs +++ b/src/uucore/src/lib/features/i18n/mod.rs @@ -20,7 +20,7 @@ pub enum UEncoding { Utf8, } -const DEFAULT_LOCALE: Locale = locale!("en-US-posix"); +pub const DEFAULT_LOCALE: Locale = locale!("en-US-posix"); /// Look at 3 environment variables in the following order /// @@ -64,12 +64,17 @@ fn get_locale_from_env(locale_name: &str) -> (Locale, UEncoding) { } /// Get the collating locale from the environment -fn get_collating_locale() -> &'static (Locale, UEncoding) { +pub fn get_collating_locale() -> &'static (Locale, UEncoding) { static COLLATING_LOCALE: OnceLock<(Locale, UEncoding)> = OnceLock::new(); COLLATING_LOCALE.get_or_init(|| get_locale_from_env("LC_COLLATE")) } +/// Returns `true` when the collating locale differs from `DEFAULT_LOCALE` (i.e., it is not the C/POSIX locale) +pub fn should_use_locale_collation() -> bool { + get_collating_locale().0 != DEFAULT_LOCALE +} + /// Get the numeric locale from the environment pub fn get_numeric_locale() -> &'static (Locale, UEncoding) { static NUMERIC_LOCALE: OnceLock<(Locale, UEncoding)> = OnceLock::new(); diff --git a/tests/by-util/test_sort.rs b/tests/by-util/test_sort.rs index 6330f759df0..32ccd164b4a 100644 --- a/tests/by-util/test_sort.rs +++ b/tests/by-util/test_sort.rs @@ -2359,4 +2359,24 @@ fn test_start_buffer() { .stdout_only_bytes(&expected); } +#[test] +fn test_locale_sorting_c() { + // With LC_ALL=C the expected order is by byte value: the multi-byte umlauts follow the ASCII letters + new_ucmd!() + .env("LC_ALL", "C") + .pipe_in("a\no\nu\nä\nö\nü\n") + .succeeds() + .stdout_is("a\no\nu\nä\nö\nü\n"); +} + +#[test] +fn test_locale_sorting_german() { + // With LC_ALL=de_DE.utf-8 the expected order follows German collation: each umlaut comes right after its base letter + new_ucmd!() + .env("LC_ALL", "de_DE.utf-8") + .pipe_in("a\no\nu\nä\nö\nü\n") + .succeeds() + .stdout_is("a\nä\no\nö\nu\nü\n"); +} + /* spell-checker: enable */