Skip to content

Commit 9086147

Browse files
committed
feat(sort): add locale-aware numeric sorting support
Implement NumericLocaleSettings to handle thousands separators and decimal points based on locale. Update tokenization logic to accommodate blank thousands separators for numeric and human-numeric modes, ensuring proper parsing of numbers with locale-specific formatting. This enhances compatibility with international number representations.
1 parent a4701f4 commit 9086147

File tree

1 file changed

+169
-21
lines changed

1 file changed

+169
-21
lines changed

src/uu/sort/src/sort.rs

Lines changed: 169 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,11 @@ use uucore::version_cmp::version_cmp;
5757
use crate::buffer_hint::automatic_buffer_size;
5858
use crate::tmp_dir::TmpDirWrapper;
5959

60+
#[cfg(unix)]
61+
use nix::libc;
62+
#[cfg(unix)]
63+
use std::ffi::CStr;
64+
6065
mod options {
6166
pub mod modes {
6267
pub const SORT: &str = "sort";
@@ -294,9 +299,35 @@ pub struct GlobalSettings {
294299
buffer_size_is_explicit: bool,
295300
compress_prog: Option<String>,
296301
merge_batch_size: usize,
302+
numeric_locale: NumericLocaleSettings,
297303
precomputed: Precomputed,
298304
}
299305

306+
/// Locale-dependent punctuation used when parsing numeric sort keys.
///
/// Both fields hold a single byte; a field is `None` when no byte is
/// configured for that role.
#[derive(Clone, Copy, Debug)]
struct NumericLocaleSettings {
    /// Byte that groups digits (the thousands separator), if any.
    thousands_sep: Option<u8>,
    /// Byte that separates the integer part from the fraction, if any.
    decimal_pt: Option<u8>,
}
311+
312+
impl Default for NumericLocaleSettings {
313+
fn default() -> Self {
314+
Self {
315+
thousands_sep: None,
316+
decimal_pt: Some(DECIMAL_PT),
317+
}
318+
}
319+
}
320+
321+
impl NumericLocaleSettings {
322+
fn num_info_settings(&self, accept_si_units: bool) -> NumInfoParseSettings {
323+
NumInfoParseSettings {
324+
accept_si_units,
325+
thousands_separator: self.thousands_sep,
326+
decimal_pt: self.decimal_pt,
327+
}
328+
}
329+
}
330+
300331
/// Data needed for sorting. Should be computed once before starting to sort
301332
/// by calling `GlobalSettings::init_precomputed`.
302333
#[derive(Clone, Debug, Default)]
@@ -307,6 +338,8 @@ struct Precomputed {
307338
selections_per_line: usize,
308339
fast_lexicographic: bool,
309340
fast_ascii_insensitive: bool,
341+
tokenize_blank_thousands_sep: bool,
342+
tokenize_allow_unit_after_blank: bool,
310343
}
311344

312345
impl GlobalSettings {
@@ -348,6 +381,20 @@ impl GlobalSettings {
348381
.filter(|s| matches!(s.settings.mode, SortMode::GeneralNumeric))
349382
.count();
350383

384+
let uses_numeric = self
385+
.selectors
386+
.iter()
387+
.any(|s| matches!(s.settings.mode, SortMode::Numeric | SortMode::HumanNumeric));
388+
let uses_human_numeric = self
389+
.selectors
390+
.iter()
391+
.any(|s| matches!(s.settings.mode, SortMode::HumanNumeric));
392+
self.precomputed.tokenize_blank_thousands_sep = self.separator.is_none()
393+
&& uses_numeric
394+
&& self.numeric_locale.thousands_sep == Some(b' ');
395+
self.precomputed.tokenize_allow_unit_after_blank =
396+
self.precomputed.tokenize_blank_thousands_sep && uses_human_numeric;
397+
351398
self.precomputed.fast_lexicographic = self.can_use_fast_lexicographic();
352399
self.precomputed.fast_ascii_insensitive = self.can_use_fast_ascii_insensitive();
353400
}
@@ -415,6 +462,7 @@ impl Default for GlobalSettings {
415462
buffer_size_is_explicit: false,
416463
compress_prog: None,
417464
merge_batch_size: default_merge_batch_size(),
465+
numeric_locale: NumericLocaleSettings::default(),
418466
precomputed: Precomputed::default(),
419467
}
420468
}
@@ -524,7 +572,12 @@ impl<'a> Line<'a> {
524572
) -> Self {
525573
token_buffer.clear();
526574
if settings.precomputed.needs_tokens {
527-
tokenize(line, settings.separator, token_buffer);
575+
tokenize(
576+
line,
577+
settings.separator,
578+
token_buffer,
579+
&settings.precomputed,
580+
);
528581
}
529582
if settings.mode == SortMode::Numeric {
530583
// exclude inf, nan, scientific notation
@@ -534,11 +587,12 @@ impl<'a> Line<'a> {
534587
.and_then(|s| s.parse::<f64>().ok());
535588
line_data.line_num_floats.push(line_num_float);
536589
}
537-
for (selector, selection) in settings
538-
.selectors
539-
.iter()
540-
.map(|selector| (selector, selector.get_selection(line, token_buffer)))
541-
{
590+
for (selector, selection) in settings.selectors.iter().map(|selector| {
591+
(
592+
selector,
593+
selector.get_selection(line, token_buffer, &settings.numeric_locale),
594+
)
595+
}) {
542596
match selection {
543597
Selection::AsBigDecimal(parsed_float) => line_data.parsed_floats.push(parsed_float),
544598
Selection::WithNumInfo(str, num_info) => {
@@ -587,18 +641,22 @@ impl<'a> Line<'a> {
587641
writeln!(writer)?;
588642

589643
let mut fields = vec![];
590-
tokenize(self.line, settings.separator, &mut fields);
644+
tokenize(
645+
self.line,
646+
settings.separator,
647+
&mut fields,
648+
&settings.precomputed,
649+
);
591650
for selector in &settings.selectors {
592651
let mut selection = selector.get_range(self.line, Some(&fields));
593652
match selector.settings.mode {
594653
SortMode::Numeric | SortMode::HumanNumeric => {
595654
// find out which range is used for numeric comparisons
596655
let (_, num_range) = NumInfo::parse(
597656
&self.line[selection.clone()],
598-
&NumInfoParseSettings {
599-
accept_si_units: selector.settings.mode == SortMode::HumanNumeric,
600-
..Default::default()
601-
},
657+
&settings
658+
.numeric_locale
659+
.num_info_settings(selector.settings.mode == SortMode::HumanNumeric),
602660
);
603661
let initial_selection = selection.clone();
604662

@@ -716,24 +774,50 @@ impl<'a> Line<'a> {
716774
}
717775

718776
/// Tokenize a line into fields. The result is stored into `token_buffer`.
719-
fn tokenize(line: &[u8], separator: Option<u8>, token_buffer: &mut Vec<Field>) {
777+
fn tokenize(
778+
line: &[u8],
779+
separator: Option<u8>,
780+
token_buffer: &mut Vec<Field>,
781+
precomputed: &Precomputed,
782+
) {
720783
assert!(token_buffer.is_empty());
721784
if let Some(separator) = separator {
722785
tokenize_with_separator(line, separator, token_buffer);
723786
} else {
724-
tokenize_default(line, token_buffer);
787+
tokenize_default(
788+
line,
789+
token_buffer,
790+
precomputed.tokenize_blank_thousands_sep,
791+
precomputed.tokenize_allow_unit_after_blank,
792+
);
725793
}
726794
}
727795

728796
/// By default fields are separated by the first whitespace after non-whitespace.
729797
/// Whitespace is included in fields at the start.
730798
/// The result is stored into `token_buffer`.
731-
fn tokenize_default(line: &[u8], token_buffer: &mut Vec<Field>) {
799+
fn tokenize_default(
800+
line: &[u8],
801+
token_buffer: &mut Vec<Field>,
802+
blank_thousands_sep: bool,
803+
allow_unit_after_blank: bool,
804+
) {
732805
token_buffer.push(0..0);
733806
// pretend that there was whitespace in front of the line
734807
let mut previous_was_whitespace = true;
735808
for (idx, char) in line.iter().enumerate() {
736-
if char.is_ascii_whitespace() {
809+
let is_whitespace = char.is_ascii_whitespace();
810+
let treat_as_separator = if is_whitespace {
811+
if blank_thousands_sep && *char == b' ' {
812+
!is_blank_thousands_sep(line, idx, allow_unit_after_blank)
813+
} else {
814+
true
815+
}
816+
} else {
817+
false
818+
};
819+
820+
if treat_as_separator {
737821
if !previous_was_whitespace {
738822
token_buffer.last_mut().unwrap().end = idx;
739823
token_buffer.push(idx..0);
@@ -746,6 +830,31 @@ fn tokenize_default(line: &[u8], token_buffer: &mut Vec<Field>) {
746830
token_buffer.last_mut().unwrap().end = line.len();
747831
}
748832

833+
/// Reports whether the space at `line[idx]` sits inside a number and should
/// therefore be kept as a thousands separator rather than end the field.
///
/// The byte at `idx` must be a space that directly follows an ASCII digit.
/// It counts as a separator when the byte after it is an ASCII digit, or,
/// if `allow_unit_after_blank` is set, one of the unit letters
/// `K k M G T P E Z Y R Q`.
///
/// Returns `false` for any other byte at `idx`, including an out-of-range
/// `idx`, and for a space at the start or end of `line`.
fn is_blank_thousands_sep(line: &[u8], idx: usize, allow_unit_after_blank: bool) -> bool {
    if line.get(idx) != Some(&b' ') {
        return false;
    }

    // `checked_sub` covers `idx == 0`: a leading space has no digit before it.
    let prev_is_digit = idx
        .checked_sub(1)
        .and_then(|prev_idx| line.get(prev_idx))
        .is_some_and(u8::is_ascii_digit);
    if !prev_is_digit {
        return false;
    }

    match line.get(idx + 1).copied() {
        Some(next) if next.is_ascii_digit() => true,
        // A unit letter after the blank only counts when the caller allows it.
        Some(b'K' | b'k' | b'M' | b'G' | b'T' | b'P' | b'E' | b'Z' | b'Y' | b'R' | b'Q') => {
            allow_unit_after_blank
        }
        _ => false,
    }
}
857+
749858
/// Split between separators. These separators are not included in fields.
750859
/// The result is stored into `token_buffer`.
751860
fn tokenize_with_separator(line: &[u8], separator: u8, token_buffer: &mut Vec<Field>) {
@@ -943,7 +1052,12 @@ impl FieldSelector {
9431052

9441053
/// Get the selection that corresponds to this selector for the line.
9451054
/// If `needs_fields` returned false, tokens may be empty.
946-
fn get_selection<'a>(&self, line: &'a [u8], tokens: &[Field]) -> Selection<'a> {
1055+
fn get_selection<'a>(
1056+
&self,
1057+
line: &'a [u8],
1058+
tokens: &[Field],
1059+
numeric_locale: &NumericLocaleSettings,
1060+
) -> Selection<'a> {
9471061
// `get_range` expects `None` when we don't need tokens and would get confused by an empty vector.
9481062
let tokens = if self.needs_tokens {
9491063
Some(tokens)
@@ -955,10 +1069,7 @@ impl FieldSelector {
9551069
// Parse NumInfo for this number.
9561070
let (info, num_range) = NumInfo::parse(
9571071
range_str,
958-
&NumInfoParseSettings {
959-
accept_si_units: self.settings.mode == SortMode::HumanNumeric,
960-
..Default::default()
961-
},
1072+
&numeric_locale.num_info_settings(self.settings.mode == SortMode::HumanNumeric),
9621073
);
9631074
// Shorten the range to what we need to pass to numeric_str_cmp later.
9641075
range_str = &range_str[num_range];
@@ -1067,6 +1178,41 @@ impl FieldSelector {
10671178
}
10681179
}
10691180

1181+
#[cfg(unix)]
1182+
fn detect_numeric_locale() -> NumericLocaleSettings {
1183+
unsafe {
1184+
libc::setlocale(libc::LC_NUMERIC, c"".as_ptr());
1185+
let mut settings = NumericLocaleSettings::default();
1186+
let conv = libc::localeconv();
1187+
if conv.is_null() {
1188+
return settings;
1189+
}
1190+
1191+
let decimal_ptr = (*conv).decimal_point;
1192+
if !decimal_ptr.is_null() {
1193+
let decimal_point = CStr::from_ptr(decimal_ptr).to_bytes();
1194+
if decimal_point.len() == 1 {
1195+
settings.decimal_pt = Some(decimal_point[0]);
1196+
}
1197+
}
1198+
1199+
let thousands_ptr = (*conv).thousands_sep;
1200+
if !thousands_ptr.is_null() {
1201+
let thousands_sep = CStr::from_ptr(thousands_ptr).to_bytes();
1202+
if thousands_sep.len() == 1 {
1203+
settings.thousands_sep = Some(thousands_sep[0]);
1204+
}
1205+
}
1206+
1207+
settings
1208+
}
1209+
}
1210+
1211+
/// Fallback for non-Unix targets: no locale lookup is performed and the
/// default numeric settings are returned.
#[cfg(not(unix))]
fn detect_numeric_locale() -> NumericLocaleSettings {
    NumericLocaleSettings::default()
}
1215+
10701216
/// Creates an `Arg` that conflicts with all other sort modes.
10711217
fn make_sort_mode_arg(mode: &'static str, short: char, help: String) -> Arg {
10721218
Arg::new(mode)
@@ -1275,6 +1421,7 @@ fn default_merge_batch_size() -> usize {
12751421
#[allow(clippy::cognitive_complexity)]
12761422
pub fn uumain(args: impl uucore::Args) -> UResult<()> {
12771423
let mut settings = GlobalSettings::default();
1424+
settings.numeric_locale = detect_numeric_locale();
12781425

12791426
let matches = uucore::clap_localization::handle_clap_result_with_exit_code(
12801427
uu_app(),
@@ -2278,7 +2425,8 @@ mod tests {
22782425

22792426
/// Test helper: tokenizes `line` with default `Precomputed` settings and
/// returns the resulting fields.
fn tokenize_helper(line: &[u8], separator: Option<u8>) -> Vec<Field> {
    let mut buffer = vec![];
    let precomputed = Precomputed::default();
    tokenize(line, separator, &mut buffer, &precomputed);
    buffer
}
22842432

0 commit comments

Comments
 (0)