@@ -57,6 +57,11 @@ use uucore::version_cmp::version_cmp;
5757use crate :: buffer_hint:: automatic_buffer_size;
5858use crate :: tmp_dir:: TmpDirWrapper ;
5959
60+ #[ cfg( unix) ]
61+ use nix:: libc;
62+ #[ cfg( unix) ]
63+ use std:: ffi:: CStr ;
64+
6065mod options {
6166 pub mod modes {
6267 pub const SORT : & str = "sort" ;
@@ -294,9 +299,35 @@ pub struct GlobalSettings {
294299 buffer_size_is_explicit : bool ,
295300 compress_prog : Option < String > ,
296301 merge_batch_size : usize ,
302+ numeric_locale : NumericLocaleSettings ,
297303 precomputed : Precomputed ,
298304}
299305
/// Locale-dependent punctuation used when parsing numeric sort keys.
///
/// Both fields hold a single byte, so only one-byte punctuation can be
/// represented.
#[derive(Debug, Clone, Copy)]
struct NumericLocaleSettings {
    /// Digit-grouping byte, or `None` when no grouping byte is in use.
    thousands_sep: Option<u8>,
    /// Byte that separates the integral and fractional parts of a number.
    decimal_pt: Option<u8>,
}
311+
312+ impl Default for NumericLocaleSettings {
313+ fn default ( ) -> Self {
314+ Self {
315+ thousands_sep : None ,
316+ decimal_pt : Some ( DECIMAL_PT ) ,
317+ }
318+ }
319+ }
320+
321+ impl NumericLocaleSettings {
322+ fn num_info_settings ( & self , accept_si_units : bool ) -> NumInfoParseSettings {
323+ NumInfoParseSettings {
324+ accept_si_units,
325+ thousands_separator : self . thousands_sep ,
326+ decimal_pt : self . decimal_pt ,
327+ }
328+ }
329+ }
330+
300331/// Data needed for sorting. Should be computed once before starting to sort
301332/// by calling `GlobalSettings::init_precomputed`.
302333#[ derive( Clone , Debug , Default ) ]
@@ -307,6 +338,8 @@ struct Precomputed {
307338 selections_per_line : usize ,
308339 fast_lexicographic : bool ,
309340 fast_ascii_insensitive : bool ,
341+ tokenize_blank_thousands_sep : bool ,
342+ tokenize_allow_unit_after_blank : bool ,
310343}
311344
312345impl GlobalSettings {
@@ -348,6 +381,20 @@ impl GlobalSettings {
348381 . filter ( |s| matches ! ( s. settings. mode, SortMode :: GeneralNumeric ) )
349382 . count ( ) ;
350383
384+ let uses_numeric = self
385+ . selectors
386+ . iter ( )
387+ . any ( |s| matches ! ( s. settings. mode, SortMode :: Numeric | SortMode :: HumanNumeric ) ) ;
388+ let uses_human_numeric = self
389+ . selectors
390+ . iter ( )
391+ . any ( |s| matches ! ( s. settings. mode, SortMode :: HumanNumeric ) ) ;
392+ self . precomputed . tokenize_blank_thousands_sep = self . separator . is_none ( )
393+ && uses_numeric
394+ && self . numeric_locale . thousands_sep == Some ( b' ' ) ;
395+ self . precomputed . tokenize_allow_unit_after_blank =
396+ self . precomputed . tokenize_blank_thousands_sep && uses_human_numeric;
397+
351398 self . precomputed . fast_lexicographic = self . can_use_fast_lexicographic ( ) ;
352399 self . precomputed . fast_ascii_insensitive = self . can_use_fast_ascii_insensitive ( ) ;
353400 }
@@ -415,6 +462,7 @@ impl Default for GlobalSettings {
415462 buffer_size_is_explicit : false ,
416463 compress_prog : None ,
417464 merge_batch_size : default_merge_batch_size ( ) ,
465+ numeric_locale : NumericLocaleSettings :: default ( ) ,
418466 precomputed : Precomputed :: default ( ) ,
419467 }
420468 }
@@ -524,7 +572,12 @@ impl<'a> Line<'a> {
524572 ) -> Self {
525573 token_buffer. clear ( ) ;
526574 if settings. precomputed . needs_tokens {
527- tokenize ( line, settings. separator , token_buffer) ;
575+ tokenize (
576+ line,
577+ settings. separator ,
578+ token_buffer,
579+ & settings. precomputed ,
580+ ) ;
528581 }
529582 if settings. mode == SortMode :: Numeric {
530583 // exclude inf, nan, scientific notation
@@ -534,11 +587,12 @@ impl<'a> Line<'a> {
534587 . and_then ( |s| s. parse :: < f64 > ( ) . ok ( ) ) ;
535588 line_data. line_num_floats . push ( line_num_float) ;
536589 }
537- for ( selector, selection) in settings
538- . selectors
539- . iter ( )
540- . map ( |selector| ( selector, selector. get_selection ( line, token_buffer) ) )
541- {
590+ for ( selector, selection) in settings. selectors . iter ( ) . map ( |selector| {
591+ (
592+ selector,
593+ selector. get_selection ( line, token_buffer, & settings. numeric_locale ) ,
594+ )
595+ } ) {
542596 match selection {
543597 Selection :: AsBigDecimal ( parsed_float) => line_data. parsed_floats . push ( parsed_float) ,
544598 Selection :: WithNumInfo ( str, num_info) => {
@@ -587,18 +641,22 @@ impl<'a> Line<'a> {
587641 writeln ! ( writer) ?;
588642
589643 let mut fields = vec ! [ ] ;
590- tokenize ( self . line , settings. separator , & mut fields) ;
644+ tokenize (
645+ self . line ,
646+ settings. separator ,
647+ & mut fields,
648+ & settings. precomputed ,
649+ ) ;
591650 for selector in & settings. selectors {
592651 let mut selection = selector. get_range ( self . line , Some ( & fields) ) ;
593652 match selector. settings . mode {
594653 SortMode :: Numeric | SortMode :: HumanNumeric => {
595654 // find out which range is used for numeric comparisons
596655 let ( _, num_range) = NumInfo :: parse (
597656 & self . line [ selection. clone ( ) ] ,
598- & NumInfoParseSettings {
599- accept_si_units : selector. settings . mode == SortMode :: HumanNumeric ,
600- ..Default :: default ( )
601- } ,
657+ & settings
658+ . numeric_locale
659+ . num_info_settings ( selector. settings . mode == SortMode :: HumanNumeric ) ,
602660 ) ;
603661 let initial_selection = selection. clone ( ) ;
604662
@@ -716,24 +774,50 @@ impl<'a> Line<'a> {
716774}
717775
718776/// Tokenize a line into fields. The result is stored into `token_buffer`.
719- fn tokenize ( line : & [ u8 ] , separator : Option < u8 > , token_buffer : & mut Vec < Field > ) {
777+ fn tokenize (
778+ line : & [ u8 ] ,
779+ separator : Option < u8 > ,
780+ token_buffer : & mut Vec < Field > ,
781+ precomputed : & Precomputed ,
782+ ) {
720783 assert ! ( token_buffer. is_empty( ) ) ;
721784 if let Some ( separator) = separator {
722785 tokenize_with_separator ( line, separator, token_buffer) ;
723786 } else {
724- tokenize_default ( line, token_buffer) ;
787+ tokenize_default (
788+ line,
789+ token_buffer,
790+ precomputed. tokenize_blank_thousands_sep ,
791+ precomputed. tokenize_allow_unit_after_blank ,
792+ ) ;
725793 }
726794}
727795
728796/// By default fields are separated by the first whitespace after non-whitespace.
729797/// Whitespace is included in fields at the start.
730798/// The result is stored into `token_buffer`.
731- fn tokenize_default ( line : & [ u8 ] , token_buffer : & mut Vec < Field > ) {
799+ fn tokenize_default (
800+ line : & [ u8 ] ,
801+ token_buffer : & mut Vec < Field > ,
802+ blank_thousands_sep : bool ,
803+ allow_unit_after_blank : bool ,
804+ ) {
732805 token_buffer. push ( 0 ..0 ) ;
733806 // pretend that there was whitespace in front of the line
734807 let mut previous_was_whitespace = true ;
735808 for ( idx, char) in line. iter ( ) . enumerate ( ) {
736- if char. is_ascii_whitespace ( ) {
809+ let is_whitespace = char. is_ascii_whitespace ( ) ;
810+ let treat_as_separator = if is_whitespace {
811+ if blank_thousands_sep && * char == b' ' {
812+ !is_blank_thousands_sep ( line, idx, allow_unit_after_blank)
813+ } else {
814+ true
815+ }
816+ } else {
817+ false
818+ } ;
819+
820+ if treat_as_separator {
737821 if !previous_was_whitespace {
738822 token_buffer. last_mut ( ) . unwrap ( ) . end = idx;
739823 token_buffer. push ( idx..0 ) ;
@@ -746,6 +830,31 @@ fn tokenize_default(line: &[u8], token_buffer: &mut Vec<Field>) {
746830 token_buffer. last_mut ( ) . unwrap ( ) . end = line. len ( ) ;
747831}
748832
/// Decide whether the space at `line[idx]` sits inside a number (acting as a
/// thousands separator) rather than between two fields.
///
/// Returns `true` only if all of the following hold:
/// * `line[idx]` exists and is an ASCII space,
/// * the byte directly before it is an ASCII digit,
/// * the byte directly after it is an ASCII digit, or, when
///   `allow_unit_after_blank` is set, one of the unit letters
///   `K k M G T P E Z Y R Q`.
fn is_blank_thousands_sep(line: &[u8], idx: usize, allow_unit_after_blank: bool) -> bool {
    // Out-of-range positions and non-space bytes are never separators.
    if line.get(idx).copied() != Some(b' ') {
        return false;
    }

    // `idx` is in bounds here, so `idx - 1` is too whenever `idx > 0`.
    let follows_digit = idx > 0 && line[idx - 1].is_ascii_digit();
    if !follows_digit {
        return false;
    }

    match line.get(idx + 1).copied() {
        None => false,
        Some(following) if following.is_ascii_digit() => true,
        Some(following) => {
            allow_unit_after_blank
                && matches!(
                    following,
                    b'K' | b'k' | b'M' | b'G' | b'T' | b'P' | b'E' | b'Z' | b'Y' | b'R' | b'Q'
                )
        }
    }
}
857+
749858/// Split between separators. These separators are not included in fields.
750859/// The result is stored into `token_buffer`.
751860fn tokenize_with_separator ( line : & [ u8 ] , separator : u8 , token_buffer : & mut Vec < Field > ) {
@@ -943,7 +1052,12 @@ impl FieldSelector {
9431052
9441053 /// Get the selection that corresponds to this selector for the line.
9451054 /// If `needs_fields` returned false, tokens may be empty.
946- fn get_selection < ' a > ( & self , line : & ' a [ u8 ] , tokens : & [ Field ] ) -> Selection < ' a > {
1055+ fn get_selection < ' a > (
1056+ & self ,
1057+ line : & ' a [ u8 ] ,
1058+ tokens : & [ Field ] ,
1059+ numeric_locale : & NumericLocaleSettings ,
1060+ ) -> Selection < ' a > {
9471061 // `get_range` expects `None` when we don't need tokens and would get confused by an empty vector.
9481062 let tokens = if self . needs_tokens {
9491063 Some ( tokens)
@@ -955,10 +1069,7 @@ impl FieldSelector {
9551069 // Parse NumInfo for this number.
9561070 let ( info, num_range) = NumInfo :: parse (
9571071 range_str,
958- & NumInfoParseSettings {
959- accept_si_units : self . settings . mode == SortMode :: HumanNumeric ,
960- ..Default :: default ( )
961- } ,
1072+ & numeric_locale. num_info_settings ( self . settings . mode == SortMode :: HumanNumeric ) ,
9621073 ) ;
9631074 // Shorten the range to what we need to pass to numeric_str_cmp later.
9641075 range_str = & range_str[ num_range] ;
@@ -1067,6 +1178,41 @@ impl FieldSelector {
10671178 }
10681179}
10691180
1181+ #[ cfg( unix) ]
1182+ fn detect_numeric_locale ( ) -> NumericLocaleSettings {
1183+ unsafe {
1184+ libc:: setlocale ( libc:: LC_NUMERIC , c"" . as_ptr ( ) ) ;
1185+ let mut settings = NumericLocaleSettings :: default ( ) ;
1186+ let conv = libc:: localeconv ( ) ;
1187+ if conv. is_null ( ) {
1188+ return settings;
1189+ }
1190+
1191+ let decimal_ptr = ( * conv) . decimal_point ;
1192+ if !decimal_ptr. is_null ( ) {
1193+ let decimal_point = CStr :: from_ptr ( decimal_ptr) . to_bytes ( ) ;
1194+ if decimal_point. len ( ) == 1 {
1195+ settings. decimal_pt = Some ( decimal_point[ 0 ] ) ;
1196+ }
1197+ }
1198+
1199+ let thousands_ptr = ( * conv) . thousands_sep ;
1200+ if !thousands_ptr. is_null ( ) {
1201+ let thousands_sep = CStr :: from_ptr ( thousands_ptr) . to_bytes ( ) ;
1202+ if thousands_sep. len ( ) == 1 {
1203+ settings. thousands_sep = Some ( thousands_sep[ 0 ] ) ;
1204+ }
1205+ }
1206+
1207+ settings
1208+ }
1209+ }
1210+
/// Fallback for non-Unix targets: no platform query is made and the default
/// numeric settings are returned.
#[cfg(not(unix))]
fn detect_numeric_locale() -> NumericLocaleSettings {
    Default::default()
}
1215+
10701216/// Creates an `Arg` that conflicts with all other sort modes.
10711217fn make_sort_mode_arg ( mode : & ' static str , short : char , help : String ) -> Arg {
10721218 Arg :: new ( mode)
@@ -1275,6 +1421,7 @@ fn default_merge_batch_size() -> usize {
12751421#[ allow( clippy:: cognitive_complexity) ]
12761422pub fn uumain ( args : impl uucore:: Args ) -> UResult < ( ) > {
12771423 let mut settings = GlobalSettings :: default ( ) ;
1424+ settings. numeric_locale = detect_numeric_locale ( ) ;
12781425
12791426 let matches = uucore:: clap_localization:: handle_clap_result_with_exit_code (
12801427 uu_app ( ) ,
@@ -2278,7 +2425,8 @@ mod tests {
22782425
22792426 fn tokenize_helper ( line : & [ u8 ] , separator : Option < u8 > ) -> Vec < Field > {
22802427 let mut buffer = vec ! [ ] ;
2281- tokenize ( line, separator, & mut buffer) ;
2428+ let precomputed = Precomputed :: default ( ) ;
2429+ tokenize ( line, separator, & mut buffer, & precomputed) ;
22822430 buffer
22832431 }
22842432
0 commit comments