Skip to content

Commit c5c3a8e

Browse files
authored
Merge pull request #10079 from ChrisDryden/fix-numfmt-non-utf8
numfmt: support non-UTF8 delimiters for locales like GB18030
1 parent 0f18267 commit c5c3a8e

File tree

4 files changed

+120
-58
lines changed

4 files changed

+120
-58
lines changed

src/uu/numfmt/src/format.rs

Lines changed: 33 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -358,32 +358,56 @@ fn format_string(
358358
))
359359
}
360360

361-
fn format_and_print_delimited(s: &str, options: &NumfmtOptions) -> Result<()> {
361+
fn split_bytes<'a>(input: &'a [u8], delim: &'a [u8]) -> impl Iterator<Item = &'a [u8]> {
362+
let mut remainder = Some(input);
363+
std::iter::from_fn(move || {
364+
let input = remainder.take()?;
365+
match input.windows(delim.len()).position(|w| w == delim) {
366+
Some(pos) => {
367+
remainder = Some(&input[pos + delim.len()..]);
368+
Some(&input[..pos])
369+
}
370+
None => Some(input),
371+
}
372+
})
373+
}
374+
375+
pub fn format_and_print_delimited(input: &[u8], options: &NumfmtOptions) -> Result<()> {
362376
let delimiter = options.delimiter.as_ref().unwrap();
363-
let mut output = String::new();
377+
let mut output: Vec<u8> = Vec::new();
378+
let eol = if options.zero_terminated {
379+
b'\0'
380+
} else {
381+
b'\n'
382+
};
364383

365-
for (n, field) in (1..).zip(s.split(delimiter)) {
384+
for (n, field) in (1..).zip(split_bytes(input, delimiter)) {
366385
let field_selected = uucore::ranges::contain(&options.fields, n);
367386

368387
// add delimiter before second and subsequent fields
369388
if n > 1 {
370-
output.push_str(delimiter);
389+
output.extend_from_slice(delimiter);
371390
}
372391

373392
if field_selected {
374-
output.push_str(&format_string(field.trim_start(), options, None)?);
393+
// Field must be valid UTF-8 for numeric conversion
394+
let field_str = std::str::from_utf8(field)
395+
.map_err(|_| translate!("numfmt-error-invalid-number", "input" => String::from_utf8_lossy(field).into_owned().quote()))?
396+
.trim_start();
397+
let formatted = format_string(field_str, options, None)?;
398+
output.extend_from_slice(formatted.as_bytes());
375399
} else {
376400
// add unselected field without conversion
377-
output.push_str(field);
401+
output.extend_from_slice(field);
378402
}
379403
}
380404

381-
println!("{output}");
405+
output.push(eol);
406+
std::io::Write::write_all(&mut std::io::stdout(), &output).map_err(|e| e.to_string())?;
382407

383408
Ok(())
384409
}
385-
386-
fn format_and_print_whitespace(s: &str, options: &NumfmtOptions) -> Result<()> {
410+
pub fn format_and_print_whitespace(s: &str, options: &NumfmtOptions) -> Result<()> {
387411
let mut output = String::new();
388412

389413
for (n, (prefix, field)) in (1..).zip(WhitespaceSplitter { s: Some(s) }) {
@@ -428,18 +452,6 @@ fn format_and_print_whitespace(s: &str, options: &NumfmtOptions) -> Result<()> {
428452
Ok(())
429453
}
430454

431-
/// Format a line of text according to the selected options.
432-
///
433-
/// Given a line of text `s`, split the line into fields, transform and format
434-
/// any selected numeric fields, and print the result to stdout. Fields not
435-
/// selected for conversion are passed through unmodified.
436-
pub fn format_and_print(s: &str, options: &NumfmtOptions) -> Result<()> {
437-
match &options.delimiter {
438-
Some(_) => format_and_print_delimited(s, options),
439-
None => format_and_print_whitespace(s, options),
440-
}
441-
}
442-
443455
#[cfg(test)]
444456
mod tests {
445457
use super::*;

src/uu/numfmt/src/numfmt.rs

Lines changed: 67 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -4,17 +4,19 @@
44
// file that was distributed with this source code.
55

66
use crate::errors::*;
7-
use crate::format::format_and_print;
7+
use crate::format::{format_and_print_delimited, format_and_print_whitespace};
88
use crate::options::*;
99
use crate::units::{Result, Unit};
10-
use clap::{Arg, ArgAction, ArgMatches, Command, parser::ValueSource};
10+
use clap::{Arg, ArgAction, ArgMatches, Command, builder::ValueParser, parser::ValueSource};
11+
use std::ffi::OsString;
1112
use std::io::{BufRead, Error, Write};
1213
use std::result::Result as StdResult;
1314
use std::str::FromStr;
1415

1516
use units::{IEC_BASES, SI_BASES};
1617
use uucore::display::Quotable;
1718
use uucore::error::UResult;
19+
use uucore::os_str_as_bytes;
1820
use uucore::translate;
1921

2022
use uucore::parser::shortcut_value_parser::ShortcutValueParser;
@@ -26,7 +28,7 @@ pub mod format;
2628
pub mod options;
2729
mod units;
2830

29-
fn handle_args<'a>(args: impl Iterator<Item = &'a str>, options: &NumfmtOptions) -> UResult<()> {
31+
fn handle_args<'a>(args: impl Iterator<Item = &'a [u8]>, options: &NumfmtOptions) -> UResult<()> {
3032
for l in args {
3133
format_and_handle_validation(l, options)?;
3234
}
@@ -37,40 +39,45 @@ fn handle_buffer<R>(input: R, options: &NumfmtOptions) -> UResult<()>
3739
where
3840
R: BufRead,
3941
{
40-
if options.zero_terminated {
41-
handle_buffer_iterator(
42-
input
43-
.split(0)
44-
// FIXME: This panics on UTF8 decoding, but this util in general doesn't handle
45-
// invalid UTF8
46-
.map(|bytes| Ok(String::from_utf8(bytes?).unwrap())),
47-
options,
48-
)
49-
} else {
50-
handle_buffer_iterator(input.lines(), options)
51-
}
42+
let terminator = if options.zero_terminated { 0u8 } else { b'\n' };
43+
handle_buffer_iterator(input.split(terminator), options, terminator)
5244
}
5345

5446
fn handle_buffer_iterator(
55-
iter: impl Iterator<Item = StdResult<String, Error>>,
47+
iter: impl Iterator<Item = StdResult<Vec<u8>, Error>>,
5648
options: &NumfmtOptions,
49+
terminator: u8,
5750
) -> UResult<()> {
58-
let eol = if options.zero_terminated { '\0' } else { '\n' };
5951
for (idx, line_result) in iter.enumerate() {
6052
match line_result {
6153
Ok(line) if idx < options.header => {
62-
print!("{line}{eol}");
54+
std::io::stdout().write_all(&line)?;
55+
std::io::stdout().write_all(&[terminator])?;
6356
Ok(())
6457
}
65-
Ok(line) => format_and_handle_validation(line.as_ref(), options),
58+
Ok(line) => format_and_handle_validation(&line, options),
6659
Err(err) => return Err(Box::new(NumfmtError::IoError(err.to_string()))),
6760
}?;
6861
}
6962
Ok(())
7063
}
7164

72-
fn format_and_handle_validation(input_line: &str, options: &NumfmtOptions) -> UResult<()> {
73-
let handled_line = format_and_print(input_line, options);
65+
fn format_and_handle_validation(input_line: &[u8], options: &NumfmtOptions) -> UResult<()> {
66+
let eol = if options.zero_terminated {
67+
b'\0'
68+
} else {
69+
b'\n'
70+
};
71+
72+
let handled_line = if options.delimiter.is_some() {
73+
format_and_print_delimited(input_line, options)
74+
} else {
75+
// Whitespace mode requires valid UTF-8
76+
match std::str::from_utf8(input_line) {
77+
Ok(s) => format_and_print_whitespace(s, options),
78+
Err(_) => Err(translate!("numfmt-error-invalid-input")),
79+
}
80+
};
7481

7582
if let Err(error_message) = handled_line {
7683
match options.invalid {
@@ -85,7 +92,8 @@ fn format_and_handle_validation(input_line: &str, options: &NumfmtOptions) -> UR
8592
}
8693
InvalidModes::Ignore => {}
8794
}
88-
println!("{input_line}");
95+
std::io::stdout().write_all(input_line)?;
96+
std::io::stdout().write_all(&[eol])?;
8997
}
9098

9199
Ok(())
@@ -150,6 +158,22 @@ fn parse_unit_size_suffix(s: &str) -> Option<usize> {
150158
None
151159
}
152160

161+
/// Parse delimiter argument, ensuring it's a single character.
162+
/// For non-UTF8 locales, we allow up to 4 bytes (max UTF-8 char length).
163+
fn parse_delimiter(arg: &OsString) -> Result<Vec<u8>> {
164+
let bytes = os_str_as_bytes(arg).map_err(|e| e.to_string())?;
165+
// TODO: Cut, NL and here need to find a better way to do locale specific character count
166+
if arg.to_str().is_some_and(|s| s.chars().count() > 1)
167+
|| (arg.to_str().is_none() && bytes.len() > 4)
168+
{
169+
Err(translate!(
170+
"numfmt-error-delimiter-must-be-single-character"
171+
))
172+
} else {
173+
Ok(bytes.to_vec())
174+
}
175+
}
176+
153177
fn parse_options(args: &ArgMatches) -> Result<NumfmtOptions> {
154178
let from = parse_unit(args.get_one::<String>(FROM).unwrap())?;
155179
let to = parse_unit(args.get_one::<String>(TO).unwrap())?;
@@ -212,15 +236,10 @@ fn parse_options(args: &ArgMatches) -> Result<NumfmtOptions> {
212236
));
213237
}
214238

215-
let delimiter = args.get_one::<String>(DELIMITER).map_or(Ok(None), |arg| {
216-
if arg.len() == 1 {
217-
Ok(Some(arg.to_owned()))
218-
} else {
219-
Err(translate!(
220-
"numfmt-error-delimiter-must-be-single-character"
221-
))
222-
}
223-
})?;
239+
let delimiter = args
240+
.get_one::<OsString>(DELIMITER)
241+
.map(parse_delimiter)
242+
.transpose()?;
224243

225244
// unwrap is fine because the argument has a default value
226245
let round = match args.get_one::<String>(ROUND).unwrap().as_str() {
@@ -264,8 +283,14 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
264283

265284
let options = parse_options(&matches).map_err(NumfmtError::IllegalArgument)?;
266285

267-
let result = match matches.get_many::<String>(NUMBER) {
268-
Some(values) => handle_args(values.map(|s| s.as_str()), &options),
286+
let result = match matches.get_many::<OsString>(NUMBER) {
287+
Some(values) => {
288+
let byte_args: Vec<&[u8]> = values
289+
.map(|s| os_str_as_bytes(s).map_err(|e| e.to_string()))
290+
.collect::<std::result::Result<Vec<_>, _>>()
291+
.map_err(NumfmtError::IllegalArgument)?;
292+
handle_args(byte_args.into_iter(), &options)
293+
}
269294
None => {
270295
let stdin = std::io::stdin();
271296
let mut locked_stdin = stdin.lock();
@@ -296,6 +321,7 @@ pub fn uu_app() -> Command {
296321
.short('d')
297322
.long(DELIMITER)
298323
.value_name("X")
324+
.value_parser(ValueParser::os_string())
299325
.help(translate!("numfmt-help-delimiter")),
300326
)
301327
.arg(
@@ -397,7 +423,12 @@ pub fn uu_app() -> Command {
397423
.help(translate!("numfmt-help-zero-terminated"))
398424
.action(ArgAction::SetTrue),
399425
)
400-
.arg(Arg::new(NUMBER).hide(true).action(ArgAction::Append))
426+
.arg(
427+
Arg::new(NUMBER)
428+
.hide(true)
429+
.action(ArgAction::Append)
430+
.value_parser(ValueParser::os_string()),
431+
)
401432
}
402433

403434
#[cfg(test)]
@@ -528,7 +559,7 @@ mod tests {
528559

529560
#[test]
530561
fn args_fail_returns_status_2_for_invalid_input() {
531-
let input_value = ["5", "4Q"].into_iter();
562+
let input_value = [b"5".as_slice(), b"4Q"].into_iter();
532563
let mut options = get_valid_options();
533564
options.invalid = InvalidModes::Fail;
534565
handle_args(input_value, &options).unwrap();
@@ -541,7 +572,7 @@ mod tests {
541572

542573
#[test]
543574
fn args_warn_returns_status_0_for_invalid_input() {
544-
let input_value = ["5", "4Q"].into_iter();
575+
let input_value = [b"5".as_slice(), b"4Q"].into_iter();
545576
let mut options = get_valid_options();
546577
options.invalid = InvalidModes::Warn;
547578
let result = handle_args(input_value, &options);

src/uu/numfmt/src/options.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ pub struct NumfmtOptions {
5050
pub padding: isize,
5151
pub header: usize,
5252
pub fields: Vec<Range>,
53-
pub delimiter: Option<String>,
53+
pub delimiter: Option<Vec<u8>>,
5454
pub round: RoundMethod,
5555
pub suffix: Option<String>,
5656
pub unit_separator: String,

tests/by-util/test_numfmt.rs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1116,6 +1116,25 @@ fn test_zero_terminated_embedded_newline() {
11161116
.stdout_is("1000 2000\x003000 4000\x00");
11171117
}
11181118

1119+
#[cfg(unix)]
1120+
#[test]
1121+
fn test_non_utf8_delimiter() {
1122+
use std::ffi::OsStr;
1123+
use std::os::unix::ffi::OsStrExt;
1124+
1125+
// Single-byte non-UTF8 (0xFF) and multi-byte (0xA2E3, e.g. GB18030)
1126+
for delim in [&[0xFFu8][..], &[0xA2, 0xE3]] {
1127+
let input: Vec<u8> = [b"1", delim, b"2K"].concat();
1128+
let expected: Vec<u8> = [b"1", delim, b"2000\n"].concat();
1129+
new_ucmd!()
1130+
.args(&["--from=si", "--field=2", "-d"])
1131+
.arg(OsStr::from_bytes(delim))
1132+
.arg(OsStr::from_bytes(&input))
1133+
.succeeds()
1134+
.stdout_is_bytes(expected);
1135+
}
1136+
}
1137+
11191138
#[test]
11201139
fn test_unit_separator() {
11211140
for (args, expected) in [

0 commit comments

Comments
 (0)