Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 33 additions & 21 deletions src/uu/numfmt/src/format.rs
Original file line number Diff line number Diff line change
Expand Up @@ -358,32 +358,56 @@ fn format_string(
))
}

fn format_and_print_delimited(s: &str, options: &NumfmtOptions) -> Result<()> {
/// Split `input` into the fields separated by the byte sequence `delim`.
///
/// The returned iterator always yields at least one item: an input without
/// any delimiter (including an empty input) produces a single field holding
/// the whole input, and a trailing delimiter produces a final empty field.
/// Matching is done on raw bytes, so neither `input` nor `delim` has to be
/// valid UTF-8.
///
/// An empty `delim` never matches, so the whole input is returned as one
/// field. (`slice::windows` panics when asked for windows of size 0, which is
/// why this case is handled explicitly.)
fn split_bytes<'a>(input: &'a [u8], delim: &'a [u8]) -> impl Iterator<Item = &'a [u8]> {
    // `Some(rest)` while there is still input to hand out; `None` once the
    // final field has been yielded.
    let mut remainder = Some(input);
    std::iter::from_fn(move || {
        let rest = remainder.take()?;
        // Guard against `windows(0)`, which would panic.
        let found = if delim.is_empty() {
            None
        } else {
            rest.windows(delim.len()).position(|w| w == delim)
        };
        match found {
            Some(pos) => {
                // Resume right after the delimiter on the next call.
                remainder = Some(&rest[pos + delim.len()..]);
                Some(&rest[..pos])
            }
            // No further delimiter: this is the last field.
            None => Some(rest),
        }
    })
}

pub fn format_and_print_delimited(input: &[u8], options: &NumfmtOptions) -> Result<()> {
let delimiter = options.delimiter.as_ref().unwrap();
let mut output = String::new();
let mut output: Vec<u8> = Vec::new();
let eol = if options.zero_terminated {
b'\0'
} else {
b'\n'
};

for (n, field) in (1..).zip(s.split(delimiter)) {
for (n, field) in (1..).zip(split_bytes(input, delimiter)) {
let field_selected = uucore::ranges::contain(&options.fields, n);

// add delimiter before second and subsequent fields
if n > 1 {
output.push_str(delimiter);
output.extend_from_slice(delimiter);
}

if field_selected {
output.push_str(&format_string(field.trim_start(), options, None)?);
// Field must be valid UTF-8 for numeric conversion
let field_str = std::str::from_utf8(field)
.map_err(|_| translate!("numfmt-error-invalid-number", "input" => String::from_utf8_lossy(field).into_owned().quote()))?
.trim_start();
let formatted = format_string(field_str, options, None)?;
output.extend_from_slice(formatted.as_bytes());
} else {
// add unselected field without conversion
output.push_str(field);
output.extend_from_slice(field);
}
}

println!("{output}");
output.push(eol);
std::io::Write::write_all(&mut std::io::stdout(), &output).map_err(|e| e.to_string())?;

Ok(())
}

fn format_and_print_whitespace(s: &str, options: &NumfmtOptions) -> Result<()> {
pub fn format_and_print_whitespace(s: &str, options: &NumfmtOptions) -> Result<()> {
let mut output = String::new();

for (n, (prefix, field)) in (1..).zip(WhitespaceSplitter { s: Some(s) }) {
Expand Down Expand Up @@ -428,18 +452,6 @@ fn format_and_print_whitespace(s: &str, options: &NumfmtOptions) -> Result<()> {
Ok(())
}

/// Format one line of text and print it according to `options`.
///
/// When `options.delimiter` is set the line is handled by
/// `format_and_print_delimited`; otherwise it is handled by
/// `format_and_print_whitespace`. The helper's result is returned unchanged.
pub fn format_and_print(s: &str, options: &NumfmtOptions) -> Result<()> {
    if options.delimiter.is_some() {
        format_and_print_delimited(s, options)
    } else {
        format_and_print_whitespace(s, options)
    }
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down
103 changes: 67 additions & 36 deletions src/uu/numfmt/src/numfmt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,19 @@
// file that was distributed with this source code.

use crate::errors::*;
use crate::format::format_and_print;
use crate::format::{format_and_print_delimited, format_and_print_whitespace};
use crate::options::*;
use crate::units::{Result, Unit};
use clap::{Arg, ArgAction, ArgMatches, Command, parser::ValueSource};
use clap::{Arg, ArgAction, ArgMatches, Command, builder::ValueParser, parser::ValueSource};
use std::ffi::OsString;
use std::io::{BufRead, Error, Write};
use std::result::Result as StdResult;
use std::str::FromStr;

use units::{IEC_BASES, SI_BASES};
use uucore::display::Quotable;
use uucore::error::UResult;
use uucore::os_str_as_bytes;
use uucore::translate;

use uucore::parser::shortcut_value_parser::ShortcutValueParser;
Expand All @@ -26,7 +28,7 @@ pub mod format;
pub mod options;
mod units;

fn handle_args<'a>(args: impl Iterator<Item = &'a str>, options: &NumfmtOptions) -> UResult<()> {
fn handle_args<'a>(args: impl Iterator<Item = &'a [u8]>, options: &NumfmtOptions) -> UResult<()> {
for l in args {
format_and_handle_validation(l, options)?;
}
Expand All @@ -37,40 +39,45 @@ fn handle_buffer<R>(input: R, options: &NumfmtOptions) -> UResult<()>
where
R: BufRead,
{
if options.zero_terminated {
handle_buffer_iterator(
input
.split(0)
// FIXME: This panics on UTF8 decoding, but this util in general doesn't handle
// invalid UTF8
.map(|bytes| Ok(String::from_utf8(bytes?).unwrap())),
options,
)
} else {
handle_buffer_iterator(input.lines(), options)
}
let terminator = if options.zero_terminated { 0u8 } else { b'\n' };
handle_buffer_iterator(input.split(terminator), options, terminator)
}

fn handle_buffer_iterator(
iter: impl Iterator<Item = StdResult<String, Error>>,
iter: impl Iterator<Item = StdResult<Vec<u8>, Error>>,
options: &NumfmtOptions,
terminator: u8,
) -> UResult<()> {
let eol = if options.zero_terminated { '\0' } else { '\n' };
for (idx, line_result) in iter.enumerate() {
match line_result {
Ok(line) if idx < options.header => {
print!("{line}{eol}");
std::io::stdout().write_all(&line)?;
std::io::stdout().write_all(&[terminator])?;
Ok(())
}
Ok(line) => format_and_handle_validation(line.as_ref(), options),
Ok(line) => format_and_handle_validation(&line, options),
Err(err) => return Err(Box::new(NumfmtError::IoError(err.to_string()))),
}?;
}
Ok(())
}

fn format_and_handle_validation(input_line: &str, options: &NumfmtOptions) -> UResult<()> {
let handled_line = format_and_print(input_line, options);
fn format_and_handle_validation(input_line: &[u8], options: &NumfmtOptions) -> UResult<()> {
let eol = if options.zero_terminated {
b'\0'
} else {
b'\n'
};

let handled_line = if options.delimiter.is_some() {
format_and_print_delimited(input_line, options)
} else {
// Whitespace mode requires valid UTF-8
match std::str::from_utf8(input_line) {
Ok(s) => format_and_print_whitespace(s, options),
Err(_) => Err(translate!("numfmt-error-invalid-input")),
}
};

if let Err(error_message) = handled_line {
match options.invalid {
Expand All @@ -85,7 +92,8 @@ fn format_and_handle_validation(input_line: &str, options: &NumfmtOptions) -> UR
}
InvalidModes::Ignore => {}
}
println!("{input_line}");
std::io::stdout().write_all(input_line)?;
std::io::stdout().write_all(&[eol])?;
}

Ok(())
Expand Down Expand Up @@ -150,6 +158,22 @@ fn parse_unit_size_suffix(s: &str) -> Option<usize> {
None
}

/// Validate the delimiter argument and return it as raw bytes.
///
/// If `arg` is valid Unicode it may contain at most one character. If it is
/// not valid Unicode it may be at most 4 bytes long (the maximum length of a
/// UTF-8 encoded character). Anything longer is rejected with the
/// "delimiter must be a single character" error.
fn parse_delimiter(arg: &OsString) -> Result<Vec<u8>> {
    let bytes = os_str_as_bytes(arg).map_err(|e| e.to_string())?;

    // TODO: Cut, NL and here need to find a better way to do locale specific character count
    let too_long = match arg.to_str() {
        // Valid Unicode: count characters.
        Some(text) => text.chars().count() > 1,
        // Not valid Unicode: fall back to a byte-length limit.
        None => bytes.len() > 4,
    };

    if too_long {
        return Err(translate!(
            "numfmt-error-delimiter-must-be-single-character"
        ));
    }

    Ok(bytes.to_vec())
}

fn parse_options(args: &ArgMatches) -> Result<NumfmtOptions> {
let from = parse_unit(args.get_one::<String>(FROM).unwrap())?;
let to = parse_unit(args.get_one::<String>(TO).unwrap())?;
Expand Down Expand Up @@ -212,15 +236,10 @@ fn parse_options(args: &ArgMatches) -> Result<NumfmtOptions> {
));
}

let delimiter = args.get_one::<String>(DELIMITER).map_or(Ok(None), |arg| {
if arg.len() == 1 {
Ok(Some(arg.to_owned()))
} else {
Err(translate!(
"numfmt-error-delimiter-must-be-single-character"
))
}
})?;
let delimiter = args
.get_one::<OsString>(DELIMITER)
.map(parse_delimiter)
.transpose()?;

// unwrap is fine because the argument has a default value
let round = match args.get_one::<String>(ROUND).unwrap().as_str() {
Expand Down Expand Up @@ -264,8 +283,14 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {

let options = parse_options(&matches).map_err(NumfmtError::IllegalArgument)?;

let result = match matches.get_many::<String>(NUMBER) {
Some(values) => handle_args(values.map(|s| s.as_str()), &options),
let result = match matches.get_many::<OsString>(NUMBER) {
Some(values) => {
let byte_args: Vec<&[u8]> = values
.map(|s| os_str_as_bytes(s).map_err(|e| e.to_string()))
.collect::<std::result::Result<Vec<_>, _>>()
.map_err(NumfmtError::IllegalArgument)?;
handle_args(byte_args.into_iter(), &options)
}
None => {
let stdin = std::io::stdin();
let mut locked_stdin = stdin.lock();
Expand Down Expand Up @@ -296,6 +321,7 @@ pub fn uu_app() -> Command {
.short('d')
.long(DELIMITER)
.value_name("X")
.value_parser(ValueParser::os_string())
.help(translate!("numfmt-help-delimiter")),
)
.arg(
Expand Down Expand Up @@ -397,7 +423,12 @@ pub fn uu_app() -> Command {
.help(translate!("numfmt-help-zero-terminated"))
.action(ArgAction::SetTrue),
)
.arg(Arg::new(NUMBER).hide(true).action(ArgAction::Append))
.arg(
Arg::new(NUMBER)
.hide(true)
.action(ArgAction::Append)
.value_parser(ValueParser::os_string()),
)
}

#[cfg(test)]
Expand Down Expand Up @@ -528,7 +559,7 @@ mod tests {

#[test]
fn args_fail_returns_status_2_for_invalid_input() {
let input_value = ["5", "4Q"].into_iter();
let input_value = [b"5".as_slice(), b"4Q"].into_iter();
let mut options = get_valid_options();
options.invalid = InvalidModes::Fail;
handle_args(input_value, &options).unwrap();
Expand All @@ -541,7 +572,7 @@ mod tests {

#[test]
fn args_warn_returns_status_0_for_invalid_input() {
let input_value = ["5", "4Q"].into_iter();
let input_value = [b"5".as_slice(), b"4Q"].into_iter();
let mut options = get_valid_options();
options.invalid = InvalidModes::Warn;
let result = handle_args(input_value, &options);
Expand Down
2 changes: 1 addition & 1 deletion src/uu/numfmt/src/options.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ pub struct NumfmtOptions {
pub padding: isize,
pub header: usize,
pub fields: Vec<Range>,
pub delimiter: Option<String>,
pub delimiter: Option<Vec<u8>>,
pub round: RoundMethod,
pub suffix: Option<String>,
pub unit_separator: String,
Expand Down
19 changes: 19 additions & 0 deletions tests/by-util/test_numfmt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1116,6 +1116,25 @@ fn test_zero_terminated_embedded_newline() {
.stdout_is("1000 2000\x003000 4000\x00");
}

#[cfg(unix)]
#[test]
fn test_non_utf8_delimiter() {
    use std::ffi::OsStr;
    use std::os::unix::ffi::OsStrExt;

    // Delimiters that are not valid UTF-8: a single 0xFF byte and the
    // two-byte sequence 0xA2 0xE3 (e.g. GB18030).
    let delimiters: [&[u8]; 2] = [&[0xFF], &[0xA2, 0xE3]];

    for delim in delimiters {
        // The argument is `1<delim>2K`; only field 2 is selected for conversion.
        let mut input: Vec<u8> = b"1".to_vec();
        input.extend_from_slice(delim);
        input.extend_from_slice(b"2K");

        // Field 1 and the delimiter must come back byte-for-byte.
        let mut expected: Vec<u8> = b"1".to_vec();
        expected.extend_from_slice(delim);
        expected.extend_from_slice(b"2000\n");

        new_ucmd!()
            .args(&["--from=si", "--field=2", "-d"])
            .arg(OsStr::from_bytes(delim))
            .arg(OsStr::from_bytes(&input))
            .succeeds()
            .stdout_is_bytes(expected);
    }
}

#[test]
fn test_unit_separator() {
for (args, expected) in [
Expand Down
Loading