diff --git a/src/uu/echo/src/echo.rs b/src/uu/echo/src/echo.rs index 4df76634843..a7c27818dc2 100644 --- a/src/uu/echo/src/echo.rs +++ b/src/uu/echo/src/echo.rs @@ -6,11 +6,11 @@ use clap::builder::ValueParser; use clap::{Arg, ArgAction, Command}; use std::env; -use std::ffi::{OsStr, OsString}; +use std::ffi::OsString; use std::io::{self, StdoutLock, Write}; -use uucore::error::{UResult, USimpleError}; +use uucore::error::UResult; use uucore::format::{FormatChar, OctalParsing, parse_escape_only}; -use uucore::{format_usage, help_about, help_section, help_usage}; +use uucore::{format_usage, help_about, help_section, help_usage, os_str_as_bytes}; const ABOUT: &str = help_about!("echo.md"); const USAGE: &str = help_usage!("echo.md"); @@ -178,13 +178,9 @@ fn execute( escaped: bool, ) -> UResult<()> { for (i, input) in arguments_after_options.into_iter().enumerate() { - let Some(bytes) = bytes_from_os_string(input.as_os_str()) else { - return Err(USimpleError::new( - 1, - "Non-UTF-8 arguments provided, but this platform does not support them", - )); - }; + let bytes = os_str_as_bytes(&input)?; + // Don't print a space before the first argument if i > 0 { stdout_lock.write_all(b" ")?; } @@ -206,19 +202,3 @@ fn execute( Ok(()) } - -fn bytes_from_os_string(input: &OsStr) -> Option<&[u8]> { - #[cfg(target_family = "unix")] - { - use std::os::unix::ffi::OsStrExt; - - Some(input.as_bytes()) - } - - #[cfg(not(target_family = "unix"))] - { - // TODO - // Verify that this works correctly on these platforms - input.to_str().map(|st| st.as_bytes()) - } -} diff --git a/src/uu/printf/src/printf.rs b/src/uu/printf/src/printf.rs index 887ad4107a7..b564fbae905 100644 --- a/src/uu/printf/src/printf.rs +++ b/src/uu/printf/src/printf.rs @@ -3,6 +3,7 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. use clap::{Arg, ArgAction, Command}; +use std::ffi::OsString; use std::io::stdout; use std::ops::ControlFlow; use uucore::error::{UResult, UUsageError}; @@ -19,21 +20,19 @@ mod options { pub const FORMAT: &str = "FORMAT"; pub const ARGUMENT: &str = "ARGUMENT"; } + #[uucore::main] pub fn uumain(args: impl uucore::Args) -> UResult<()> { let matches = uu_app().get_matches_from(args); let format = matches - .get_one::(options::FORMAT) + .get_one::(options::FORMAT) .ok_or_else(|| UUsageError::new(1, "missing operand"))?; let format = os_str_as_bytes(format)?; - let values: Vec<_> = match matches.get_many::(options::ARGUMENT) { - // FIXME: use os_str_as_bytes once FormatArgument supports Vec + let values: Vec<_> = match matches.get_many::(options::ARGUMENT) { Some(s) => s - .map(|os_string| { - FormatArgument::Unparsed(std::ffi::OsStr::to_string_lossy(os_string).to_string()) - }) + .map(|os_string| FormatArgument::Unparsed(os_string.to_owned())) .collect(), None => vec![], }; @@ -59,7 +58,10 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { let Some(FormatArgument::Unparsed(arg_str)) = args.peek_arg() else { unreachable!("All args are transformed to Unparsed") }; - show_warning!("ignoring excess arguments, starting with '{arg_str}'"); + show_warning!( + "ignoring excess arguments, starting with '{}'", + arg_str.to_string_lossy() + ); } return Ok(()); } @@ -98,10 +100,10 @@ pub fn uu_app() -> Command { .help("Print version information") .action(ArgAction::Version), ) - .arg(Arg::new(options::FORMAT).value_parser(clap::value_parser!(std::ffi::OsString))) + .arg(Arg::new(options::FORMAT).value_parser(clap::value_parser!(OsString))) .arg( Arg::new(options::ARGUMENT) .action(ArgAction::Append) - .value_parser(clap::value_parser!(std::ffi::OsString)), + .value_parser(clap::value_parser!(OsString)), ) } diff --git a/src/uucore/src/lib/features/checksum.rs b/src/uucore/src/lib/features/checksum.rs index f7b36a9b8a8..a93e811cd0d 100644 --- a/src/uucore/src/lib/features/checksum.rs +++ b/src/uucore/src/lib/features/checksum.rs @@ -956,7 +956,7 @@ fn process_checksum_line( cached_line_format: &mut Option, last_algo: &mut Option, ) -> Result<(), LineCheckError> { - let line_bytes = os_str_as_bytes(line)?; + let line_bytes = os_str_as_bytes(line).map_err(|e| LineCheckError::UError(Box::new(e)))?; // Early return on empty or commented lines. if line.is_empty() || line_bytes.starts_with(b"#") { diff --git a/src/uucore/src/lib/features/extendedbigdecimal.rs b/src/uucore/src/lib/features/extendedbigdecimal.rs index 396b6f35941..5748b6f1ab9 100644 --- a/src/uucore/src/lib/features/extendedbigdecimal.rs +++ b/src/uucore/src/lib/features/extendedbigdecimal.rs @@ -101,6 +101,18 @@ impl From for ExtendedBigDecimal { } } +impl From for ExtendedBigDecimal { + fn from(val: u8) -> Self { + Self::BigDecimal(val.into()) + } +} + +impl From for ExtendedBigDecimal { + fn from(val: u32) -> Self { + Self::BigDecimal(val.into()) + } +} + impl ExtendedBigDecimal { pub fn zero() -> Self { Self::BigDecimal(0.into()) diff --git a/src/uucore/src/lib/features/format/argument.rs b/src/uucore/src/lib/features/format/argument.rs index f3edbae5576..cb813577280 100644 --- a/src/uucore/src/lib/features/format/argument.rs +++ b/src/uucore/src/lib/features/format/argument.rs @@ -3,16 +3,20 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -use super::ExtendedBigDecimal; +use super::{ExtendedBigDecimal, FormatError}; use crate::format::spec::ArgumentLocation; use crate::{ error::set_exit_code, + os_str_as_bytes, parser::num_parser::{ExtendedParser, ExtendedParserError}, quoting_style::{Quotes, QuotingStyle, escape_name}, show_error, show_warning, }; use os_display::Quotable; -use std::{ffi::OsStr, num::NonZero}; +use std::{ + ffi::{OsStr, OsString}, + num::NonZero, +}; /// An argument for formatting /// @@ -24,12 +28,12 @@ use std::{ffi::OsStr, num::NonZero}; #[derive(Clone, Debug, PartialEq)] pub enum FormatArgument { Char(char), - String(String), + String(OsString), UnsignedInt(u64), SignedInt(i64), Float(ExtendedBigDecimal), /// Special argument that gets coerced into the other variants - Unparsed(String), + Unparsed(OsString), } /// A struct that holds a slice of format arguments and provides methods to access them @@ -69,64 +73,86 @@ impl<'a> FormatArguments<'a> { self.next_arg_position = self.current_offset; } - pub fn next_char(&mut self, position: &ArgumentLocation) -> u8 { + pub fn next_char(&mut self, position: &ArgumentLocation) -> Result { match self.next_arg(position) { - Some(FormatArgument::Char(c)) => *c as u8, - Some(FormatArgument::Unparsed(s)) => s.bytes().next().unwrap_or(b'\0'), - _ => b'\0', + Some(FormatArgument::Char(c)) => Ok(*c as u8), + Some(FormatArgument::Unparsed(os)) => match os_str_as_bytes(os)?.first() { + Some(&byte) => Ok(byte), + None => Ok(b'\0'), + }, + _ => Ok(b'\0'), } } - pub fn next_string(&mut self, position: &ArgumentLocation) -> &'a str { + pub fn next_string(&mut self, position: &ArgumentLocation) -> &'a OsStr { match self.next_arg(position) { - Some(FormatArgument::Unparsed(s) | FormatArgument::String(s)) => s, - _ => "", + Some(FormatArgument::Unparsed(os) | FormatArgument::String(os)) => os, + _ => "".as_ref(), } } - pub fn next_i64(&mut self, position: &ArgumentLocation) -> i64 { + pub fn next_i64(&mut self, position: &ArgumentLocation) -> Result { match self.next_arg(position) { - Some(FormatArgument::SignedInt(n)) => *n, - Some(FormatArgument::Unparsed(s)) => extract_value(i64::extended_parse(s), s), - _ => 0, + Some(FormatArgument::SignedInt(n)) => Ok(*n), + Some(FormatArgument::Unparsed(os)) => Self::get_num::(os), + _ => Ok(0), } } - pub fn next_u64(&mut self, position: &ArgumentLocation) -> u64 { + pub fn next_u64(&mut self, position: &ArgumentLocation) -> Result { match self.next_arg(position) { - Some(FormatArgument::UnsignedInt(n)) => *n, - Some(FormatArgument::Unparsed(s)) => { - // Check if the string is a character literal enclosed in quotes - if s.starts_with(['"', '\'']) { - // Extract the content between the quotes safely using chars - let mut chars = s.trim_matches(|c| c == '"' || c == '\'').chars(); - if let Some(first_char) = chars.next() { - if chars.clone().count() > 0 { - // Emit a warning if there are additional characters - let remaining: String = chars.collect(); - show_warning!( - "{}: character(s) following character constant have been ignored", - remaining - ); - } - return first_char as u64; // Use only the first character - } - return 0; // Empty quotes - } - extract_value(u64::extended_parse(s), s) - } - _ => 0, + Some(FormatArgument::UnsignedInt(n)) => Ok(*n), + Some(FormatArgument::Unparsed(os)) => Self::get_num::(os), + _ => Ok(0), } } - pub fn next_extended_big_decimal(&mut self, position: &ArgumentLocation) -> ExtendedBigDecimal { + pub fn next_extended_big_decimal( + &mut self, + position: &ArgumentLocation, + ) -> Result { match self.next_arg(position) { - Some(FormatArgument::Float(n)) => n.clone(), - Some(FormatArgument::Unparsed(s)) => { - extract_value(ExtendedBigDecimal::extended_parse(s), s) + Some(FormatArgument::Float(n)) => Ok(n.clone()), + Some(FormatArgument::Unparsed(os)) => Self::get_num::(os), + _ => Ok(ExtendedBigDecimal::zero()), + } + } + + fn get_num(os: &OsStr) -> Result + where + T: ExtendedParser + From + From + Default, + { + let s = os_str_as_bytes(os)?; + // Check if the string begins with a quote, and is therefore a literal + if let Some((&first, bytes)) = s.split_first() { + if (first == b'"' || first == b'\'') && !bytes.is_empty() { + let (val, len) = if let Some(c) = bytes + .utf8_chunks() + .next() + .expect("bytes should not be empty") + .valid() + .chars() + .next() + { + // Valid UTF-8 character, cast the codepoint to u32 then T + // (largest unicode codepoint is only 3 bytes, so this is safe) + ((c as u32).into(), c.len_utf8()) + } else { + // Not a valid UTF-8 character, use the first byte + (bytes[0].into(), 1) + }; + // Emit a warning if there are additional characters + if bytes.len() > len { + show_warning!( + "{}: character(s) following character constant have been ignored", + String::from_utf8_lossy(&bytes[len..]) + ); + } + return Ok(val); } - _ => ExtendedBigDecimal::zero(), } + let s = os.to_string_lossy(); + Ok(extract_value(T::extended_parse(&s), &s)) } fn get_at_relative_position(&mut self, pos: NonZero) -> Option<&'a FormatArgument> { @@ -181,6 +207,7 @@ fn extract_value(p: Result>, input: &s } else { show_error!("{}: value not completely converted", input.quote()); } + v } } @@ -205,7 +232,10 @@ mod tests { assert!(!args.is_exhausted()); assert_eq!(Some(&FormatArgument::Char('a')), args.peek_arg()); assert!(!args.is_exhausted()); // Peek shouldn't consume - assert_eq!(b'a', args.next_char(&ArgumentLocation::NextArgument)); + assert_eq!( + b'a', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); args.start_next_batch(); assert!(args.is_exhausted()); // After batch, exhausted with a single arg assert_eq!(None, args.peek_arg()); @@ -226,26 +256,50 @@ mod tests { ]); // First batch - two sequential calls - assert_eq!(b'z', args.next_char(&ArgumentLocation::NextArgument)); - assert_eq!(b'y', args.next_char(&ArgumentLocation::NextArgument)); + assert_eq!( + b'z', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); + assert_eq!( + b'y', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); args.start_next_batch(); assert!(!args.is_exhausted()); // Second batch - same pattern - assert_eq!(b'x', args.next_char(&ArgumentLocation::NextArgument)); - assert_eq!(b'w', args.next_char(&ArgumentLocation::NextArgument)); + assert_eq!( + b'x', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); + assert_eq!( + b'w', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); args.start_next_batch(); assert!(!args.is_exhausted()); // Third batch - same pattern - assert_eq!(b'v', args.next_char(&ArgumentLocation::NextArgument)); - assert_eq!(b'u', args.next_char(&ArgumentLocation::NextArgument)); + assert_eq!( + b'v', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); + assert_eq!( + b'u', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); args.start_next_batch(); assert!(!args.is_exhausted()); // Fourth batch - same pattern (last batch) - assert_eq!(b't', args.next_char(&ArgumentLocation::NextArgument)); - assert_eq!(b's', args.next_char(&ArgumentLocation::NextArgument)); + assert_eq!( + b't', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); + assert_eq!( + b's', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); args.start_next_batch(); assert!(args.is_exhausted()); } @@ -255,28 +309,37 @@ mod tests { // Test with different method types in sequence let args = [ FormatArgument::Char('a'), - FormatArgument::String("hello".to_string()), - FormatArgument::Unparsed("123".to_string()), - FormatArgument::String("world".to_string()), + FormatArgument::String("hello".into()), + FormatArgument::Unparsed("123".into()), + FormatArgument::String("world".into()), FormatArgument::Char('z'), - FormatArgument::String("test".to_string()), + FormatArgument::String("test".into()), ]; let mut args = FormatArguments::new(&args); // First batch - next_char followed by next_string - assert_eq!(b'a', args.next_char(&ArgumentLocation::NextArgument)); + assert_eq!( + b'a', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); assert_eq!("hello", args.next_string(&ArgumentLocation::NextArgument)); args.start_next_batch(); assert!(!args.is_exhausted()); // Second batch - same pattern - assert_eq!(b'1', args.next_char(&ArgumentLocation::NextArgument)); // First byte of 123 + assert_eq!( + b'1', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); // First byte of 123 assert_eq!("world", args.next_string(&ArgumentLocation::NextArgument)); args.start_next_batch(); assert!(!args.is_exhausted()); // Third batch - same pattern (last batch) - assert_eq!(b'z', args.next_char(&ArgumentLocation::NextArgument)); + assert_eq!( + b'z', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); assert_eq!("test", args.next_string(&ArgumentLocation::NextArgument)); args.start_next_batch(); assert!(args.is_exhausted()); @@ -302,23 +365,23 @@ mod tests { ]); // First batch - positional access - assert_eq!(b'b', args.next_char(&non_zero_pos(2))); // Position 2 - assert_eq!(b'a', args.next_char(&non_zero_pos(1))); // Position 1 - assert_eq!(b'c', args.next_char(&non_zero_pos(3))); // Position 3 + assert_eq!(b'b', args.next_char(&non_zero_pos(2)).unwrap()); // Position 2 + assert_eq!(b'a', args.next_char(&non_zero_pos(1)).unwrap()); // Position 1 + assert_eq!(b'c', args.next_char(&non_zero_pos(3)).unwrap()); // Position 3 args.start_next_batch(); assert!(!args.is_exhausted()); // Second batch - same positional pattern - assert_eq!(b'e', args.next_char(&non_zero_pos(2))); // Position 2 - assert_eq!(b'd', args.next_char(&non_zero_pos(1))); // Position 1 - assert_eq!(b'f', args.next_char(&non_zero_pos(3))); // Position 3 + assert_eq!(b'e', args.next_char(&non_zero_pos(2)).unwrap()); // Position 2 + assert_eq!(b'd', args.next_char(&non_zero_pos(1)).unwrap()); // Position 1 + assert_eq!(b'f', args.next_char(&non_zero_pos(3)).unwrap()); // Position 3 args.start_next_batch(); assert!(!args.is_exhausted()); // Third batch - same positional pattern (last batch) - assert_eq!(b'h', args.next_char(&non_zero_pos(2))); // Position 2 - assert_eq!(b'g', args.next_char(&non_zero_pos(1))); // Position 1 - assert_eq!(b'i', args.next_char(&non_zero_pos(3))); // Position 3 + assert_eq!(b'h', args.next_char(&non_zero_pos(2)).unwrap()); // Position 2 + assert_eq!(b'g', args.next_char(&non_zero_pos(1)).unwrap()); // Position 1 + assert_eq!(b'i', args.next_char(&non_zero_pos(3)).unwrap()); // Position 3 args.start_next_batch(); assert!(args.is_exhausted()); } @@ -338,20 +401,29 @@ mod tests { ]); // First batch - mix of sequential and positional - assert_eq!(b'a', args.next_char(&ArgumentLocation::NextArgument)); // Sequential - assert_eq!(b'c', args.next_char(&non_zero_pos(3))); // Positional + assert_eq!( + b'a', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); // Sequential + assert_eq!(b'c', args.next_char(&non_zero_pos(3)).unwrap()); // Positional args.start_next_batch(); assert!(!args.is_exhausted()); // Second batch - same mixed pattern - assert_eq!(b'd', args.next_char(&ArgumentLocation::NextArgument)); // Sequential - assert_eq!(b'f', args.next_char(&non_zero_pos(3))); // Positional + assert_eq!( + b'd', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); // Sequential + assert_eq!(b'f', args.next_char(&non_zero_pos(3)).unwrap()); // Positional args.start_next_batch(); assert!(!args.is_exhausted()); // Last batch - same mixed pattern - assert_eq!(b'g', args.next_char(&ArgumentLocation::NextArgument)); // Sequential - assert_eq!(b'\0', args.next_char(&non_zero_pos(3))); // Out of bounds + assert_eq!( + b'g', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); // Sequential + assert_eq!(b'\0', args.next_char(&non_zero_pos(3)).unwrap()); // Out of bounds args.start_next_batch(); assert!(args.is_exhausted()); } @@ -370,17 +442,21 @@ mod tests { let mut args = FormatArguments::new(&args); // First batch - i64, u64, decimal - assert_eq!(10, args.next_i64(&ArgumentLocation::NextArgument)); - assert_eq!(20, args.next_u64(&ArgumentLocation::NextArgument)); - let result = args.next_extended_big_decimal(&ArgumentLocation::NextArgument); + assert_eq!(10, args.next_i64(&ArgumentLocation::NextArgument).unwrap()); + assert_eq!(20, args.next_u64(&ArgumentLocation::NextArgument).unwrap()); + let result = args + .next_extended_big_decimal(&ArgumentLocation::NextArgument) + .unwrap(); assert_eq!(ExtendedBigDecimal::zero(), result); args.start_next_batch(); assert!(!args.is_exhausted()); // Second batch - same pattern - assert_eq!(30, args.next_i64(&ArgumentLocation::NextArgument)); - assert_eq!(40, args.next_u64(&ArgumentLocation::NextArgument)); - let result = args.next_extended_big_decimal(&ArgumentLocation::NextArgument); + assert_eq!(30, args.next_i64(&ArgumentLocation::NextArgument).unwrap()); + assert_eq!(40, args.next_u64(&ArgumentLocation::NextArgument).unwrap()); + let result = args + .next_extended_big_decimal(&ArgumentLocation::NextArgument) + .unwrap(); assert_eq!(ExtendedBigDecimal::zero(), result); args.start_next_batch(); assert!(args.is_exhausted()); @@ -390,22 +466,22 @@ mod tests { fn test_unparsed_arguments() { // Test with unparsed arguments that get coerced let args = [ - FormatArgument::Unparsed("hello".to_string()), - FormatArgument::Unparsed("123".to_string()), - FormatArgument::Unparsed("hello".to_string()), - FormatArgument::Unparsed("456".to_string()), + FormatArgument::Unparsed("hello".into()), + FormatArgument::Unparsed("123".into()), + FormatArgument::Unparsed("hello".into()), + FormatArgument::Unparsed("456".into()), ]; let mut args = FormatArguments::new(&args); // First batch - string, number assert_eq!("hello", args.next_string(&ArgumentLocation::NextArgument)); - assert_eq!(123, args.next_i64(&ArgumentLocation::NextArgument)); + assert_eq!(123, args.next_i64(&ArgumentLocation::NextArgument).unwrap()); args.start_next_batch(); assert!(!args.is_exhausted()); // Second batch - same pattern assert_eq!("hello", args.next_string(&ArgumentLocation::NextArgument)); - assert_eq!(456, args.next_i64(&ArgumentLocation::NextArgument)); + assert_eq!(456, args.next_i64(&ArgumentLocation::NextArgument).unwrap()); args.start_next_batch(); assert!(args.is_exhausted()); } @@ -415,25 +491,25 @@ mod tests { // Test with mixed types and positional access let args = [ FormatArgument::Char('a'), - FormatArgument::String("test".to_string()), + FormatArgument::String("test".into()), FormatArgument::UnsignedInt(42), FormatArgument::Char('b'), - FormatArgument::String("more".to_string()), + FormatArgument::String("more".into()), FormatArgument::UnsignedInt(99), ]; let mut args = FormatArguments::new(&args); // First batch - positional access of different types - assert_eq!(b'a', args.next_char(&non_zero_pos(1))); + assert_eq!(b'a', args.next_char(&non_zero_pos(1)).unwrap()); assert_eq!("test", args.next_string(&non_zero_pos(2))); - assert_eq!(42, args.next_u64(&non_zero_pos(3))); + assert_eq!(42, args.next_u64(&non_zero_pos(3)).unwrap()); args.start_next_batch(); assert!(!args.is_exhausted()); // Second batch - same pattern - assert_eq!(b'b', args.next_char(&non_zero_pos(1))); + assert_eq!(b'b', args.next_char(&non_zero_pos(1)).unwrap()); assert_eq!("more", args.next_string(&non_zero_pos(2))); - assert_eq!(99, args.next_u64(&non_zero_pos(3))); + assert_eq!(99, args.next_u64(&non_zero_pos(3)).unwrap()); args.start_next_batch(); assert!(args.is_exhausted()); } @@ -450,14 +526,20 @@ mod tests { ]); // First batch - assert_eq!(b'a', args.next_char(&ArgumentLocation::NextArgument)); - assert_eq!(b'c', args.next_char(&non_zero_pos(3))); + assert_eq!( + b'a', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); + assert_eq!(b'c', args.next_char(&non_zero_pos(3)).unwrap()); args.start_next_batch(); assert!(!args.is_exhausted()); // Second batch (partial) - assert_eq!(b'd', args.next_char(&ArgumentLocation::NextArgument)); - assert_eq!(b'\0', args.next_char(&non_zero_pos(3))); // Out of bounds + assert_eq!( + b'd', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); + assert_eq!(b'\0', args.next_char(&non_zero_pos(3)).unwrap()); // Out of bounds args.start_next_batch(); assert!(args.is_exhausted()); } diff --git a/src/uucore/src/lib/features/format/mod.rs b/src/uucore/src/lib/features/format/mod.rs index ee17d96da79..dc8a61dda22 100644 --- a/src/uucore/src/lib/features/format/mod.rs +++ b/src/uucore/src/lib/features/format/mod.rs @@ -37,8 +37,12 @@ pub mod human; pub mod num_format; mod spec; +pub use self::escape::{EscapedChar, OctalParsing}; use crate::extendedbigdecimal::ExtendedBigDecimal; -pub use argument::*; +pub use argument::{FormatArgument, FormatArguments}; + +use self::{escape::parse_escape_code, num_format::Formatter}; +use crate::{NonUtf8OsStrError, error::UError}; pub use spec::Spec; use std::{ error::Error, @@ -50,13 +54,6 @@ use std::{ use os_display::Quotable; -use crate::error::UError; - -pub use self::{ - escape::{EscapedChar, OctalParsing, parse_escape_code}, - num_format::Formatter, -}; - #[derive(Debug)] pub enum FormatError { SpecError(Vec), @@ -74,6 +71,7 @@ pub enum FormatError { /// The hexadecimal characters represent a code point that cannot represent a /// Unicode character (e.g., a surrogate code point) InvalidCharacter(char, Vec), + InvalidEncoding(NonUtf8OsStrError), } impl Error for FormatError {} @@ -85,6 +83,12 @@ impl From for FormatError { } } +impl From for FormatError { + fn from(value: NonUtf8OsStrError) -> FormatError { + FormatError::InvalidEncoding(value) + } +} + impl Display for FormatError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { @@ -119,6 +123,7 @@ impl Display for FormatError { escape_char, String::from_utf8_lossy(digits) ), + Self::InvalidEncoding(no) => no.fmt(f), } } } diff --git a/src/uucore/src/lib/features/format/spec.rs b/src/uucore/src/lib/features/format/spec.rs index d2262659012..06cf4498553 100644 --- a/src/uucore/src/lib/features/format/spec.rs +++ b/src/uucore/src/lib/features/format/spec.rs @@ -5,8 +5,6 @@ // spell-checker:ignore (vars) intmax ptrdiff padlen -use crate::quoting_style::{QuotingStyle, escape_name}; - use super::{ ExtendedBigDecimal, FormatChar, FormatError, OctalParsing, num_format::{ @@ -15,7 +13,11 @@ use super::{ }, parse_escape_only, }; -use crate::format::FormatArguments; +use crate::{ + format::FormatArguments, + os_str_as_bytes, + quoting_style::{QuotingStyle, escape_name}, +}; use std::{io::Write, num::NonZero, ops::ControlFlow}; /// A parsed specification for formatting a value @@ -355,7 +357,7 @@ impl Spec { let (width, neg_width) = resolve_asterisk_width(*width, args).unwrap_or_default(); write_padded( writer, - &[args.next_char(position)], + &[args.next_char(position)?], width, *align_left || neg_width, ) @@ -375,22 +377,21 @@ impl Spec { // TODO: We need to not use Rust's formatting for aligning the output, // so that we can just write bytes to stdout without panicking. let precision = resolve_asterisk_precision(*precision, args); - let s = args.next_string(position); + let os_str = args.next_string(position); + let bytes = os_str_as_bytes(os_str)?; + let truncated = match precision { - Some(p) if p < s.len() => &s[..p], - _ => s, + Some(p) if p < os_str.len() => &bytes[..p], + _ => bytes, }; - write_padded( - writer, - truncated.as_bytes(), - width, - *align_left || neg_width, - ) + write_padded(writer, truncated, width, *align_left || neg_width) } Self::EscapedString { position } => { - let s = args.next_string(position); - let mut parsed = Vec::new(); - for c in parse_escape_only(s.as_bytes(), OctalParsing::ThreeDigits) { + let os_str = args.next_string(position); + let bytes = os_str_as_bytes(os_str)?; + let mut parsed = Vec::::new(); + + for c in parse_escape_only(bytes, OctalParsing::ThreeDigits) { match c.write(&mut parsed)? { ControlFlow::Continue(()) => {} ControlFlow::Break(()) => { @@ -403,19 +404,15 @@ impl Spec { } Self::QuotedString { position } => { let s = escape_name( - args.next_string(position).as_ref(), + args.next_string(position), &QuotingStyle::Shell { escape: true, always_quote: false, show_control: false, }, ); - #[cfg(unix)] - let bytes = std::os::unix::ffi::OsStringExt::into_vec(s); - #[cfg(not(unix))] - let bytes = s.to_string_lossy().as_bytes().to_owned(); - - writer.write_all(&bytes).map_err(FormatError::IoError) + let bytes = os_str_as_bytes(&s)?; + writer.write_all(bytes).map_err(FormatError::IoError) } Self::SignedInt { width, @@ -426,7 +423,7 @@ impl Spec { } => { let (width, neg_width) = resolve_asterisk_width(*width, args).unwrap_or((0, false)); let precision = resolve_asterisk_precision(*precision, args).unwrap_or_default(); - let i = args.next_i64(position); + let i = args.next_i64(position)?; if precision as u64 > i32::MAX as u64 { return Err(FormatError::InvalidPrecision(precision.to_string())); @@ -454,7 +451,7 @@ impl Spec { } => { let (width, neg_width) = resolve_asterisk_width(*width, args).unwrap_or((0, false)); let precision = resolve_asterisk_precision(*precision, args).unwrap_or_default(); - let i = args.next_u64(position); + let i = args.next_u64(position)?; if precision as u64 > i32::MAX as u64 { return Err(FormatError::InvalidPrecision(precision.to_string())); @@ -485,7 +482,7 @@ impl Spec { } => { let (width, neg_width) = resolve_asterisk_width(*width, args).unwrap_or((0, false)); let precision = resolve_asterisk_precision(*precision, args); - let f: ExtendedBigDecimal = args.next_extended_big_decimal(position); + let f: ExtendedBigDecimal = args.next_extended_big_decimal(position)?; if precision.is_some_and(|p| p as u64 > i32::MAX as u64) { return Err(FormatError::InvalidPrecision( @@ -522,7 +519,7 @@ fn resolve_asterisk_width( match option { None => None, Some(CanAsterisk::Asterisk(loc)) => { - let nb = args.next_i64(&loc); + let nb = args.next_i64(&loc).unwrap_or(0); if nb < 0 { Some((usize::try_from(-(nb as isize)).ok().unwrap_or(0), true)) } else { @@ -541,7 +538,7 @@ fn resolve_asterisk_precision( ) -> Option { match option { None => None, - Some(CanAsterisk::Asterisk(loc)) => match args.next_i64(&loc) { + Some(CanAsterisk::Asterisk(loc)) => match args.next_i64(&loc).unwrap_or(0) { v if v >= 0 => usize::try_from(v).ok(), v if v < 0 => Some(0usize), _ => None, @@ -650,7 +647,7 @@ mod tests { Some((42, false)), resolve_asterisk_width( Some(CanAsterisk::Asterisk(ArgumentLocation::NextArgument)), - &mut FormatArguments::new(&[FormatArgument::Unparsed("42".to_string())]), + &mut FormatArguments::new(&[FormatArgument::Unparsed("42".into())]), ) ); @@ -665,7 +662,7 @@ mod tests { Some((42, true)), resolve_asterisk_width( Some(CanAsterisk::Asterisk(ArgumentLocation::NextArgument)), - &mut FormatArguments::new(&[FormatArgument::Unparsed("-42".to_string())]), + &mut FormatArguments::new(&[FormatArgument::Unparsed("-42".into())]), ) ); @@ -676,9 +673,9 @@ mod tests { NonZero::new(2).unwrap() ))), &mut FormatArguments::new(&[ - FormatArgument::Unparsed("1".to_string()), - FormatArgument::Unparsed("2".to_string()), - FormatArgument::Unparsed("3".to_string()) + FormatArgument::Unparsed("1".into()), + FormatArgument::Unparsed("2".into()), + FormatArgument::Unparsed("3".into()) ]), ) ); @@ -721,7 +718,7 @@ mod tests { Some(42), resolve_asterisk_precision( Some(CanAsterisk::Asterisk(ArgumentLocation::NextArgument)), - &mut FormatArguments::new(&[FormatArgument::Unparsed("42".to_string())]), + &mut FormatArguments::new(&[FormatArgument::Unparsed("42".into())]), ) ); @@ -736,7 +733,7 @@ mod tests { Some(0), resolve_asterisk_precision( Some(CanAsterisk::Asterisk(ArgumentLocation::NextArgument)), - &mut FormatArguments::new(&[FormatArgument::Unparsed("-42".to_string())]), + &mut FormatArguments::new(&[FormatArgument::Unparsed("-42".into())]), ) ); assert_eq!( @@ -746,9 +743,9 @@ mod tests { NonZero::new(2).unwrap() ))), &mut FormatArguments::new(&[ - FormatArgument::Unparsed("1".to_string()), - FormatArgument::Unparsed("2".to_string()), - FormatArgument::Unparsed("3".to_string()) + FormatArgument::Unparsed("1".into()), + FormatArgument::Unparsed("2".into()), + FormatArgument::Unparsed("3".into()) ]), ) ); diff --git a/src/uucore/src/lib/features/parser/num_parser.rs b/src/uucore/src/lib/features/parser/num_parser.rs index 3ee07e41357..655c17eece4 100644 --- a/src/uucore/src/lib/features/parser/num_parser.rs +++ b/src/uucore/src/lib/features/parser/num_parser.rs @@ -481,21 +481,6 @@ pub(crate) fn parse<'a>( target: ParseTarget, allowed_suffixes: &[(char, u32)], ) -> Result> { - // Parse the " and ' prefixes separately - if target != ParseTarget::Duration { - if let Some(rest) = input.strip_prefix(['\'', '"']) { - let mut chars = rest.char_indices().fuse(); - let v = chars - .next() - .map(|(_, c)| ExtendedBigDecimal::BigDecimal(u32::from(c).into())); - return match (v, chars.next()) { - (Some(v), None) => Ok(v), - (Some(v), Some((i, _))) => Err(ExtendedParserError::PartialMatch(v, &rest[i..])), - (None, _) => Err(ExtendedParserError::NotNumeric), - }; - } - } - let trimmed_input = input.trim_ascii_start(); // Initial minus/plus sign diff --git a/src/uucore/src/lib/lib.rs b/src/uucore/src/lib/lib.rs index b1a9363f728..14254cb5fcb 100644 --- a/src/uucore/src/lib/lib.rs +++ b/src/uucore/src/lib/lib.rs @@ -278,23 +278,39 @@ pub fn read_yes() -> bool { } } +#[derive(Debug)] +pub struct NonUtf8OsStrError { + input_lossy_string: String, +} + +impl std::fmt::Display for NonUtf8OsStrError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use os_display::Quotable; + let quoted = self.input_lossy_string.quote(); + f.write_fmt(format_args!( + "invalid UTF-8 input {quoted} encountered when converting to bytes on a platform that doesn't expose byte arguments", + )) + } +} + +impl std::error::Error for NonUtf8OsStrError {} +impl error::UError for NonUtf8OsStrError {} + /// Converts an `OsStr` to a UTF-8 `&[u8]`. /// /// This always succeeds on unix platforms, /// and fails on other platforms if the string can't be coerced to UTF-8. -pub fn os_str_as_bytes(os_string: &OsStr) -> mods::error::UResult<&[u8]> { +pub fn os_str_as_bytes(os_string: &OsStr) -> Result<&[u8], NonUtf8OsStrError> { #[cfg(unix)] - let bytes = os_string.as_bytes(); + return Ok(os_string.as_bytes()); #[cfg(not(unix))] - let bytes = os_string + os_string .to_str() - .ok_or_else(|| { - mods::error::UUsageError::new(1, "invalid UTF-8 was detected in one or more arguments") - })? - .as_bytes(); - - Ok(bytes) + .ok_or_else(|| NonUtf8OsStrError { + input_lossy_string: os_string.to_string_lossy().into_owned(), + }) + .map(|s| s.as_bytes()) } /// Performs a potentially lossy conversion from `OsStr` to UTF-8 bytes. @@ -303,15 +319,13 @@ pub fn os_str_as_bytes(os_string: &OsStr) -> mods::error::UResult<&[u8]> { /// and wraps [`OsStr::to_string_lossy`] on non-unix platforms. pub fn os_str_as_bytes_lossy(os_string: &OsStr) -> Cow<[u8]> { #[cfg(unix)] - let bytes = Cow::from(os_string.as_bytes()); + return Cow::from(os_string.as_bytes()); #[cfg(not(unix))] - let bytes = match os_string.to_string_lossy() { + match os_string.to_string_lossy() { Cow::Borrowed(slice) => Cow::from(slice.as_bytes()), Cow::Owned(owned) => Cow::from(owned.into_bytes()), - }; - - bytes + } } /// Converts a `&[u8]` to an `&OsStr`, @@ -321,13 +335,12 @@ pub fn os_str_as_bytes_lossy(os_string: &OsStr) -> Cow<[u8]> { /// and fails on other platforms if the bytes can't be parsed as UTF-8. pub fn os_str_from_bytes(bytes: &[u8]) -> mods::error::UResult> { #[cfg(unix)] - let os_str = Cow::Borrowed(OsStr::from_bytes(bytes)); - #[cfg(not(unix))] - let os_str = Cow::Owned(OsString::from(str::from_utf8(bytes).map_err(|_| { - mods::error::UUsageError::new(1, "Unable to transform bytes into OsStr") - })?)); + return Ok(Cow::Borrowed(OsStr::from_bytes(bytes))); - Ok(os_str) + #[cfg(not(unix))] + Ok(Cow::Owned(OsString::from(str::from_utf8(bytes).map_err( + |_| mods::error::UUsageError::new(1, "Unable to transform bytes into OsStr"), + )?))) } /// Converts a `Vec` into an `OsString`, parsing as UTF-8 on non-unix platforms. @@ -336,13 +349,12 @@ pub fn os_str_from_bytes(bytes: &[u8]) -> mods::error::UResult> { /// and fails on other platforms if the bytes can't be parsed as UTF-8. pub fn os_string_from_vec(vec: Vec) -> mods::error::UResult { #[cfg(unix)] - let s = OsString::from_vec(vec); + return Ok(OsString::from_vec(vec)); + #[cfg(not(unix))] - let s = OsString::from(String::from_utf8(vec).map_err(|_| { + Ok(OsString::from(String::from_utf8(vec).map_err(|_| { mods::error::UUsageError::new(1, "invalid UTF-8 was detected in one or more arguments") - })?); - - Ok(s) + })?)) } /// Equivalent to `std::BufRead::lines` which outputs each line as a `Vec`, diff --git a/tests/by-util/test_printf.rs b/tests/by-util/test_printf.rs index c0e9c41b3aa..959d3501cd1 100644 --- a/tests/by-util/test_printf.rs +++ b/tests/by-util/test_printf.rs @@ -807,7 +807,7 @@ fn test_overflow() { fn partial_char() { new_ucmd!() .args(&["%d", "'abc"]) - .fails_with_code(1) + .succeeds() .stdout_is("97") .stderr_is( "printf: warning: bc: character(s) following character constant have been ignored\n", @@ -1291,23 +1291,80 @@ fn float_arg_with_whitespace() { #[test] fn mb_input() { - for format in ["\"á", "\'á", "'\u{e1}"] { + let cases = vec![ + ("%04x\n", "\"á", "00e1\n"), + ("%04x\n", "'á", "00e1\n"), + ("%04x\n", "'\u{e1}", "00e1\n"), + ("%i\n", "\"á", "225\n"), + ("%i\n", "'á", "225\n"), + ("%i\n", "'\u{e1}", "225\n"), + ("%f\n", "'á", "225.000000\n"), + ]; + for (format, arg, stdout) in cases { new_ucmd!() - .args(&["%04x\n", format]) + .args(&[format, arg]) .succeeds() - .stdout_only("00e1\n"); + .stdout_only(stdout); } let cases = vec![ - ("\"á=", "="), - ("\'á-", "-"), - ("\'á=-==", "=-=="), - ("'\u{e1}++", "++"), + ("%04x\n", "\"á=", "00e1\n", "="), + ("%04x\n", "'á-", "00e1\n", "-"), + ("%04x\n", "'á=-==", "00e1\n", "=-=="), + ("%04x\n", "'á'", "00e1\n", "'"), + ("%04x\n", "'\u{e1}++", "00e1\n", "++"), + ("%04x\n", "''á'", "0027\n", "á'"), + ("%i\n", "\"á=", "225\n", "="), ]; + for (format, arg, stdout, stderr) in cases { + new_ucmd!() + .args(&[format, arg]) + .succeeds() + .stdout_is(stdout) + .stderr_is(format!("printf: warning: {stderr}: character(s) following character constant have been ignored\n")); + } - for (format, expected) in cases { + for arg in ["\"", "'"] { new_ucmd!() - .args(&["%04x\n", format]) + .args(&["%04x\n", arg]) + .fails() + .stderr_contains("expected a numeric value"); + } +} + +#[test] +#[cfg(target_family = "unix")] +fn mb_invalid_unicode() { + use std::ffi::OsStr; + use std::os::unix::ffi::OsStrExt; + + let cases = vec![ + ("%04x\n", b"\"\xe1", "00e1\n"), + ("%04x\n", b"'\xe1", "00e1\n"), + ("%i\n", b"\"\xe1", "225\n"), + ("%i\n", b"'\xe1", "225\n"), + ("%f\n", b"'\xe1", "225.000000\n"), + ]; + for (format, arg, stdout) in cases { + new_ucmd!() + .arg(format) + .arg(OsStr::from_bytes(arg)) + .succeeds() + .stdout_only(stdout); + } + + let cases = vec![ + (b"\"\xe1=".as_slice(), "="), + (b"'\xe1-".as_slice(), "-"), + (b"'\xe1=-==".as_slice(), "=-=="), + (b"'\xe1'".as_slice(), "'"), + // unclear if original or replacement character is better in stderr + //(b"''\xe1'".as_slice(), "'�'"), + ]; + for (arg, expected) in cases { + new_ucmd!() + .arg("%04x\n") + .arg(OsStr::from_bytes(arg)) .succeeds() .stdout_is("00e1\n") .stderr_is(format!("printf: warning: {expected}: character(s) following character constant have been ignored\n")); @@ -1362,3 +1419,35 @@ fn positional_format_specifiers() { .succeeds() .stdout_only("Octal: 115, Int: 42, Float: 3.141590, String: hello, Hex: ff, Scientific: 1.000000e-05, Char: A, Unsigned: 100, Integer: 123"); } + +#[test] +#[cfg(target_family = "unix")] +fn non_utf_8_input() { + use std::ffi::OsStr; + use std::os::unix::ffi::OsStrExt; + + // ISO-8859-1 encoded text + // spell-checker:disable + const INPUT_AND_OUTPUT: &[u8] = + b"Swer an rehte g\xFCete wendet s\xEEn gem\xFCete, dem volget s\xE6lde und \xEAre."; + // spell-checker:enable + + let os_str = OsStr::from_bytes(INPUT_AND_OUTPUT); + + new_ucmd!() + .arg("%s") + .arg(os_str) + .succeeds() + .stdout_only_bytes(INPUT_AND_OUTPUT); + + new_ucmd!() + .arg(os_str) + .succeeds() + .stdout_only_bytes(INPUT_AND_OUTPUT); + + new_ucmd!() + .arg("%d") + .arg(os_str) + .fails() + .stderr_contains("expected a numeric value"); +} diff --git a/util/why-error.md b/util/why-error.md index 978545b26fa..095caef4965 100644 --- a/util/why-error.md +++ b/util/why-error.md @@ -39,11 +39,7 @@ This file documents why some tests are failing: * gnu/tests/mv/part-hardlink.sh * gnu/tests/od/od-N.sh * gnu/tests/od/od-float.sh -* gnu/tests/printf/printf-cov.pl -* gnu/tests/printf/printf-indexed.sh -* gnu/tests/printf/printf-mb.sh * gnu/tests/printf/printf-quote.sh -* gnu/tests/printf/printf.sh * gnu/tests/ptx/ptx-overrun.sh * gnu/tests/ptx/ptx.pl * gnu/tests/rm/empty-inacc.sh - https://github.com/uutils/coreutils/issues/7033