Skip to content

Commit 5227178

Browse files
authored
Merge pull request #7681 from sargas/pass-printf-cov-test
printf: Error handling with unicode parsing
2 parents 617b5fb + 7df2205 commit 5227178

File tree

5 files changed

+145
-29
lines changed

5 files changed

+145
-29
lines changed

src/uucore/src/lib/features/format/escape.rs

Lines changed: 97 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55

66
//! Parsing of escape sequences
77
8+
use crate::format::FormatError;
9+
810
#[derive(Debug)]
911
pub enum EscapedChar {
1012
/// A single byte
@@ -90,34 +92,36 @@ fn parse_code(input: &mut &[u8], base: Base) -> Option<u8> {
9092

9193
// spell-checker:disable-next
9294
/// Parse `\uHHHH` and `\UHHHHHHHH`
93-
// TODO: This should print warnings and possibly halt execution when it fails to parse
94-
// TODO: If the character cannot be converted to u32, the input should be printed.
95-
fn parse_unicode(input: &mut &[u8], digits: u8) -> Option<char> {
96-
let (c, rest) = input.split_first()?;
97-
let mut ret = Base::Hex.convert_digit(*c)? as u32;
98-
*input = rest;
99-
100-
for _ in 1..digits {
101-
let (c, rest) = input.split_first()?;
102-
let n = Base::Hex.convert_digit(*c)?;
103-
ret = ret
104-
.wrapping_mul(Base::Hex.as_base() as u32)
105-
.wrapping_add(n as u32);
95+
fn parse_unicode(input: &mut &[u8], digits: u8) -> Result<char, EscapeError> {
96+
if let Some((new_digits, rest)) = input.split_at_checked(digits as usize) {
10697
*input = rest;
98+
let ret = new_digits
99+
.iter()
100+
.map(|c| Base::Hex.convert_digit(*c))
101+
.collect::<Option<Vec<u8>>>()
102+
.ok_or(EscapeError::MissingHexadecimalNumber)?
103+
.iter()
104+
.map(|n| *n as u32)
105+
.reduce(|ret, n| ret.wrapping_mul(Base::Hex.as_base() as u32).wrapping_add(n))
106+
.expect("must have multiple digits in unicode string");
107+
char::from_u32(ret).ok_or_else(|| EscapeError::InvalidCharacters(new_digits.to_vec()))
108+
} else {
109+
Err(EscapeError::MissingHexadecimalNumber)
107110
}
108-
109-
char::from_u32(ret)
110111
}
111112

112113
/// Represents an invalid escape sequence.
113-
#[derive(Debug)]
114-
pub struct EscapeError {}
114+
#[derive(Debug, PartialEq)]
115+
pub enum EscapeError {
116+
InvalidCharacters(Vec<u8>),
117+
MissingHexadecimalNumber,
118+
}
115119

116120
/// Parse an escape sequence, like `\n` or `\xff`, etc.
117121
pub fn parse_escape_code(
118122
rest: &mut &[u8],
119123
zero_octal_parsing: OctalParsing,
120-
) -> Result<EscapedChar, EscapeError> {
124+
) -> Result<EscapedChar, FormatError> {
121125
if let [c, new_rest @ ..] = rest {
122126
// This is for the \NNN syntax for octal sequences.
123127
// Note that '0' is intentionally omitted because that
@@ -145,17 +149,89 @@ pub fn parse_escape_code(
145149
if let Some(c) = parse_code(rest, Base::Hex) {
146150
Ok(EscapedChar::Byte(c))
147151
} else {
148-
Err(EscapeError {})
152+
Err(FormatError::MissingHex)
149153
}
150154
}
151155
b'0' => Ok(EscapedChar::Byte(
152156
parse_code(rest, Base::Oct(zero_octal_parsing)).unwrap_or(b'\0'),
153157
)),
154-
b'u' => Ok(EscapedChar::Char(parse_unicode(rest, 4).unwrap_or('\0'))),
155-
b'U' => Ok(EscapedChar::Char(parse_unicode(rest, 8).unwrap_or('\0'))),
158+
b'u' => match parse_unicode(rest, 4) {
159+
Ok(c) => Ok(EscapedChar::Char(c)),
160+
Err(EscapeError::MissingHexadecimalNumber) => Err(FormatError::MissingHex),
161+
Err(EscapeError::InvalidCharacters(chars)) => {
162+
Err(FormatError::InvalidCharacter('u', chars))
163+
}
164+
},
165+
b'U' => match parse_unicode(rest, 8) {
166+
Ok(c) => Ok(EscapedChar::Char(c)),
167+
Err(EscapeError::MissingHexadecimalNumber) => Err(FormatError::MissingHex),
168+
Err(EscapeError::InvalidCharacters(chars)) => {
169+
Err(FormatError::InvalidCharacter('U', chars))
170+
}
171+
},
156172
c => Ok(EscapedChar::Backslash(*c)),
157173
}
158174
} else {
159175
Ok(EscapedChar::Byte(b'\\'))
160176
}
161177
}
178+
179+
#[cfg(test)]
180+
mod tests {
181+
use super::*;
182+
183+
mod parse_unicode {
184+
use super::*;
185+
186+
#[test]
187+
fn parse_ascii() {
188+
let input = b"2a";
189+
assert_eq!(parse_unicode(&mut &input[..], 2), Ok('*'));
190+
191+
let input = b"002A";
192+
assert_eq!(parse_unicode(&mut &input[..], 4), Ok('*'));
193+
}
194+
195+
#[test]
196+
fn parse_emoji_codepoint() {
197+
let input = b"0001F60A";
198+
assert_eq!(parse_unicode(&mut &input[..], 8), Ok('😊'));
199+
}
200+
201+
#[test]
202+
fn no_characters() {
203+
let input = b"";
204+
assert_eq!(
205+
parse_unicode(&mut &input[..], 8),
206+
Err(EscapeError::MissingHexadecimalNumber)
207+
);
208+
}
209+
210+
#[test]
211+
fn incomplete_hexadecimal_number() {
212+
let input = b"123";
213+
assert_eq!(
214+
parse_unicode(&mut &input[..], 4),
215+
Err(EscapeError::MissingHexadecimalNumber)
216+
);
217+
}
218+
219+
#[test]
220+
fn invalid_hex() {
221+
let input = b"duck";
222+
assert_eq!(
223+
parse_unicode(&mut &input[..], 4),
224+
Err(EscapeError::MissingHexadecimalNumber)
225+
);
226+
}
227+
228+
#[test]
229+
fn surrogate_code_point() {
230+
let input = b"d800";
231+
assert_eq!(
232+
parse_unicode(&mut &input[..], 4),
233+
Err(EscapeError::InvalidCharacters(Vec::from(b"d800")))
234+
);
235+
}
236+
}
237+
}

src/uucore/src/lib/features/format/mod.rs

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,9 @@ pub enum FormatError {
7171
EndsWithPercent(Vec<u8>),
7272
/// The escape sequence `\x` appears without a literal hexadecimal value.
7373
MissingHex,
74+
/// The hexadecimal characters represent a code point that cannot represent a
75+
/// Unicode character (e.g., a surrogate code point)
76+
InvalidCharacter(char, Vec<u8>),
7477
}
7578

7679
impl Error for FormatError {}
@@ -110,6 +113,12 @@ impl Display for FormatError {
110113
Self::NoMoreArguments => write!(f, "no more arguments"),
111114
Self::InvalidArgument(_) => write!(f, "invalid argument"),
112115
Self::MissingHex => write!(f, "missing hexadecimal number in escape"),
116+
Self::InvalidCharacter(escape_char, digits) => write!(
117+
f,
118+
"invalid universal character name \\{}{}",
119+
escape_char,
120+
String::from_utf8_lossy(digits)
121+
),
113122
}
114123
}
115124
}
@@ -186,12 +195,7 @@ pub fn parse_spec_and_escape(
186195
}
187196
[b'\\', rest @ ..] => {
188197
current = rest;
189-
Some(
190-
match parse_escape_code(&mut current, OctalParsing::default()) {
191-
Ok(c) => Ok(FormatItem::Char(c)),
192-
Err(_) => Err(FormatError::MissingHex),
193-
},
194-
)
198+
Some(parse_escape_code(&mut current, OctalParsing::default()).map(FormatItem::Char))
195199
}
196200
[c, rest @ ..] => {
197201
current = rest;

src/uucore/src/lib/features/format/spec.rs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ struct Flags {
9595
space: bool,
9696
hash: bool,
9797
zero: bool,
98+
quote: bool,
9899
}
99100

100101
impl Flags {
@@ -108,6 +109,11 @@ impl Flags {
108109
b' ' => flags.space = true,
109110
b'#' => flags.hash = true,
110111
b'0' => flags.zero = true,
112+
b'\'' => {
113+
// the thousands separator is printed with numbers using the ' flag, but
114+
// this is a no-op in the "C" locale. We only save this flag for reporting errors
115+
flags.quote = true;
116+
}
111117
_ => break,
112118
}
113119
*index += 1;
@@ -181,7 +187,7 @@ impl Spec {
181187
}
182188
}
183189
b's' => {
184-
if flags.zero || flags.hash {
190+
if flags.zero || flags.hash || flags.quote {
185191
return Err(&start[..index]);
186192
}
187193
Self::String {

src/uucore/src/lib/features/parser/num_parser.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -502,7 +502,7 @@ fn parse(
502502

503503
let ebd_result = construct_extended_big_decimal(digits, negative, base, scale, exponent);
504504

505-
// Return what has been parsed so far. It there are extra characters, mark the
505+
// Return what has been parsed so far. If there are extra characters, mark the
506506
// parsing as a partial match.
507507
if let Some((first_unparsed, _)) = chars.next() {
508508
Err(ExtendedParserError::PartialMatch(

tests/by-util/test_printf.rs

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,26 @@ fn escaped_unicode_null_byte() {
112112
.stdout_is_bytes([1u8, b'_']);
113113
}
114114

115+
#[test]
116+
fn escaped_unicode_incomplete() {
117+
for arg in ["\\u", "\\U", "\\uabc", "\\Uabcd"] {
118+
new_ucmd!()
119+
.arg(arg)
120+
.fails_with_code(1)
121+
.stderr_only("printf: missing hexadecimal number in escape\n");
122+
}
123+
}
124+
125+
#[test]
126+
fn escaped_unicode_invalid() {
127+
for arg in ["\\ud9d0", "\\U0000D8F9"] {
128+
new_ucmd!().arg(arg).fails_with_code(1).stderr_only(format!(
129+
"printf: invalid universal character name {}\n",
130+
arg
131+
));
132+
}
133+
}
134+
115135
#[test]
116136
fn escaped_percent_sign() {
117137
new_ucmd!()
@@ -317,6 +337,16 @@ fn sub_num_int_char_const_in() {
317337
.stdout_only("emoji is 128579");
318338
}
319339

340+
#[test]
341+
fn sub_num_thousands() {
342+
// For "C" locale, the thousands separator is ignored but should
343+
// not result in an error
344+
new_ucmd!()
345+
.args(&["%'i", "123456"])
346+
.succeeds()
347+
.stdout_only("123456");
348+
}
349+
320350
#[test]
321351
fn sub_num_uint() {
322352
new_ucmd!()

0 commit comments

Comments
 (0)