Skip to content

Commit ed15ca1

Browse files
committed
checksum: keep a cache of the first used regex for non-algo-based regexes
1 parent 5cbe876 commit ed15ca1

File tree

1 file changed

+146
-72
lines changed

1 file changed

+146
-72
lines changed

src/uucore/src/lib/features/checksum.rs

Lines changed: 146 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,9 @@
66

77
use data_encoding::BASE64;
88
use os_display::Quotable;
9-
use regex::bytes::{Captures, Regex};
9+
use regex::bytes::{Match, Regex};
1010
use std::{
11+
borrow::Cow,
1112
ffi::OsStr,
1213
fmt::Display,
1314
fs::File,
@@ -427,6 +428,67 @@ const DOUBLE_SPACE_REGEX: &str = r"^(?P<checksum>[a-fA-F0-9]+)\s{2}(?P<filename>
427428
// In this case, we ignore the *
428429
const SINGLE_SPACE_REGEX: &str = r"^(?P<checksum>[a-fA-F0-9]+)\s(?P<filename>\*?(?-u:.*))$";
429430

431+
/// Hold the data extracted from a checksum line.
432+
struct LineInfo {
433+
algo_name: Option<String>,
434+
algo_bit_len: Option<usize>,
435+
checksum: String,
436+
filename: Vec<u8>,
437+
438+
regex: Regex,
439+
}
440+
441+
impl LineInfo {
442+
fn parse(s: impl AsRef<OsStr>, cached_regex: &mut Option<Regex>) -> Option<Self> {
443+
let regexes = [
444+
(Regex::new(ALGO_BASED_REGEX).unwrap(), true),
445+
(Regex::new(DOUBLE_SPACE_REGEX).unwrap(), false),
446+
(Regex::new(SINGLE_SPACE_REGEX).unwrap(), false),
447+
(Regex::new(ALGO_BASED_REGEX_BASE64).unwrap(), false),
448+
];
449+
450+
let line_bytes = os_str_as_bytes(s.as_ref()).expect("UTF-8 decoding failed");
451+
452+
for (regex, algo_based) in &regexes {
453+
if !regex.is_match(line_bytes) {
454+
continue;
455+
}
456+
457+
let mut r = regex.clone();
458+
if !algo_based && cached_regex.is_some() {
459+
r = cached_regex.clone().unwrap();
460+
}
461+
462+
if let Some(caps) = r.captures(line_bytes) {
463+
// These unwraps are safe thanks to the regex
464+
let match_to_string = |m: Match| String::from_utf8(m.as_bytes().into()).unwrap();
465+
466+
return Some(Self {
467+
algo_name: caps.name("algo").map(match_to_string),
468+
algo_bit_len: caps
469+
.name("bits")
470+
.map(|m| match_to_string(m).parse::<usize>().unwrap()),
471+
checksum: caps.name("checksum").map(match_to_string).unwrap(),
472+
filename: caps.name("filename").map(|m| m.as_bytes().into()).unwrap(),
473+
regex: r.clone(),
474+
});
475+
}
476+
}
477+
478+
None
479+
}
480+
481+
#[inline]
482+
fn is_algo_based(&self) -> bool {
483+
self.algo_name.is_some()
484+
}
485+
486+
#[inline]
487+
fn regex_str(&self) -> &str {
488+
self.regex.as_str()
489+
}
490+
}
491+
430492
fn get_filename_for_output(filename: &OsStr, input_is_stdin: bool) -> String {
431493
if input_is_stdin {
432494
"standard input"
@@ -437,34 +499,18 @@ fn get_filename_for_output(filename: &OsStr, input_is_stdin: bool) -> String {
437499
.to_string()
438500
}
439501

440-
/// Determines the appropriate regular expression to use based on the provided lines.
441-
fn determine_regex(line: impl AsRef<OsStr>) -> Option<(Regex, bool)> {
442-
let regexes = [
443-
(Regex::new(ALGO_BASED_REGEX).unwrap(), true),
444-
(Regex::new(DOUBLE_SPACE_REGEX).unwrap(), false),
445-
(Regex::new(SINGLE_SPACE_REGEX).unwrap(), false),
446-
(Regex::new(ALGO_BASED_REGEX_BASE64).unwrap(), true),
447-
];
448-
449-
let line_bytes = os_str_as_bytes(line.as_ref()).expect("UTF-8 decoding failed");
450-
for (regex, is_algo_based) in &regexes {
451-
if regex.is_match(line_bytes) {
452-
return Some((regex.clone(), *is_algo_based));
453-
}
454-
}
455-
456-
None
457-
}
458-
459502
/// Extract the expected digest from the checksum string
460-
fn get_expected_digest_as_hex_string(caps: &Captures, chosen_regex: &Regex) -> Option<String> {
461-
// Unwraps are safe, ensured by regex.
462-
let ck = caps.name("checksum").unwrap().as_bytes();
463-
464-
if chosen_regex.as_str() == ALGO_BASED_REGEX_BASE64 {
465-
BASE64.decode(ck).map(hex::encode).ok()
503+
fn get_expected_digest_as_hex_string(line_info: &LineInfo) -> Option<Cow<str>> {
504+
let ck = &line_info.checksum;
505+
506+
if line_info.regex_str() == ALGO_BASED_REGEX_BASE64 {
507+
BASE64
508+
.decode(ck.as_bytes())
509+
.map(hex::encode)
510+
.map(Cow::Owned)
511+
.ok()
466512
} else if ck.len() % 2 == 0 {
467-
Some(str::from_utf8(ck).unwrap().to_string())
513+
Some(Cow::Borrowed(ck))
468514
} else {
469515
// If the length of the digest is not a multiple of 2, then it
470516
// must be improperly formatted (1 hex digit is 2 characters)
@@ -545,15 +591,14 @@ fn get_input_file(filename: &OsStr) -> UResult<Box<dyn Read>> {
545591

546592
/// Extracts the algorithm name and length from the regex captures if the algo-based format is matched.
547593
fn identify_algo_name_and_length(
548-
caps: &Captures,
594+
line_info: &LineInfo,
549595
algo_name_input: Option<&str>,
550596
) -> Option<(String, Option<usize>)> {
551597
// When the algo-based format is matched, extract details from regex captures
552-
let algorithm = caps
553-
.name("algo")
554-
.map_or(String::new(), |m| {
555-
String::from_utf8(m.as_bytes().into()).unwrap()
556-
})
598+
let algorithm = line_info
599+
.algo_name
600+
.clone()
601+
.unwrap_or_default()
557602
.to_lowercase();
558603

559604
// check if we are called with XXXsum (example: md5sum) but we detected a different algo parsing the file
@@ -568,13 +613,9 @@ fn identify_algo_name_and_length(
568613
return None;
569614
}
570615

571-
let bits = caps.name("bits").map_or(Some(None), |m| {
572-
let bits_value = String::from_utf8(m.as_bytes().into())
573-
.unwrap()
574-
.parse::<usize>()
575-
.unwrap();
576-
if bits_value % 8 == 0 {
577-
Some(Some(bits_value / 8))
616+
let bits = line_info.algo_bit_len.map_or(Some(None), |bits| {
617+
if bits % 8 == 0 {
618+
Some(Some(bits / 8))
578619
} else {
579620
None // Return None to signal a divisibility issue
580621
}
@@ -597,6 +638,7 @@ fn process_checksum_line(
597638
cli_algo_name: Option<&str>,
598639
cli_algo_length: Option<usize>,
599640
opts: ChecksumOptions,
641+
cached_regex: &mut Option<Regex>,
600642
) -> Result<(), LineCheckError> {
601643
let line_bytes = os_str_as_bytes(line)?;
602644

@@ -605,26 +647,30 @@ fn process_checksum_line(
605647
return Err(LineCheckError::Skipped);
606648
}
607649

608-
let (chosen_regex, is_algo_based_format) =
609-
determine_regex(line).ok_or(LineCheckError::ImproperlyFormatted)?;
650+
if let Some(line_info) = LineInfo::parse(line, cached_regex) {
651+
// The cached regex ensures that when processing non-algo based regexes,
652+
// it cannot be changed (can't have single and double space regexes
653+
// used in the same file).
654+
if cached_regex.is_none() && !line_info.is_algo_based() {
655+
let _ = cached_regex.insert(line_info.regex.clone());
656+
}
610657

611-
if let Some(caps) = chosen_regex.captures(line_bytes) {
612-
let mut filename_to_check = caps.name("filename").unwrap().as_bytes();
658+
let mut filename_to_check = line_info.filename.as_slice();
613659

614660
if filename_to_check.starts_with(b"*")
615661
&& i == 0
616-
&& chosen_regex.as_str() == SINGLE_SPACE_REGEX
662+
&& line_info.regex_str() == SINGLE_SPACE_REGEX
617663
{
618664
// Remove the leading asterisk if present - only for the first line
619665
filename_to_check = &filename_to_check[1..];
620666
}
621667

622-
let expected_checksum = get_expected_digest_as_hex_string(&caps, &chosen_regex)
668+
let expected_checksum = get_expected_digest_as_hex_string(&line_info)
623669
.ok_or(LineCheckError::ImproperlyFormatted)?;
624670

625671
// If the algo_name is provided, we use it, otherwise we try to detect it
626-
let (algo_name, length) = if is_algo_based_format {
627-
identify_algo_name_and_length(&caps, cli_algo_name)
672+
let (algo_name, length) = if line_info.is_algo_based() {
673+
identify_algo_name_and_length(&line_info, cli_algo_name)
628674
.ok_or(LineCheckError::ImproperlyFormatted)?
629675
} else if let Some(a) = cli_algo_name {
630676
// When a specific algorithm name is input, use it and use the provided bits
@@ -721,6 +767,10 @@ fn process_checksum_file(
721767
let reader = BufReader::new(file);
722768
let lines = read_os_string_lines(reader).collect::<Vec<_>>();
723769

770+
// cached_regex is used to ensure that several non algo-based checksum lines
771+
// will use the same regex.
772+
let mut cached_regex = None;
773+
724774
for (i, line) in lines.iter().enumerate() {
725775
let line_result = process_checksum_line(
726776
filename_input,
@@ -729,6 +779,7 @@ fn process_checksum_file(
729779
cli_algo_name,
730780
cli_algo_length,
731781
opts,
782+
&mut cached_regex,
732783
);
733784

734785
// Match a first time to elude critical UErrors, and increment the total
@@ -1149,52 +1200,75 @@ mod tests {
11491200
}
11501201

11511202
#[test]
1152-
fn test_determine_regex() {
1203+
fn test_line_info() {
1204+
let mut cached_regex = None;
1205+
11531206
// Test algo-based regex
11541207
let line_algo_based =
11551208
OsString::from("MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e");
1156-
let (regex, algo_based) = determine_regex(&line_algo_based).unwrap();
1157-
assert!(algo_based);
1158-
assert!(regex.is_match(os_str_as_bytes(&line_algo_based).unwrap()));
1209+
let line_info = LineInfo::parse(&line_algo_based, &mut cached_regex).unwrap();
1210+
assert!(line_info.is_algo_based());
1211+
assert_eq!(line_info.algo_name.as_deref(), Some("MD5"));
1212+
assert!(line_info.algo_bit_len.is_none());
1213+
assert_eq!(line_info.filename, b"example.txt");
1214+
assert_eq!(line_info.checksum, "d41d8cd98f00b204e9800998ecf8427e");
1215+
assert_eq!(line_info.regex_str(), ALGO_BASED_REGEX);
1216+
assert!(cached_regex.is_none());
11591217

11601218
// Test double-space regex
11611219
let line_double_space = OsString::from("d41d8cd98f00b204e9800998ecf8427e  example.txt");
1162-
let (regex, algo_based) = determine_regex(&line_double_space).unwrap();
1163-
assert!(!algo_based);
1164-
assert!(regex.is_match(os_str_as_bytes(&line_double_space).unwrap()));
1220+
let line_info = LineInfo::parse(&line_double_space, &mut cached_regex).unwrap();
1221+
assert!(!line_info.is_algo_based());
1222+
assert!(line_info.algo_name.is_none());
1223+
assert!(line_info.algo_bit_len.is_none());
1224+
assert_eq!(line_info.filename, b"example.txt");
1225+
assert_eq!(line_info.checksum, "d41d8cd98f00b204e9800998ecf8427e");
1226+
assert_eq!(line_info.regex_str(), DOUBLE_SPACE_REGEX);
1227+
assert!(cached_regex.is_some());
1228+
1229+
cached_regex = None;
11651230

11661231
// Test single-space regex
11671232
let line_single_space = OsString::from("d41d8cd98f00b204e9800998ecf8427e example.txt");
1168-
let (regex, algo_based) = determine_regex(&line_single_space).unwrap();
1169-
assert!(!algo_based);
1170-
assert!(regex.is_match(os_str_as_bytes(&line_single_space).unwrap()));
1233+
let line_info = LineInfo::parse(&line_single_space, &mut cached_regex).unwrap();
1234+
assert!(!line_info.is_algo_based());
1235+
assert!(line_info.algo_name.is_none());
1236+
assert!(line_info.algo_bit_len.is_none());
1237+
assert_eq!(line_info.filename, b"example.txt");
1238+
assert_eq!(line_info.checksum, "d41d8cd98f00b204e9800998ecf8427e");
1239+
assert_eq!(line_info.regex_str(), SINGLE_SPACE_REGEX);
1240+
assert!(cached_regex.is_some());
1241+
1242+
cached_regex = None;
11711243

11721244
// Test invalid checksum line
11731245
let line_invalid = OsString::from("invalid checksum line");
1174-
assert!(determine_regex(&line_invalid).is_none());
1246+
assert!(LineInfo::parse(&line_invalid, &mut cached_regex).is_none());
1247+
assert!(cached_regex.is_none());
11751248

11761249
// Test leading space before checksum line
11771250
let line_algo_based_leading_space =
11781251
OsString::from(" MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e");
1179-
let res = determine_regex(&line_algo_based_leading_space);
1252+
let res = LineInfo::parse(&line_algo_based_leading_space, &mut cached_regex);
11801253
assert!(res.is_some());
1181-
assert_eq!(res.unwrap().0.as_str(), ALGO_BASED_REGEX);
1254+
assert_eq!(res.unwrap().regex_str(), ALGO_BASED_REGEX);
1255+
assert!(cached_regex.is_none());
11821256

11831257
// Test trailing space after checksum line (should fail)
11841258
let line_algo_based_leading_space =
11851259
OsString::from("MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e ");
1186-
let res = determine_regex(&line_algo_based_leading_space);
1260+
let res = LineInfo::parse(&line_algo_based_leading_space, &mut cached_regex);
11871261
assert!(res.is_none());
1262+
assert!(cached_regex.is_none());
11881263
}
11891264

11901265
#[test]
11911266
fn test_get_expected_digest() {
1192-
let re = Regex::new(ALGO_BASED_REGEX_BASE64).unwrap();
1193-
let caps = re
1194-
.captures(b"SHA256 (empty) = 47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=")
1195-
.unwrap();
1267+
let line = OsString::from("SHA256 (empty) = 47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=");
1268+
let mut cached_regex = None;
1269+
let line_info = LineInfo::parse(&line, &mut cached_regex).unwrap();
11961270

1197-
let result = get_expected_digest_as_hex_string(&caps, &re);
1271+
let result = get_expected_digest_as_hex_string(&line_info);
11981272

11991273
assert_eq!(
12001274
result.unwrap(),
@@ -1204,12 +1278,12 @@ mod tests {
12041278

12051279
#[test]
12061280
fn test_get_expected_checksum_invalid() {
1207-
let re = Regex::new(ALGO_BASED_REGEX_BASE64).unwrap();
1208-
let caps = re
1209-
.captures(b"SHA256 (empty) = 47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU")
1210-
.unwrap();
1281+
// The line misses a '=' at the end to be valid base64
1282+
let line = OsString::from("SHA256 (empty) = 47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU");
1283+
let mut cached_regex = None;
1284+
let line_info = LineInfo::parse(&line, &mut cached_regex).unwrap();
12111285

1212-
let result = get_expected_digest_as_hex_string(&caps, &re);
1286+
let result = get_expected_digest_as_hex_string(&line_info);
12131287

12141288
assert!(result.is_none());
12151289
}

0 commit comments

Comments
 (0)