66
77use data_encoding:: BASE64 ;
88use os_display:: Quotable ;
9- use regex:: bytes:: { Captures , Regex } ;
9+ use regex:: bytes:: { Match , Regex } ;
1010use std:: {
11+ borrow:: Cow ,
1112 ffi:: OsStr ,
1213 fmt:: Display ,
1314 fs:: File ,
@@ -427,6 +428,67 @@ const DOUBLE_SPACE_REGEX: &str = r"^(?P<checksum>[a-fA-F0-9]+)\s{2}(?P<filename>
427428// In this case, we ignore the *
428429const SINGLE_SPACE_REGEX : & str = r"^(?P<checksum>[a-fA-F0-9]+)\s(?P<filename>\*?(?-u:.*))$" ;
429430
431+ /// Hold the data extracted from a checksum line.
432+ struct LineInfo {
433+ algo_name : Option < String > ,
434+ algo_bit_len : Option < usize > ,
435+ checksum : String ,
436+ filename : Vec < u8 > ,
437+
438+ regex : Regex ,
439+ }
440+
441+ impl LineInfo {
442+ fn parse ( s : impl AsRef < OsStr > , cached_regex : & mut Option < Regex > ) -> Option < Self > {
443+ let regexes = [
444+ ( Regex :: new ( ALGO_BASED_REGEX ) . unwrap ( ) , true ) ,
445+ ( Regex :: new ( DOUBLE_SPACE_REGEX ) . unwrap ( ) , false ) ,
446+ ( Regex :: new ( SINGLE_SPACE_REGEX ) . unwrap ( ) , false ) ,
447+ ( Regex :: new ( ALGO_BASED_REGEX_BASE64 ) . unwrap ( ) , false ) ,
448+ ] ;
449+
450+ let line_bytes = os_str_as_bytes ( s. as_ref ( ) ) . expect ( "UTF-8 decoding failed" ) ;
451+
452+ for ( regex, algo_based) in & regexes {
453+ if !regex. is_match ( line_bytes) {
454+ continue ;
455+ }
456+
457+ let mut r = regex. clone ( ) ;
458+ if !algo_based && cached_regex. is_some ( ) {
459+ r = cached_regex. clone ( ) . unwrap ( ) ;
460+ }
461+
462+ if let Some ( caps) = r. captures ( line_bytes) {
463+ // These unwraps are safe thanks to the regex
464+ let match_to_string = |m : Match | String :: from_utf8 ( m. as_bytes ( ) . into ( ) ) . unwrap ( ) ;
465+
466+ return Some ( Self {
467+ algo_name : caps. name ( "algo" ) . map ( match_to_string) ,
468+ algo_bit_len : caps
469+ . name ( "bits" )
470+ . map ( |m| match_to_string ( m) . parse :: < usize > ( ) . unwrap ( ) ) ,
471+ checksum : caps. name ( "checksum" ) . map ( match_to_string) . unwrap ( ) ,
472+ filename : caps. name ( "filename" ) . map ( |m| m. as_bytes ( ) . into ( ) ) . unwrap ( ) ,
473+ regex : r. clone ( ) ,
474+ } ) ;
475+ }
476+ }
477+
478+ None
479+ }
480+
481+ #[ inline]
482+ fn is_algo_based ( & self ) -> bool {
483+ self . algo_name . is_some ( )
484+ }
485+
486+ #[ inline]
487+ fn regex_str ( & self ) -> & str {
488+ self . regex . as_str ( )
489+ }
490+ }
491+
430492fn get_filename_for_output ( filename : & OsStr , input_is_stdin : bool ) -> String {
431493 if input_is_stdin {
432494 "standard input"
@@ -437,34 +499,18 @@ fn get_filename_for_output(filename: &OsStr, input_is_stdin: bool) -> String {
437499 . to_string ( )
438500}
439501
440- /// Determines the appropriate regular expression to use based on the provided lines.
441- fn determine_regex ( line : impl AsRef < OsStr > ) -> Option < ( Regex , bool ) > {
442- let regexes = [
443- ( Regex :: new ( ALGO_BASED_REGEX ) . unwrap ( ) , true ) ,
444- ( Regex :: new ( DOUBLE_SPACE_REGEX ) . unwrap ( ) , false ) ,
445- ( Regex :: new ( SINGLE_SPACE_REGEX ) . unwrap ( ) , false ) ,
446- ( Regex :: new ( ALGO_BASED_REGEX_BASE64 ) . unwrap ( ) , true ) ,
447- ] ;
448-
449- let line_bytes = os_str_as_bytes ( line. as_ref ( ) ) . expect ( "UTF-8 decoding failed" ) ;
450- for ( regex, is_algo_based) in & regexes {
451- if regex. is_match ( line_bytes) {
452- return Some ( ( regex. clone ( ) , * is_algo_based) ) ;
453- }
454- }
455-
456- None
457- }
458-
459502/// Extract the expected digest from the checksum string
460- fn get_expected_digest_as_hex_string ( caps : & Captures , chosen_regex : & Regex ) -> Option < String > {
461- // Unwraps are safe, ensured by regex.
462- let ck = caps. name ( "checksum" ) . unwrap ( ) . as_bytes ( ) ;
463-
464- if chosen_regex. as_str ( ) == ALGO_BASED_REGEX_BASE64 {
465- BASE64 . decode ( ck) . map ( hex:: encode) . ok ( )
503+ fn get_expected_digest_as_hex_string ( line_info : & LineInfo ) -> Option < Cow < str > > {
504+ let ck = & line_info. checksum ;
505+
506+ if line_info. regex_str ( ) == ALGO_BASED_REGEX_BASE64 {
507+ BASE64
508+ . decode ( ck. as_bytes ( ) )
509+ . map ( hex:: encode)
510+ . map ( Cow :: Owned )
511+ . ok ( )
466512 } else if ck. len ( ) % 2 == 0 {
467- Some ( str :: from_utf8 ( ck) . unwrap ( ) . to_string ( ) )
513+ Some ( Cow :: Borrowed ( ck) )
468514 } else {
469515 // If the length of the digest is not a multiple of 2, then it
470516 // must be improperly formatted (1 hex digit is 2 characters)
@@ -545,15 +591,14 @@ fn get_input_file(filename: &OsStr) -> UResult<Box<dyn Read>> {
545591
546592/// Extracts the algorithm name and length from the regex captures if the algo-based format is matched.
547593fn identify_algo_name_and_length (
548- caps : & Captures ,
594+ line_info : & LineInfo ,
549595 algo_name_input : Option < & str > ,
550596) -> Option < ( String , Option < usize > ) > {
551597 // When the algo-based format is matched, extract details from regex captures
552- let algorithm = caps
553- . name ( "algo" )
554- . map_or ( String :: new ( ) , |m| {
555- String :: from_utf8 ( m. as_bytes ( ) . into ( ) ) . unwrap ( )
556- } )
598+ let algorithm = line_info
599+ . algo_name
600+ . clone ( )
601+ . unwrap_or_default ( )
557602 . to_lowercase ( ) ;
558603
559604 // check if we are called with XXXsum (example: md5sum) but we detected a different algo parsing the file
@@ -568,13 +613,9 @@ fn identify_algo_name_and_length(
568613 return None ;
569614 }
570615
571- let bits = caps. name ( "bits" ) . map_or ( Some ( None ) , |m| {
572- let bits_value = String :: from_utf8 ( m. as_bytes ( ) . into ( ) )
573- . unwrap ( )
574- . parse :: < usize > ( )
575- . unwrap ( ) ;
576- if bits_value % 8 == 0 {
577- Some ( Some ( bits_value / 8 ) )
616+ let bits = line_info. algo_bitlen . map_or ( Some ( None ) , |bits| {
617+ if bits % 8 == 0 {
618+ Some ( Some ( bits / 8 ) )
578619 } else {
579620 None // Return None to signal a divisibility issue
580621 }
@@ -597,6 +638,7 @@ fn process_checksum_line(
597638 cli_algo_name : Option < & str > ,
598639 cli_algo_length : Option < usize > ,
599640 opts : ChecksumOptions ,
641+ cached_regex : & mut Option < Regex > ,
600642) -> Result < ( ) , LineCheckError > {
601643 let line_bytes = os_str_as_bytes ( line) ?;
602644
@@ -605,26 +647,30 @@ fn process_checksum_line(
605647 return Err ( LineCheckError :: Skipped ) ;
606648 }
607649
608- let ( chosen_regex, is_algo_based_format) =
609- determine_regex ( line) . ok_or ( LineCheckError :: ImproperlyFormatted ) ?;
650+ if let Some ( line_info) = LineInfo :: parse ( line, cached_regex) {
651+ // The cached regex ensures that when processing non-algo based regexes,
652+ // it cannot be changed (can't have single and double space regexes
653+ // used in the same file).
654+ if cached_regex. is_none ( ) && !line_info. is_algo_based ( ) {
655+ let _ = cached_regex. insert ( line_info. regex . clone ( ) ) ;
656+ }
610657
611- if let Some ( caps) = chosen_regex. captures ( line_bytes) {
612- let mut filename_to_check = caps. name ( "filename" ) . unwrap ( ) . as_bytes ( ) ;
658+ let mut filename_to_check = line_info. filename . as_slice ( ) ;
613659
614660 if filename_to_check. starts_with ( b"*" )
615661 && i == 0
616- && chosen_regex . as_str ( ) == SINGLE_SPACE_REGEX
662+ && line_info . regex_str ( ) == SINGLE_SPACE_REGEX
617663 {
618664 // Remove the leading asterisk if present - only for the first line
619665 filename_to_check = & filename_to_check[ 1 ..] ;
620666 }
621667
622- let expected_checksum = get_expected_digest_as_hex_string ( & caps , & chosen_regex )
668+ let expected_checksum = get_expected_digest_as_hex_string ( & line_info )
623669 . ok_or ( LineCheckError :: ImproperlyFormatted ) ?;
624670
625671 // If the algo_name is provided, we use it, otherwise we try to detect it
626- let ( algo_name, length) = if is_algo_based_format {
627- identify_algo_name_and_length ( & caps , cli_algo_name)
672+ let ( algo_name, length) = if line_info . is_algo_based ( ) {
673+ identify_algo_name_and_length ( & line_info , cli_algo_name)
628674 . ok_or ( LineCheckError :: ImproperlyFormatted ) ?
629675 } else if let Some ( a) = cli_algo_name {
630676 // When a specific algorithm name is input, use it and use the provided bits
@@ -721,6 +767,10 @@ fn process_checksum_file(
721767 let reader = BufReader :: new ( file) ;
722768 let lines = read_os_string_lines ( reader) . collect :: < Vec < _ > > ( ) ;
723769
770+ // cached_regex is used to ensure that several non algo-based checksum lines
771+ // will use the same regex.
772+ let mut cached_regex = None ;
773+
724774 for ( i, line) in lines. iter ( ) . enumerate ( ) {
725775 let line_result = process_checksum_line (
726776 filename_input,
@@ -729,6 +779,7 @@ fn process_checksum_file(
729779 cli_algo_name,
730780 cli_algo_length,
731781 opts,
782+ & mut cached_regex,
732783 ) ;
733784
734785 // Match a first time to elude critical UErrors, and increment the total
@@ -1149,52 +1200,75 @@ mod tests {
11491200 }
11501201
11511202 #[ test]
1152- fn test_determine_regex ( ) {
1203+ fn test_line_info ( ) {
1204+ let mut cached_regex = None ;
1205+
11531206 // Test algo-based regex
11541207 let line_algo_based =
11551208 OsString :: from ( "MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e" ) ;
1156- let ( regex, algo_based) = determine_regex ( & line_algo_based) . unwrap ( ) ;
1157- assert ! ( algo_based) ;
1158- assert ! ( regex. is_match( os_str_as_bytes( & line_algo_based) . unwrap( ) ) ) ;
1209+ let line_info = LineInfo :: parse ( & line_algo_based, & mut cached_regex) . unwrap ( ) ;
1210+ assert ! ( line_info. is_algo_based( ) ) ;
1211+ assert_eq ! ( line_info. algo_name. as_deref( ) , Some ( "MD5" ) ) ;
1212+ assert ! ( line_info. algo_bit_len. is_none( ) ) ;
1213+ assert_eq ! ( line_info. filename, b"example.txt" ) ;
1214+ assert_eq ! ( line_info. checksum, "d41d8cd98f00b204e9800998ecf8427e" ) ;
1215+ assert_eq ! ( line_info. regex_str( ) , ALGO_BASED_REGEX ) ;
1216+ assert ! ( cached_regex. is_none( ) ) ;
11591217
11601218 // Test double-space regex
11611219 let line_double_space = OsString :: from ( "d41d8cd98f00b204e9800998ecf8427e example.txt" ) ;
1162- let ( regex, algo_based) = determine_regex ( & line_double_space) . unwrap ( ) ;
1163- assert ! ( !algo_based) ;
1164- assert ! ( regex. is_match( os_str_as_bytes( & line_double_space) . unwrap( ) ) ) ;
1220+ let line_info = LineInfo :: parse ( & line_double_space, & mut cached_regex) . unwrap ( ) ;
1221+ assert ! ( !line_info. is_algo_based( ) ) ;
1222+ assert ! ( line_info. algo_name. is_none( ) ) ;
1223+ assert ! ( line_info. algo_bit_len. is_none( ) ) ;
1224+ assert_eq ! ( line_info. filename, b"example.txt" ) ;
1225+ assert_eq ! ( line_info. checksum, "d41d8cd98f00b204e9800998ecf8427e" ) ;
1226+ assert_eq ! ( line_info. regex_str( ) , DOUBLE_SPACE_REGEX ) ;
1227+ assert ! ( cached_regex. is_some( ) ) ;
1228+
1229+ cached_regex = None ;
11651230
11661231 // Test single-space regex
11671232 let line_single_space = OsString :: from ( "d41d8cd98f00b204e9800998ecf8427e example.txt" ) ;
1168- let ( regex, algo_based) = determine_regex ( & line_single_space) . unwrap ( ) ;
1169- assert ! ( !algo_based) ;
1170- assert ! ( regex. is_match( os_str_as_bytes( & line_single_space) . unwrap( ) ) ) ;
1233+ let line_info = LineInfo :: parse ( & line_single_space, & mut cached_regex) . unwrap ( ) ;
1234+ assert ! ( !line_info. is_algo_based( ) ) ;
1235+ assert ! ( line_info. algo_name. is_none( ) ) ;
1236+ assert ! ( line_info. algo_bit_len. is_none( ) ) ;
1237+ assert_eq ! ( line_info. filename, b"example.txt" ) ;
1238+ assert_eq ! ( line_info. checksum, "d41d8cd98f00b204e9800998ecf8427e" ) ;
1239+ assert_eq ! ( line_info. regex_str( ) , SINGLE_SPACE_REGEX ) ;
1240+ assert ! ( cached_regex. is_some( ) ) ;
1241+
1242+ cached_regex = None ;
11711243
11721244 // Test invalid checksum line
11731245 let line_invalid = OsString :: from ( "invalid checksum line" ) ;
1174- assert ! ( determine_regex( & line_invalid) . is_none( ) ) ;
1246+ assert ! ( LineInfo :: parse( & line_invalid, & mut cached_regex) . is_none( ) ) ;
1247+ assert ! ( cached_regex. is_none( ) ) ;
11751248
11761249 // Test leading space before checksum line
11771250 let line_algo_based_leading_space =
11781251 OsString :: from ( " MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e" ) ;
1179- let res = determine_regex ( & line_algo_based_leading_space) ;
1252+ let res = LineInfo :: parse ( & line_algo_based_leading_space, & mut cached_regex ) ;
11801253 assert ! ( res. is_some( ) ) ;
1181- assert_eq ! ( res. unwrap( ) . 0 . as_str( ) , ALGO_BASED_REGEX ) ;
1254+ assert_eq ! ( res. unwrap( ) . regex_str( ) , ALGO_BASED_REGEX ) ;
1255+ assert ! ( cached_regex. is_none( ) ) ;
11821256
11831257 // Test trailing space after checksum line (should fail)
11841258 let line_algo_based_leading_space =
11851259 OsString :: from ( "MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e " ) ;
1186- let res = determine_regex ( & line_algo_based_leading_space) ;
1260+ let res = LineInfo :: parse ( & line_algo_based_leading_space, & mut cached_regex ) ;
11871261 assert ! ( res. is_none( ) ) ;
1262+ assert ! ( cached_regex. is_none( ) ) ;
11881263 }
11891264
11901265 #[ test]
11911266 fn test_get_expected_digest ( ) {
1192- let re = Regex :: new ( ALGO_BASED_REGEX_BASE64 ) . unwrap ( ) ;
1193- let caps = re
1194- . captures ( b"SHA256 (empty) = 47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=" )
1195- . unwrap ( ) ;
1267+ let line = OsString :: from ( "SHA256 (empty) = 47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=" ) ;
1268+ let mut cached_regex = None ;
1269+ let line_info = LineInfo :: parse ( & line, & mut cached_regex) . unwrap ( ) ;
11961270
1197- let result = get_expected_digest_as_hex_string ( & caps , & re ) ;
1271+ let result = get_expected_digest_as_hex_string ( & line_info ) ;
11981272
11991273 assert_eq ! (
12001274 result. unwrap( ) ,
@@ -1204,12 +1278,12 @@ mod tests {
12041278
12051279 #[ test]
12061280 fn test_get_expected_checksum_invalid ( ) {
1207- let re = Regex :: new ( ALGO_BASED_REGEX_BASE64 ) . unwrap ( ) ;
1208- let caps = re
1209- . captures ( b"SHA256 (empty) = 47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU" )
1210- . unwrap ( ) ;
1281+ // The line misses a '=' at the end to be valid base64
1282+ let line = OsString :: from ( "SHA256 (empty) = 47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU" ) ;
1283+ let mut cached_regex = None ;
1284+ let line_info = LineInfo :: parse ( & line , & mut cached_regex ) . unwrap ( ) ;
12111285
1212- let result = get_expected_digest_as_hex_string ( & caps , & re ) ;
1286+ let result = get_expected_digest_as_hex_string ( & line_info ) ;
12131287
12141288 assert ! ( result. is_none( ) ) ;
12151289 }
0 commit comments