@@ -27,44 +27,12 @@ use crate::sed::fast_io::IOChunk;
2727
2828/// REs requiring the fancy_regex capabilities rather than the
2929/// faster regex::bytes engine
30- // Consider . as one character that requires fancy_regex,
31- // because it can match more than one byte when matching a
32- // two or more byte Unicode UTF-8 representation.
33- // It is an RE . rather than a literal one in the following
34- // example situations.
35- // . First character of the line
36- // [^\\]. Second character after non \
37- //
38- // \*. A consumed backslash anywhere on the line
39- // \\. An escaped backslash anywhere on the line
40- // xx. A non-escaped sequence anywhere on the line
41- // But the following are literal dots and can be captured by bytes:
42- // \. escaped at the beginning of the line
43- // x\. escaped after a non escaped \ anywhere on the line
44- //
45- // The following RE captures these situations.
46- static NEEDS_FANCY_RE : Lazy < RustRegex > = Lazy :: new ( || {
47- regex:: Regex :: new (
48- r"(?x) # Turn on verbose mode
49- ( # An ASCII-incompatible RE
50- ( ^ # Non-escaped: i.e. at BOL
51- | ^[^\\] # or after a BOL non \
52- | [^\\] {2} # or after two non \ characters
53- | \\. # or after a consumed or escaped \
54- )
55- ( # A potentially incompatible match
56- \. # . matches any Unicode character
57- | \[\^ # Bracketed -ve character class
58- | \(\?i # (Unicode) case insensitive
59- | \\[WwDdSsBbPp] # Unicode classes
60- | \\[0-9] # Back-references need fancy
61- )
62- )
63- | [^\x01-\x7f] # Any non-ASCII character
64- " ,
65- )
66- . unwrap ( )
67- } ) ;
30+ // False positives only result in a small performance pessimization,
31+ // so this is just a maximally sensitive, good-enough approximation.
32+ // For example, r"\\1" and r"[\1]" will match, whereas only a number
33+ // after an odd number of backslashes and outside a character class
34+ // should match.
35+ static NEEDS_FANCY_RE : Lazy < RustRegex > = Lazy :: new ( || regex:: Regex :: new ( r"\\[1-9]" ) . unwrap ( ) ) ;
6836
6937/// All characters signifying that the match must be handled by an RE
7038/// rather than by plain string pattern matching.
@@ -476,52 +444,22 @@ mod tests {
476444 #[ test]
477445 fn test_needs_fancy_re_matches ( ) {
478446 let should_match = [
479- // Unicode classes BOL
480- r"\p{L}+" , // Unicode letter class
481- r"\W" , // \W is Unicode-aware.
482- r"\S+" , // \S is Unicode-aware.
483- r"\d" , // \d includes all Unicode digits.
484- // Unicode classes non-BOL
485- r"x\p{L}+" , // Unicode letter class
486- r"x\W" , // \W is Unicode-aware.
487- r"x\S+" , // \S is Unicode-aware.
488- r"x\d" , // \d includes all Unicode digits.
489- // .
490- r"." ,
491- r"x." ,
492- r"xx." ,
493- // Consumed \
494- r"\*." ,
495- r"x\*." ,
496- // Escaped \
497- r"\\." ,
498- r"x\\." ,
499- // Inline flags
500- r"(?i)abc" , // Unicode case-insensitive
501- r"x(?i)abc" , // Unicode case-insensitive
502447 r"(\w+):\1" , // back-reference \1
503- // Non-ASCII literals
504- "naïve" , // Contains literal non-ASCII.
505- "café" , // Contains literal non-ASCII.
506448 ] ;
507449
508450 for pat in & should_match {
509451 assert ! (
510452 NEEDS_FANCY_RE . is_match( pat) ,
511- "Expected NEEDS_FANCY_RE to match: {:?}" ,
512- pat
453+ "Expected NEEDS_FANCY_RE to match: {pat:?}"
513454 ) ;
514455 }
515456 }
516457
517458 #[ test]
518459 fn test_needs_fancy_re_does_not_match ( ) {
519460 let should_not_match = [
520- r"\." , // Escaped . at BOL
521- r"x\." , // Escaped . at non BOL
522- r"\[^x]" , // Escaped character class
523- r"\(?i\)" , // Escaped case insensitive flag
524- r"\\w" , // Escaped Unicode class
461+ r"\ 1" , // Non-adjacent
462+ r"\0" , // Only \[1-9]
525463 // Simple ASCII
526464 r"foo" ,
527465 r"foo|bar" ,
@@ -531,8 +469,7 @@ mod tests {
531469 for pat in & should_not_match {
532470 assert ! (
533471 !NEEDS_FANCY_RE . is_match( pat) ,
534- "Expected NEEDS_FANCY_RE to NOT match: {:?}" ,
535- pat
472+ "Expected NEEDS_FANCY_RE to NOT match: {pat:?}"
536473 ) ;
537474 }
538475 }
@@ -558,8 +495,7 @@ mod tests {
558495 for pat in & should_match {
559496 assert ! (
560497 NEEDS_RE . is_match( pat) ,
561- "Expected NEEDS_RE to match: {:?}" ,
562- pat
498+ "Expected NEEDS_RE to match: {pat:?}"
563499 ) ;
564500 }
565501 }
@@ -579,8 +515,7 @@ mod tests {
579515 for pat in & should_not_match {
580516 assert ! (
581517 !NEEDS_RE . is_match( pat) ,
582- "Expected NEEDS_RE to NOT match: {:?}" ,
583- pat
518+ "Expected NEEDS_RE to NOT match: {pat:?}"
584519 ) ;
585520 }
586521 }
@@ -594,7 +529,7 @@ mod tests {
594529
595530 #[ test]
596531 fn assert_fancy ( ) {
597- let re = Regex :: new ( r"\d " ) . unwrap ( ) ;
532+ let re = Regex :: new ( r"(.)\1 " ) . unwrap ( ) ;
598533 assert ! ( matches!( re, Regex :: Fancy ( _) ) ) ;
599534 }
600535
@@ -609,8 +544,7 @@ mod tests {
609544 let err = Regex :: new ( "(" ) . unwrap_err ( ) . to_string ( ) ;
610545 assert ! (
611546 err. contains( "unclosed group" ) || err. contains( "error parsing" ) ,
612- "Unexpected error: {}" ,
613- err
547+ "Unexpected error: {err:?}"
614548 ) ;
615549 }
616550
0 commit comments