Use regex::bytes also for Unicode matching

dspinellis · sylvestre · commit 11f235d6f024 · 2025-12-16T21:07:09.000+01:00
Keep using fancy_regex for regexes containing backreferences. This removes a complex, unneeded matcher-selection RE and also provides a small overall performance boost.

Benchmark results (pre-use-bytes = before this change, use-re-bytes = after this change):
- access-log-no-del: pre-use-bytes is 1.02 times faster than use-re-bytes
- access-log-all-del: use-re-bytes is 1.02 times faster than pre-use-bytes
- access-log-translit: pre-use-bytes is similarly fast as use-re-bytes
- access-log-complex-sub: pre-use-bytes is similarly fast as use-re-bytes
- access-log-append: pre-use-bytes is similarly fast as use-re-bytes
- remove-cr: use-re-bytes is 1.08 times faster than pre-use-bytes
- genome-subst: use-re-bytes is 1.05 times faster than pre-use-bytes
- number-fix: pre-use-bytes is 1.02 times faster than use-re-bytes
- long-script: use-re-bytes is 1.01 times faster than pre-use-bytes
- hanoi: pre-use-bytes is similarly fast as use-re-bytes
- factorial: use-re-bytes is 1.03 times faster than pre-use-bytes

Suggested by: Andrew Gallant @BurntSushi
diff --git a/src/sed/fast_regex.rs b/src/sed/fast_regex.rs
@@ -27,44 +27,12 @@ use crate::sed::fast_io::IOChunk;
 
 /// REs requiring the fancy_regex capabilities rather than the
 /// faster regex::bytes engine
-// Consider . as one character that requires fancy_regex,
-// because it can match more than one byte when matching a
-// two or more byte Unicode UTF-8 representation.
-// It is an RE . rather than a literal one in the following
-// example sitations.
-// .        First character of the line
-// [^\\].   Second character after non \
-//
-//   \*.    A consumed backslash anywhere on the line
-//   \\.    An escaped backslash anywhere on the line
-//   xx.    A non-escaped sequence anywhere on the line
-// But the following are literal dots and can be captured by bytes:
-// \.       escaped at the beginning of the line
-//   x\.    escaped after a non escaped \ anywhere on the line
-//
-// The following RE captures these situations.
-static NEEDS_FANCY_RE: Lazy<RustRegex> = Lazy::new(|| {
-    regex::Regex::new(
-        r"(?x) # Turn on verbose mode
-          (                       # An ASCII-incompatible RE
-            ( ^                   # Non-escaped: i.e. at BOL
-              | ^[^\\]            # or after a BOL non \
-              | [^\\] {2}         # or after two non \ characters
-              | \\.               # or after a consumed or escaped \
-            )
-            (                     # A potentially incompatible match
-              \.                  # . matches any Unicode character
-              | \[\^              # Bracketed -ve character class
-              | \(\?i             # (Unicode) case insensitive
-              | \\[WwDdSsBbPp]    # Unicode classes
-              | \\[0-9]           # Back-references need fancy
-            )
-          )
-          | [^\x01-\x7f]          # Any non-ASCII character
-        ",
-    )
-    .unwrap()
-});
+// False positives only result in a small performance pessimization,
+// so this is just a maximally sensitive, good-enough approximation.
+// For example, r"\\1" and r"[\1]" will match, whereas only a number
+// after an odd number of backslashes and outside a character class
+// should match.
+static NEEDS_FANCY_RE: Lazy<RustRegex> = Lazy::new(|| regex::Regex::new(r"\\[1-9]").unwrap());
 
 /// All characters signifying that the match must be handled by an RE
 /// rather than by plain string pattern matching.
@@ -476,52 +444,22 @@ mod tests {
     #[test]
     fn test_needs_fancy_re_matches() {
         let should_match = [
-            // Unicode classes BOL
-            r"\p{L}+", // Unicode letter class
-            r"\W",     // \W is Unicode-aware.
-            r"\S+",    // \S is Unicode-aware.
-            r"\d",     // \d includes all Unicode digits.
-            // Unicode classes non-BOL
-            r"x\p{L}+", // Unicode letter class
-            r"x\W",     // \W is Unicode-aware.
-            r"x\S+",    // \S is Unicode-aware.
-            r"x\d",     // \d includes all Unicode digits.
-            // .
-            r".",
-            r"x.",
-            r"xx.",
-            // Consumed \
-            r"\*.",
-            r"x\*.",
-            // Escaped \
-            r"\\.",
-            r"x\\.",
-            // Inline flags
-            r"(?i)abc",  // Unicode case-insensitive
-            r"x(?i)abc", // Unicode case-insensitive
             r"(\w+):\1", // back-reference \1
-            // Non-ASCII literals
-            "naïve", // Contains literal non-ASCII.
-            "café",  // Contains literal non-ASCII.
         ];
 
         for pat in &should_match {
             assert!(
                 NEEDS_FANCY_RE.is_match(pat),
-                "Expected NEEDS_FANCY_RE to match: {:?}",
-                pat
+                "Expected NEEDS_FANCY_RE to match: {pat:?}"
             );
         }
     }
 
     #[test]
     fn test_needs_fancy_re_does_not_match() {
         let should_not_match = [
-            r"\.",     // Escaped . at BOL
-            r"x\.",    // Escaped . at non BOL
-            r"\[^x]",  // Escaped character class
-            r"\(?i\)", // Escaped case insesitive flag
-            r"\\w",    // Escaped Unicode class
+            r"\ 1", // Non-adjacent
+            r"\0",  // Only \[1-9]
             // Simple ASCII
             r"foo",
             r"foo|bar",
@@ -531,8 +469,7 @@ mod tests {
         for pat in &should_not_match {
             assert!(
                 !NEEDS_FANCY_RE.is_match(pat),
-                "Expected NEEDS_FANCY_RE to NOT match: {:?}",
-                pat
+                "Expected NEEDS_FANCY_RE to NOT match: {pat:?}"
             );
         }
     }
@@ -558,8 +495,7 @@ mod tests {
         for pat in &should_match {
             assert!(
                 NEEDS_RE.is_match(pat),
-                "Expected NEEDS_RE to match: {:?}",
-                pat
+                "Expected NEEDS_RE to match: {pat:?}"
             );
         }
     }
@@ -579,8 +515,7 @@ mod tests {
         for pat in &should_not_match {
             assert!(
                 !NEEDS_RE.is_match(pat),
-                "Expected NEEDS_RE to NOT match: {:?}",
-                pat
+                "Expected NEEDS_RE to NOT match: {pat:?}"
             );
         }
     }
@@ -594,7 +529,7 @@ mod tests {
 
     #[test]
     fn assert_fancy() {
-        let re = Regex::new(r"\d").unwrap();
+        let re = Regex::new(r"(.)\1").unwrap();
         assert!(matches!(re, Regex::Fancy(_)));
     }
 
@@ -609,8 +544,7 @@ mod tests {
         let err = Regex::new("(").unwrap_err().to_string();
         assert!(
             err.contains("unclosed group") || err.contains("error parsing"),
-            "Unexpected error: {}",
-            err
+            "Unexpected error: {err:?}"
         );
     }
 
diff --git a/tests/by-util/test_sed.rs b/tests/by-util/test_sed.rs
@@ -295,6 +295,21 @@ check_output!(subst_re_reuse, ["-e", r"2s//M/;1s/l/L/", LINES1]);
 check_output!(subst_newline_class, ["-n", r"1{;N;s/[\n]/X/;p;}", LINES1]);
 check_output!(subst_newline_re, ["-n", r"1{;N;s/\n/X/;p;}", LINES1]);
 
+// Check appropriate selection and behavior of the fast_regex matcher
+// Literal matcher
+check_output!(subst_literal_start, ["-e", r"s/^l1/L1/", LINES1]);
+check_output!(subst_literal_end, ["-e", r"s/2$/TWO/", LINES1]);
+check_output!(subst_literal, ["-e", r"s/_/-/", LINES1]);
+
+// Fancy matcher
+check_output!(subst_backref, ["-e", r"s/l\(.\)_\1/same-number/", LINES1]);
+
+// Bytes matcher with Unicode
+check_output!(subst_greek, ["-e", r"s/[α-ω]/G/g", "input/unicode"]);
+check_output!(subst_any_unicode, ["-e", r"s/.$/:-)/", "input/unicode"]);
+check_output!(subst_lcase, ["-e", r"s/κ/*/gi", "input/unicode"]);
+check_output!(subst_word, ["-E", "-e", r"s/\w+/WORD/g", "input/unicode"]);
+
 #[test]
 fn subst_write_file() -> std::io::Result<()> {
     let temp = NamedTempFile::new()?;
diff --git a/tests/fixtures/sed/output/subst_any_unicode b/tests/fixtures/sed/output/subst_any_unicode
@@ -0,0 +1 @@
+Hello World or Καλημέρα κόσμε or こんにちは 世界 :-)
diff --git a/tests/fixtures/sed/output/subst_backref b/tests/fixtures/sed/output/subst_backref
@@ -0,0 +1,14 @@
+same-number
+l1_2
+l1_3
+l1_4
+l1_5
+l1_6
+l1_7
+l1_8
+l1_9
+same-number0
+same-number1
+same-number2
+same-number3
+same-number4
diff --git a/tests/fixtures/sed/output/subst_greek b/tests/fixtures/sed/output/subst_greek
@@ -0,0 +1 @@
+Hello World or ΚGGGGέGG GόGGG or こんにちは 世界 😀
diff --git a/tests/fixtures/sed/output/subst_lcase b/tests/fixtures/sed/output/subst_lcase
@@ -0,0 +1 @@
+Hello World or *αλημέρα *όσμε or こんにちは 世界 😀
diff --git a/tests/fixtures/sed/output/subst_literal b/tests/fixtures/sed/output/subst_literal
@@ -0,0 +1,14 @@
+l1-1
+l1-2
+l1-3
+l1-4
+l1-5
+l1-6
+l1-7
+l1-8
+l1-9
+l1-10
+l1-11
+l1-12
+l1-13
+l1-14
diff --git a/tests/fixtures/sed/output/subst_literal_end b/tests/fixtures/sed/output/subst_literal_end
@@ -0,0 +1,14 @@
+l1_1
+l1_TWO
+l1_3
+l1_4
+l1_5
+l1_6
+l1_7
+l1_8
+l1_9
+l1_10
+l1_11
+l1_1TWO
+l1_13
+l1_14
diff --git a/tests/fixtures/sed/output/subst_literal_start b/tests/fixtures/sed/output/subst_literal_start
@@ -0,0 +1,14 @@
+L1_1
+L1_2
+L1_3
+L1_4
+L1_5
+L1_6
+L1_7
+L1_8
+L1_9
+L1_10
+L1_11
+L1_12
+L1_13
+L1_14
diff --git a/tests/fixtures/sed/output/subst_word b/tests/fixtures/sed/output/subst_word
@@ -0,0 +1 @@
+WORD WORD WORD WORD WORD WORD WORD WORD 😀

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Hello World or Καλημέρα κόσμε or こんにちは 世界 :-)`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Hello World or ΚGGGGέGG GόGGG or こんにちは 世界 😀`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Hello World or *αλημέρα *όσμε or こんにちは 世界 😀`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,14 @@`
 +l1-1
 +l1-2
 +l1-3
 +l1-4
 +l1-5
 +l1-6
 +l1-7
 +l1-8
 +l1-9
 +l1-10
 +l1-11
 +l1-12
 +l1-13
 +l1-14
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+WORD WORD WORD WORD WORD WORD WORD WORD 😀`