Skip to content

Commit 11f235d

Browse files
dspinellis authored and sylvestre committed
Use regex::bytes also for Unicode matching
Keep using fancy_regex for regexes containing backreferences. This removes a complex, unneeded matcher-selection RE and also provides a small overall performance boost.

Benchmark results:
- access-log-no-del: pre-use-bytes is 1.02 times faster than use-re-bytes
- access-log-all-del: use-re-bytes is 1.02 times faster than pre-use-bytes
- access-log-translit: pre-use-bytes is similarly fast as use-re-bytes
- access-log-complex-sub: pre-use-bytes is similarly fast as use-re-bytes
- access-log-append: pre-use-bytes is similarly fast as use-re-bytes
- remove-cr: use-re-bytes is 1.08 times faster than pre-use-bytes
- genome-subst: use-re-bytes is 1.05 times faster than pre-use-bytes
- number-fix: pre-use-bytes is 1.02 times faster than use-re-bytes
- long-script: use-re-bytes is 1.01 times faster than pre-use-bytes
- hanoi: pre-use-bytes is similarly fast as use-re-bytes
- factorial: use-re-bytes is 1.03 times faster than pre-use-bytes

Suggested by: Andrew Gallant @BurntSushi
1 parent 8665bf6 commit 11f235d

File tree

10 files changed

+89
-80
lines changed

10 files changed

+89
-80
lines changed

src/sed/fast_regex.rs

Lines changed: 14 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -27,44 +27,12 @@ use crate::sed::fast_io::IOChunk;
2727

2828
/// REs requiring the fancy_regex capabilities rather than the
2929
/// faster regex::bytes engine
30-
// Consider . as one character that requires fancy_regex,
31-
// because it can match more than one byte when matching a
32-
// two or more byte Unicode UTF-8 representation.
33-
// It is an RE . rather than a literal one in the following
34-
// example sitations.
35-
// . First character of the line
36-
// [^\\]. Second character after non \
37-
//
38-
// \*. A consumed backslash anywhere on the line
39-
// \\. An escaped backslash anywhere on the line
40-
// xx. A non-escaped sequence anywhere on the line
41-
// But the following are literal dots and can be captured by bytes:
42-
// \. escaped at the beginning of the line
43-
// x\. escaped after a non escaped \ anywhere on the line
44-
//
45-
// The following RE captures these situations.
46-
static NEEDS_FANCY_RE: Lazy<RustRegex> = Lazy::new(|| {
47-
regex::Regex::new(
48-
r"(?x) # Turn on verbose mode
49-
( # An ASCII-incompatible RE
50-
( ^ # Non-escaped: i.e. at BOL
51-
| ^[^\\] # or after a BOL non \
52-
| [^\\] {2} # or after two non \ characters
53-
| \\. # or after a consumed or escaped \
54-
)
55-
( # A potentially incompatible match
56-
\. # . matches any Unicode character
57-
| \[\^ # Bracketed -ve character class
58-
| \(\?i # (Unicode) case insensitive
59-
| \\[WwDdSsBbPp] # Unicode classes
60-
| \\[0-9] # Back-references need fancy
61-
)
62-
)
63-
| [^\x01-\x7f] # Any non-ASCII character
64-
",
65-
)
66-
.unwrap()
67-
});
30+
// False positives only result in a small performance pessimization,
31+
// so this is just a maximally sensitive, good-enough approximation.
32+
// For example, r"\\1" and r"[\1]" will match, whereas only a number
33+
// after an odd number of backslashes and outside a character class
34+
// should match.
35+
static NEEDS_FANCY_RE: Lazy<RustRegex> = Lazy::new(|| regex::Regex::new(r"\\[1-9]").unwrap());
6836

6937
/// All characters signifying that the match must be handled by an RE
7038
/// rather than by plain string pattern matching.
@@ -476,52 +444,22 @@ mod tests {
476444
#[test]
477445
fn test_needs_fancy_re_matches() {
478446
let should_match = [
479-
// Unicode classes BOL
480-
r"\p{L}+", // Unicode letter class
481-
r"\W", // \W is Unicode-aware.
482-
r"\S+", // \S is Unicode-aware.
483-
r"\d", // \d includes all Unicode digits.
484-
// Unicode classes non-BOL
485-
r"x\p{L}+", // Unicode letter class
486-
r"x\W", // \W is Unicode-aware.
487-
r"x\S+", // \S is Unicode-aware.
488-
r"x\d", // \d includes all Unicode digits.
489-
// .
490-
r".",
491-
r"x.",
492-
r"xx.",
493-
// Consumed \
494-
r"\*.",
495-
r"x\*.",
496-
// Escaped \
497-
r"\\.",
498-
r"x\\.",
499-
// Inline flags
500-
r"(?i)abc", // Unicode case-insensitive
501-
r"x(?i)abc", // Unicode case-insensitive
502447
r"(\w+):\1", // back-reference \1
503-
// Non-ASCII literals
504-
"naïve", // Contains literal non-ASCII.
505-
"café", // Contains literal non-ASCII.
506448
];
507449

508450
for pat in &should_match {
509451
assert!(
510452
NEEDS_FANCY_RE.is_match(pat),
511-
"Expected NEEDS_FANCY_RE to match: {:?}",
512-
pat
453+
"Expected NEEDS_FANCY_RE to match: {pat:?}"
513454
);
514455
}
515456
}
516457

517458
#[test]
518459
fn test_needs_fancy_re_does_not_match() {
519460
let should_not_match = [
520-
r"\.", // Escaped . at BOL
521-
r"x\.", // Escaped . at non BOL
522-
r"\[^x]", // Escaped character class
523-
r"\(?i\)", // Escaped case insesitive flag
524-
r"\\w", // Escaped Unicode class
461+
r"\ 1", // Non-adjacent
462+
r"\0", // Only \[1-9]
525463
// Simple ASCII
526464
r"foo",
527465
r"foo|bar",
@@ -531,8 +469,7 @@ mod tests {
531469
for pat in &should_not_match {
532470
assert!(
533471
!NEEDS_FANCY_RE.is_match(pat),
534-
"Expected NEEDS_FANCY_RE to NOT match: {:?}",
535-
pat
472+
"Expected NEEDS_FANCY_RE to NOT match: {pat:?}"
536473
);
537474
}
538475
}
@@ -558,8 +495,7 @@ mod tests {
558495
for pat in &should_match {
559496
assert!(
560497
NEEDS_RE.is_match(pat),
561-
"Expected NEEDS_RE to match: {:?}",
562-
pat
498+
"Expected NEEDS_RE to match: {pat:?}"
563499
);
564500
}
565501
}
@@ -579,8 +515,7 @@ mod tests {
579515
for pat in &should_not_match {
580516
assert!(
581517
!NEEDS_RE.is_match(pat),
582-
"Expected NEEDS_RE to NOT match: {:?}",
583-
pat
518+
"Expected NEEDS_RE to NOT match: {pat:?}"
584519
);
585520
}
586521
}
@@ -594,7 +529,7 @@ mod tests {
594529

595530
#[test]
596531
fn assert_fancy() {
597-
let re = Regex::new(r"\d").unwrap();
532+
let re = Regex::new(r"(.)\1").unwrap();
598533
assert!(matches!(re, Regex::Fancy(_)));
599534
}
600535

@@ -609,8 +544,7 @@ mod tests {
609544
let err = Regex::new("(").unwrap_err().to_string();
610545
assert!(
611546
err.contains("unclosed group") || err.contains("error parsing"),
612-
"Unexpected error: {}",
613-
err
547+
"Unexpected error: {err:?}"
614548
);
615549
}
616550

tests/by-util/test_sed.rs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,21 @@ check_output!(subst_re_reuse, ["-e", r"2s//M/;1s/l/L/", LINES1]);
295295
check_output!(subst_newline_class, ["-n", r"1{;N;s/[\n]/X/;p;}", LINES1]);
296296
check_output!(subst_newline_re, ["-n", r"1{;N;s/\n/X/;p;}", LINES1]);
297297

298+
// Check appropriate selection and behavior of the fast_regex matchers
299+
// Literal matcher
300+
check_output!(subst_literal_start, ["-e", r"s/^l1/L1/", LINES1]);
301+
check_output!(subst_literal_end, ["-e", r"s/2$/TWO/", LINES1]);
302+
check_output!(subst_literal, ["-e", r"s/_/-/", LINES1]);
303+
304+
// Fancy matcher
305+
check_output!(subst_backref, ["-e", r"s/l\(.\)_\1/same-number/", LINES1]);
306+
307+
// Bytes matcher with Unicode
308+
check_output!(subst_greek, ["-e", r"s/[α-ω]/G/g", "input/unicode"]);
309+
check_output!(subst_any_unicode, ["-e", r"s/.$/:-)/", "input/unicode"]);
310+
check_output!(subst_lcase, ["-e", r"s/κ/*/gi", "input/unicode"]);
311+
check_output!(subst_word, ["-E", "-e", r"s/\w+/WORD/g", "input/unicode"]);
312+
298313
#[test]
299314
fn subst_write_file() -> std::io::Result<()> {
300315
let temp = NamedTempFile::new()?;
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Hello World or Καλημέρα κόσμε or こんにちは 世界 :-)
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
same-number
2+
l1_2
3+
l1_3
4+
l1_4
5+
l1_5
6+
l1_6
7+
l1_7
8+
l1_8
9+
l1_9
10+
same-number0
11+
same-number1
12+
same-number2
13+
same-number3
14+
same-number4
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Hello World or ΚGGGGέGG GόGGG or こんにちは 世界 😀
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Hello World or *αλημέρα *όσμε or こんにちは 世界 😀
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
l1-1
2+
l1-2
3+
l1-3
4+
l1-4
5+
l1-5
6+
l1-6
7+
l1-7
8+
l1-8
9+
l1-9
10+
l1-10
11+
l1-11
12+
l1-12
13+
l1-13
14+
l1-14
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
l1_1
2+
l1_TWO
3+
l1_3
4+
l1_4
5+
l1_5
6+
l1_6
7+
l1_7
8+
l1_8
9+
l1_9
10+
l1_10
11+
l1_11
12+
l1_1TWO
13+
l1_13
14+
l1_14
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
L1_1
2+
L1_2
3+
L1_3
4+
L1_4
5+
L1_5
6+
L1_6
7+
L1_7
8+
L1_8
9+
L1_9
10+
L1_10
11+
L1_11
12+
L1_12
13+
L1_13
14+
L1_14
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
WORD WORD WORD WORD WORD WORD WORD WORD 😀

0 commit comments

Comments
 (0)