Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/target
.DS_Store
*.dSYM/
regex-filtered/re2/flake.lock
regex-filtered/re2/bench
flake.lock
.tox/
*/uv.lock
__pycache__
24 changes: 24 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Flags used to compile and link the re2 benchmark binary.
CXXFLAGS += -std=c++20 -Wall -Werror -g -fPIC -O3
LDFLAGS += -lre2

.PHONY: bench

# Runs the re2 benchmark and then the regex benchmark, both on the same
# generated device regexes and the same sample user agents, each under
# /usr/bin/time.
bench: target/bench_re2 target/devices.regexes target/release/examples/bench_regex
/usr/bin/time -l target/bench_re2 \
target/devices.regexes regex-filtered/samples/useragents.txt 100 -q
/usr/bin/time -l target/release/examples/bench_regex \
target/devices.regexes regex-filtered/samples/useragents.txt -r 100 -q

target/bench_re2: regex-filtered/re2/bench.cpp
# build re2 bench, requires re2 to be LD-able, can `nix develop` for setup
@mkdir -p target
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

target/release/examples/bench_regex: regex-filtered/examples/bench_regex.rs regex-filtered/src/*
# build regex bench
cargo build --release --example bench_regex -q

# $^ expands to both prerequisites, so the script path is followed by the
# YAML path as its argument; the script's stdout becomes the target file.
target/devices.regexes: scripts/devices ua-parser/uap-core/regexes.yaml
# compiles regexes.yaml to a list of just the device regexes (with embedded flags)
@mkdir -p target
uv run --script $^ > $@
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
.lines()
.collect::<Result<Vec<String>, _>>()?;

let f = regex_filtered::Builder::new().push_all(&regexes)?.build()?;
let f = regex_filtered::Builder::new_atom_len(2)
.push_all(&regexes)?
.build()?;
eprintln!(
"{} regexes in {}s",
regexes.len(),
Expand Down
10 changes: 0 additions & 10 deletions regex-filtered/re2/Makefile

This file was deleted.

2 changes: 1 addition & 1 deletion regex-filtered/re2/bench.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ int main(const int argc, const char* argv[]) {
std::ifstream regexes_f(argv[1]);

re2::RE2::Options opt;
re2::FilteredRE2 f(3);
re2::FilteredRE2 f(2);
int id;

std::string line;
Expand Down
121 changes: 121 additions & 0 deletions scripts/devices
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
#!/usr/bin/env python
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "pyyaml",
# ]
# ///
r"""Compiles regexes.yaml to just the device regexps, with rewriting:

- Rust's `regex` implements perl-style character classes with full
unicode semantics making them much more expensive than re2's
ascii-only semantics, so compile the most frequent ones down to
ascii classes.

regexes.yaml uses \d, \w, \s, \S, \b, and the first one is the most
common by two orders of magnitude (but \w is converted as well because
that was already done; converting \s might be a good idea too)

- Both `regex` and `re2` suffer tremendously from large bounded
repetitions as they need to create a *ton* of states to keep track
of the limit. This mostly affects memory consumption (and the issue
compounds when captures are added to the mix), but there is a minor
CPU hit as well.

In regexes.yaml, large bounded repetitions were introduced only to
limit the risks of catastrophic backtracking in backtracking
engines. Which neither re2 nor regex are.

So compile large bounded repetition (where heuristically "large" is
3 digits in the upper bound) back to simple unbounded repetitions.
Note that this is only done for a lower bound of `0` or `1`, but
that's the case of all large bounded repetitions in regexes.yaml.
"""
import string
import sys

from yaml import SafeLoader, load

def main() -> None:
    """Print the rewritten device regexes of the YAML file named by ``sys.argv[1]``.

    One line is written to stdout per entry of the file's ``device_parsers``
    list: the entry's ``regex`` after :func:`rewrite`, prefixed with an inline
    ``(?<flag>)`` group when the entry has a truthy ``regex_flag``.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # locale encoding.
    with open(sys.argv[1], encoding='utf-8') as f:
        regexes = load(f, Loader=SafeLoader)
    for dev in regexes['device_parsers']:
        # Kept separate from the file handle `f` above to avoid rebinding it.
        flag = dev.get('regex_flag')
        prefix = f'(?{flag})' if flag else ''
        print(prefix, rewrite(dev['regex']), sep='')

def rewrite(re: str) -> str:
    r"""Rewrite `re` into a cheaper, equivalent-for-our-purposes pattern.

    Two rewrites are applied in a single left-to-right scan:

    - ``\d`` and ``\w`` become explicit ASCII classes (``[0-9]`` and
      ``[A-Za-z0-9_]``), or just the class contents when they already sit
      inside a ``[...]`` character class.
    - An unescaped ``{0,NNN}`` / ``{1,NNN}`` outside a character class, whose
      upper bound has three or more digits, becomes ``*`` / ``+``.

    The scan only consumes the characters it actually recognises, so whatever
    follows a repetition (an escape, a ``[``, ...) is still processed normally.

    :param re: the source regular expression
    :returns: the rewritten pattern; `re` itself if the pattern starts with
              an unescaped ``{``
    """
    ascii_classes = {'d': '0-9', 'w': 'A-Za-z0-9_'}

    out = []
    from_ = 0  # start of the part of `re` not yet copied to `out`
    inclass = False
    pos = 0
    end = len(re)

    while pos < end:
        c = re[pos]

        if c == '\\':
            # An escape is always two characters wide; the escaped character
            # must never be interpreted as syntax, so skip both at once.
            members = ascii_classes.get(re[pos + 1:pos + 2])
            if members is not None:
                out.append(re[from_:pos])
                out.append(members if inclass else f'[{members}]')
                from_ = pos + 2
            pos += 2
            continue

        if c == '[':
            inclass = True
        elif c == ']':
            inclass = False
        elif c == '{' and not inclass:
            if pos == 0:
                # Nothing precedes the brace, so there is nothing to repeat;
                # leave such patterns completely untouched.
                return re

            lower = re[pos + 1:pos + 2]
            if lower in ('0', '1') and re[pos + 2:pos + 3] == ',':
                upper_start = pos + 3
                close = upper_start
                while close < end and re[close] in string.digits:
                    close += 1
                # "large" means at least three digits in the upper bound
                if close - upper_start > 2 and re[close:close + 1] == '}':
                    out.append(re[from_:pos])
                    out.append('*' if lower == '0' else '+')
                    # Resume right after the `}` so the next character is
                    # scanned like any other.
                    from_ = pos = close + 1
                    continue

        pos += 1

    if from_ == 0:
        # no rewrite happened
        return re
    out.append(re[from_:])
    return ''.join(out)

if __name__ == "__main__":
main()
4 changes: 2 additions & 2 deletions ua-parser/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ pub mod user_agent {
/// Initialise an empty builder.
pub fn new() -> Self {
Self {
builder: regex_filtered::Builder::new_atom_len(2),
builder: regex_filtered::Builder::new_atom_len(3),
repl: Vec::new(),
}
}
Expand Down Expand Up @@ -343,7 +343,7 @@ pub mod os {
///
pub fn new() -> Self {
Self {
builder: regex_filtered::Builder::new_atom_len(2),
builder: regex_filtered::Builder::new_atom_len(3),
repl: Vec::new(),
}
}
Expand Down