diff --git a/.gitignore b/.gitignore index 53f7eb9..f0f12dc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,7 @@ /target .DS_Store *.dSYM/ -regex-filtered/re2/flake.lock -regex-filtered/re2/bench +flake.lock .tox/ +*/uv.lock __pycache__ diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..e4f6357 --- /dev/null +++ b/Makefile @@ -0,0 +1,24 @@ +CXXFLAGS += -std=c++20 -Wall -Werror -g -fPIC -O3 +LDFLAGS += -lre2 + +.PHONY: bench + +bench: target/bench_re2 target/devices.regexes target/release/examples/bench_regex + /usr/bin/time -l target/bench_re2 \ + target/devices.regexes regex-filtered/samples/useragents.txt 100 -q + /usr/bin/time -l target/release/examples/bench_regex \ + target/devices.regexes regex-filtered/samples/useragents.txt -r 100 -q + +target/bench_re2: regex-filtered/re2/bench.cpp + # build re2 bench, requires re2 to be LD-able, can `nix develop` for setup + @mkdir -p target + $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) + +target/release/examples/bench_regex: regex-filtered/examples/bench_regex.rs regex-filtered/src/* + # build regex bench + cargo build --release --example bench_regex -q + +target/devices.regexes: scripts/devices ua-parser/uap-core/regexes.yaml + # compiles regexes.yaml to a list of just the device regex (with embedded flags) + @mkdir -p target + uv run --script $^ > $@ diff --git a/regex-filtered/re2/flake.nix b/flake.nix similarity index 100% rename from regex-filtered/re2/flake.nix rename to flake.nix diff --git a/regex-filtered/examples/bench.rs b/regex-filtered/examples/bench_regex.rs similarity index 93% rename from regex-filtered/examples/bench.rs rename to regex-filtered/examples/bench_regex.rs index 17646df..7e00d1b 100644 --- a/regex-filtered/examples/bench.rs +++ b/regex-filtered/examples/bench_regex.rs @@ -27,7 +27,9 @@ fn main() -> Result<(), Box> { .lines() .collect::, _>>()?; - let f = regex_filtered::Builder::new().push_all(&regexes)?.build()?; + let f = regex_filtered::Builder::new_atom_len(2) + 
.push_all(&regexes)? + .build()?; eprintln!( "{} regexes in {}s", regexes.len(), diff --git a/regex-filtered/re2/Makefile b/regex-filtered/re2/Makefile deleted file mode 100644 index 2e972bb..0000000 --- a/regex-filtered/re2/Makefile +++ /dev/null @@ -1,10 +0,0 @@ -CXXFLAGS += -std=c++20 -Wall -Werror -g -fPIC -O3 -LDFLAGS += -lre2 - -.PHONY: clean - -bench: bench.cpp - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) - -clean: - @rm bench diff --git a/regex-filtered/re2/bench.cpp b/regex-filtered/re2/bench.cpp index 942e925..a85a8d0 100644 --- a/regex-filtered/re2/bench.cpp +++ b/regex-filtered/re2/bench.cpp @@ -36,7 +36,7 @@ int main(const int argc, const char* argv[]) { std::ifstream regexes_f(argv[1]); re2::RE2::Options opt; - re2::FilteredRE2 f(3); + re2::FilteredRE2 f(2); int id; std::string line; diff --git a/scripts/devices b/scripts/devices new file mode 100755 index 0000000..e1fa488 --- /dev/null +++ b/scripts/devices @@ -0,0 +1,121 @@ +#!/usr/bin/env python +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "pyyaml", +# ] +# /// +r"""Compiles regexes.yaml to just the device regexps, with rewriting: + +- Rust's `regex` implements perl-style character classes with full + unicode semantics making them much more expensive than re2's + ascii-only semantics, so compile the most frequent ones down to + ascii classes. + + regexes.yaml uses \d, \w, \s, \S, \b, and the first one is the most + common by two orders of magnitude (but convert \w as well because I + did so already; converting \s might be a good idea too) + +- Both `regex` and `re2` suffer tremendously from large bounded + repetitions as they need to create a *ton* of states to keep track + of the limit. This mostly affects memory consumption (and the issue + compounds when captures are added to the mix), but there is a minor + CPU hit as well. + + In regexes.yaml, large bounded repetitions were introduced only to + limit the risks of catastrophic backtracking in backtracking + engines. 
Which neither re2 nor regex are.
+
+  So compile large bounded repetition (where heuristically "large" is
+  3 digits in the upper bound) back to simple unbounded repetitions.
+  Note that this is only done for a lower bound of `0` or `1`, but
+  that's the case of all large bounded repetitions in regexes.yaml.
+"""
+import string
+import sys
+
+from yaml import SafeLoader, load
+
+def main() -> None:
+    """Print the rewritten device regexes of a regexes.yaml file.
+
+    The YAML file is named by `sys.argv[1]`; each entry of its
+    `device_parsers` list yields one line on stdout.
+    """
+    with open(sys.argv[1]) as f:
+        regexes = load(f, Loader=SafeLoader)
+    for dev in regexes['device_parsers']:
+        # An optional `regex_flag` becomes an inline `(?flag)` prefix.
+        flag = dev.get('regex_flag')
+        print(
+            f'(?{flag})' if flag else '',
+            rewrite(dev['regex']),
+            sep='',
+        )
+
+def rewrite(re: str) -> str:
+    r"""Rewrite one regex into a form that is cheaper for `regex` and re2.
+
+    - `\d` and `\w` become `[0-9]` and `[A-Za-z0-9_]`, or the bare ranges
+      when they already sit inside a `[...]` class.
+    - An unescaped `{0,N}` or `{1,N}` outside a class, where N has at least
+      3 digits, becomes `*` or `+` respectively.
+
+    A backslash-escaped `{`, `[` or `]` is treated as a literal character.
+
+    Args:
+        re: the regex source to rewrite.
+
+    Returns:
+        The rewritten regex, or `re` itself when nothing was rewritten or
+        when `re` starts with `{`.
+    """
+    # Bracket-less so the ranges can be spliced into an enclosing class.
+    ascii_ranges = {'d': '0-9', 'w': 'A-Za-z0-9_'}
+    # Rewritten pieces; `re[from_:]` is the tail not yet copied into them.
+    out = []
+    from_ = 0
+    escape = False
+    inclass = False
+
+    for idx, c in enumerate(re):
+        match c:
+            case '\\' if not escape:
+                escape = True
+                continue
+            case '[' if not escape:
+                inclass = True
+            case ']' if not escape:
+                inclass = False
+            case 'd' | 'w' if escape:
+                # Copy up to, but excluding, the backslash at idx-1.
+                out.append(re[from_:idx-1])
+                from_ = idx+1
+                ranges = ascii_ranges[c]
+                out.append(ranges if inclass else f'[{ranges}]')
+            case '{' if not escape and not inclass:
+                if not idx:
+                    # Nothing precedes `{`: leave the whole regex alone.
+                    return re
+                # Look ahead by slicing instead of consuming an iterator, so
+                # that whatever follows the quantifier (e.g. a `\d`) is still
+                # visited by the loop and a second `}` is not rewritten again.
+                end = re.find('}', idx)
+                # Without any `}`, end is -1 and the slice below is empty.
+                low, comma, high = re[idx+1:max(end, idx)].partition(',')
+                large = len(high) > 2 and all(d in string.digits for d in high)
+                if low in ('0', '1') and comma and large:
+                    out.append(re[from_:idx])
+                    from_ = end + 1
+                    out.append('*' if low == '0' else '+')
+        # Only an unescaped backslash (handled by `continue`) keeps the escape.
+        escape = False
+
+    # from_ is still 0 only when nothing was rewritten.
+    if from_ == 0:
+        return re
+    out.append(re[from_:])
+    return ''.join(out)
+
+if __name__ == "__main__":
+    main()
diff --git a/ua-parser/src/lib.rs b/ua-parser/src/lib.rs
index 80fabfb..c126be6 100644
--- a/ua-parser/src/lib.rs
+++ 
b/ua-parser/src/lib.rs @@ -176,7 +176,7 @@ pub mod user_agent { /// Initialise an empty builder. pub fn new() -> Self { Self { - builder: regex_filtered::Builder::new_atom_len(2), + builder: regex_filtered::Builder::new_atom_len(3), repl: Vec::new(), } } @@ -343,7 +343,7 @@ pub mod os { /// pub fn new() -> Self { Self { - builder: regex_filtered::Builder::new_atom_len(2), + builder: regex_filtered::Builder::new_atom_len(3), repl: Vec::new(), } }