ua-parser · masklinn · May 11, 2025 · May 11, 2025 · May 11, 2025 · May 11, 2025
diff --git a/.gitignore b/.gitignore
@@ -1,7 +1,7 @@
 /target
 .DS_Store
 *.dSYM/
-regex-filtered/re2/flake.lock
-regex-filtered/re2/bench
+flake.lock
 .tox/
+*/uv.lock
 __pycache__
diff --git a/Makefile b/Makefile
@@ -0,0 +1,24 @@
+# Compiler and linker flags for the C++ re2 benchmark. `+=` appends to any
+# value already defined for these variables (e.g. inherited from the
+# environment) instead of replacing it.
+CXXFLAGS += -std=c++20 -Wall -Werror -g -fPIC -O3
+LDFLAGS += -lre2
+
+# `bench` is the name of a command to run, not of a file to build.
+.PHONY: bench
+
+# Runs the re2 benchmark, then the Rust `regex` benchmark, each under
+# /usr/bin/time and each fed the same compiled device regexes and the same
+# user-agent samples.
+# NOTE(review): `time -l` looks like the BSD/macOS resource-usage flag, and
+# `100` is presumably a repetition count which the two binaries take
+# differently (positional vs `-r`) — confirm.
+bench: target/bench_re2 target/devices.regexes target/release/examples/bench_regex
+	/usr/bin/time -l target/bench_re2 \
+		target/devices.regexes regex-filtered/samples/useragents.txt 100 -q
+	/usr/bin/time -l target/release/examples/bench_regex \
+		target/devices.regexes regex-filtered/samples/useragents.txt -r 100 -q
+
+# Compiles bench.cpp (`$^`) into target/bench_re2 (`$@`), linking against re2.
+target/bench_re2: regex-filtered/re2/bench.cpp
+	# build re2 bench, requires re2 to be LD-able, can `nix develop` for setup
+	@mkdir -p target
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+
+# Rebuilt whenever the example source or anything matched by
+# regex-filtered/src/* is newer than the binary.
+target/release/examples/bench_regex: regex-filtered/examples/bench_regex.rs regex-filtered/src/*
+	# build regex bench
+	cargo build --release --example bench_regex -q
+
+target/devices.regexes: scripts/devices ua-parser/uap-core/regexes.yaml
+	# compiles regexes.yaml to a list of just the device regexes (with embedded flags)
+	@mkdir -p target
+	uv run --script $^ > $@
diff --git a/regex-filtered/re2/flake.nix → flake.nix b/regex-filtered/re2/flake.nix → flake.nix
diff --git a/regex-filtered/examples/bench.rs → regex-filtered/examples/bench_regex.rs b/regex-filtered/examples/bench.rs → regex-filtered/examples/bench_regex.rs
@@ -27,7 +27,9 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
         .lines()
         .collect::<Result<Vec<String>, _>>()?;
 
-    let f = regex_filtered::Builder::new().push_all(&regexes)?.build()?;
+    let f = regex_filtered::Builder::new_atom_len(2)
+        .push_all(&regexes)?
+        .build()?;
     eprintln!(
         "{} regexes in {}s",
         regexes.len(),

diff --git a/regex-filtered/re2/Makefile b/regex-filtered/re2/Makefile
diff --git a/regex-filtered/re2/bench.cpp b/regex-filtered/re2/bench.cpp
@@ -36,7 +36,7 @@ int main(const int argc, const char* argv[]) {
   std::ifstream regexes_f(argv[1]);
 
   re2::RE2::Options opt;
-  re2::FilteredRE2 f(3);
+  re2::FilteredRE2 f(2);
   int id;
 
   std::string line;

diff --git a/scripts/devices b/scripts/devices
@@ -0,0 +1,121 @@
+#!/usr/bin/env python
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "pyyaml",
+# ]
+# ///
+r"""Compiles regexes.yaml to just the device regexps, with rewriting:
+
+- Rust's `regex` implements perl-style character classes with full
+  unicode semantics, making them much more expensive than re2's
+  ascii-only semantics, so compile the most frequent ones down to
+  ascii classes.
+
+  regexes.yaml uses \d, \w, \s, \S, \b, and the first one is the most
+  common by two orders of magnitude (but \w is converted as well since
+  that was already implemented; converting \s might be a good idea too).
+
+- Both `regex` and `re2` suffer tremendously from large bounded
+  repetitions as they need to create a *ton* of states to keep track
+  of the limit. This mostly affects memory consumption (and the issue
+  compounds when captures are added to the mix), but there is a minor
+  CPU hit as well.
+
+  In regexes.yaml, large bounded repetitions were introduced only to
+  limit the risk of catastrophic backtracking in backtracking
+  engines, which neither re2 nor regex is.
+
+  So compile large bounded repetition (where heuristically "large" is
+  3 digits in the upper bound) back to simple unbounded repetitions.
+  Note that this is only done for a lower bound of `0` or `1`, but
+  that's the case of all large bounded repetitions in regexes.yaml.
+"""
+import string
+import sys
+
+from yaml import SafeLoader, load
+
+def main() -> None:
+    """Print one rewritten device regex per line on stdout.
+
+    Reads the YAML file named by the first command-line argument, walks
+    its ``device_parsers`` list, and prints each entry's ``regex`` after
+    passing it through :func:`rewrite`. When the entry has a truthy
+    ``regex_flag``, it is emitted as an inline ``(?flag)`` prefix.
+    """
+    # Decode explicitly as UTF-8 so the result does not depend on the
+    # locale's default encoding.
+    with open(sys.argv[1], encoding='utf-8') as f:
+        regexes = load(f, Loader=SafeLoader)
+    for dev in regexes['device_parsers']:
+        # Kept under its own name so the file handle `f` is not rebound.
+        flag = dev.get('regex_flag')
+        print(
+            f'(?{flag})' if flag else '',
+            rewrite(dev['regex']),
+            sep='',
+        )
+
+def rewrite(re: str) -> str:
+    r"""Rewrite a regex so it is cheaper for `regex` and `re2` to handle.
+
+    Two rewrites are applied in a single left-to-right scan:
+
+    - an escaped ``\d`` or ``\w`` becomes the ascii class ``[0-9]`` or
+      ``[A-Za-z0-9_]`` (only the bare range when the escape is already
+      inside a bracketed class);
+    - a bounded repetition ``{0,N}`` or ``{1,N}`` outside of a class,
+      where ``N`` has three or more digits, becomes ``*`` or ``+``.
+
+    Only the characters of a rewritten construct are skipped; everything
+    else, including whatever directly follows a rewritten repetition, is
+    scanned normally so that no escape or class bracket is missed.
+
+    :param re: the regex source to rewrite
+    :returns: the rewritten regex, or ``re`` itself when nothing was
+        rewritten (or when ``re`` starts with an unescaped ``{``)
+    """
+    ascii_classes = {'d': '0-9', 'w': 'A-Za-z0-9_'}
+    out = []
+    # start of the part of `re` which has not been copied to `out` yet
+    from_ = 0
+    inclass = False
+    end = len(re)
+
+    idx = 0
+    while idx < end:
+        c = re[idx]
+        if c == '\\':
+            # An escape always consumes the next character, so an escaped
+            # backslash, bracket or brace is never treated as special.
+            # The slice is '' for a trailing backslash.
+            cls = ascii_classes.get(re[idx+1:idx+2])
+            if cls is not None:
+                out.append(re[from_:idx])
+                out.append(cls if inclass else f'[{cls}]')
+                from_ = idx + 2
+            idx += 2
+            continue
+
+        if c == '[':
+            inclass = True
+        elif c == ']':
+            inclass = False
+        elif c == '{' and not inclass:
+            if idx == 0:
+                # nothing precedes the brace: leave the regex untouched
+                return re
+
+            # Look ahead for `0,` or `1,` followed by digits and `}`
+            # without consuming anything, so a failed match leaves every
+            # character to be scanned normally.
+            if re[idx+1:idx+2] in ('0', '1') and re[idx+2:idx+3] == ',':
+                close = idx + 3
+                while close < end and re[close] in string.digits:
+                    close += 1
+                # "large" means an upper bound of three or more digits
+                if close - (idx + 3) > 2 and re[close:close+1] == '}':
+                    out.append(re[from_:idx])
+                    out.append('*' if re[idx+1] == '0' else '+')
+                    from_ = idx = close + 1
+                    continue
+        idx += 1
+
+    if from_ == 0:
+        return re
+    out.append(re[from_:])
+    return ''.join(out)
+
+# Run the conversion only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
diff --git a/ua-parser/src/lib.rs b/ua-parser/src/lib.rs
@@ -176,7 +176,7 @@ pub mod user_agent {
         /// Initialise an empty builder.
         pub fn new() -> Self {
             Self {
-                builder: regex_filtered::Builder::new_atom_len(2),
+                builder: regex_filtered::Builder::new_atom_len(3),
                 repl: Vec::new(),
             }
         }
@@ -343,7 +343,7 @@ pub mod os {
         ///
         pub fn new() -> Self {
             Self {
-                builder: regex_filtered::Builder::new_atom_len(2),
+                builder: regex_filtered::Builder::new_atom_len(3),
                 repl: Vec::new(),
             }
         }