ua-parser
diff --git a/‎.gitignore‎
Lines changed: 2 additions & 2 deletions b/‎.gitignore‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎Makefile‎
Lines changed: 24 additions & 0 deletions b/‎Makefile‎
Lines changed: 24 additions & 0 deletions
diff --git a/‎regex-filtered/re2/flake.nix‎ ‎flake.nix‎regex-filtered/re2/flake.nix renamed to flake.nix b/‎regex-filtered/re2/flake.nix‎ ‎flake.nix‎regex-filtered/re2/flake.nix renamed to flake.nix
diff --git a/‎regex-filtered/examples/bench.rs‎ ‎regex-filtered/examples/bench_regex.rs‎regex-filtered/examples/bench.rs renamed to regex-filtered/examples/bench_regex.rs b/‎regex-filtered/examples/bench.rs‎ ‎regex-filtered/examples/bench_regex.rs‎regex-filtered/examples/bench.rs renamed to regex-filtered/examples/bench_regex.rs
diff --git a/‎regex-filtered/re2/Makefile‎
Lines changed: 0 additions & 10 deletions b/‎regex-filtered/re2/Makefile‎
Lines changed: 0 additions & 10 deletions
diff --git a/‎scripts/devices‎
Lines changed: 121 additions & 0 deletions b/‎scripts/devices‎
Lines changed: 121 additions & 0 deletions
@@ -1,7 +1,7 @@
 /target
 .DS_Store
 *.dSYM/
-regex-filtered/re2/flake.lock
-regex-filtered/re2/bench
+flake.lock
 .tox/
+*/uv.lock
 __pycache__
@@ -0,0 +1,24 @@
+CXXFLAGS += -std=c++20 -Wall -Werror -g -fPIC -O3
+LDFLAGS += -lre2
+
+.PHONY: bench
+
+bench: target/bench_re2 target/devices.regexes target/release/examples/bench_regex
+	/usr/bin/time -l target/bench_re2 \
+		target/devices.regexes regex-filtered/samples/useragents.txt 100 -q
+	/usr/bin/time -l target/release/examples/bench_regex \
+		target/devices.regexes regex-filtered/samples/useragents.txt -r 100 -q
+
+target/bench_re2: regex-filtered/re2/bench.cpp
+	# build re2 bench, requires re2 to be LD-able, can `nix develop` for setup
+	@mkdir -p target
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+
+target/release/examples/bench_regex: regex-filtered/examples/bench_regex.rs regex-filtered/src/*
+	# build regex bench
+	cargo build --release --example bench_regex -q
+
+target/devices.regexes: scripts/devices ua-parser/uap-core/regexes.yaml
+	# compiles regexes.yaml to a list of just the device regexes (with embedded flags)
+	@mkdir -p target
+	uv run --script $^ > $@
@@ -0,0 +1,121 @@
+#!/usr/bin/env python
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "pyyaml",
+# ]
+# ///
+r"""Compiles regexes.yaml to just the device regexps, with rewriting:
+
+- Rust's `regex` implements perl-style character classes with full
+  unicode semantics making them much more expensive than re2's
+  ascii-only semantics, so compile the most frequent ones down to
+  ascii classes.
+
+  regexes.yaml uses \d, \w, \s, \S, \b, and the first one is the most
+  common by two orders of magnitude (but convert \w as well because I
+  had done so already; converting \s might be a good idea too)
+
+- Both `regex` and `re2` suffer tremendously from large bounded
+  repetitions as they need to create a *ton* of states to keep track
+  of the limit. This mostly affects memory consumption (and the issue
+  compounds when captures are added to the mix), but there is a minor
+  CPU hit as well.
+
+  In regexes.yaml, large bounded repetitions were introduced only to
+  limit the risks of catastrophic backtracking in backtracking
+  engines, which neither re2 nor regex are.
+
+  So compile large bounded repetition (where heuristically "large" is
+  3 digits in the upper bound) back to simple unbounded repetitions.
+  Note that this is only done for a lower bound of `0` or `1`, but
+  that's the case of all large bounded repetitions in regexes.yaml.
+"""
+import string
+import sys
+
+from yaml import SafeLoader, load
+
def main() -> None:
    """Print the rewritten device regexes from the YAML file in ``sys.argv[1]``.

    Loads the file, walks its ``device_parsers`` entries and writes one
    line per entry to stdout: the entry's ``regex`` passed through
    ``rewrite``, prefixed with an inline ``(?<flags>)`` group when the
    entry carries a non-empty ``regex_flag``.

    Raises:
        IndexError: if no file path was given on the command line.
        OSError: if the file cannot be opened.
    """
    # Decode explicitly as UTF-8 instead of relying on the locale's default
    # encoding (assumes the input is UTF-8-encoded YAML).
    with open(sys.argv[1], encoding='utf-8') as f:
        regexes = load(f, Loader=SafeLoader)

    for dev in regexes['device_parsers']:
        # Kept separate from `f` above so the file handle's name is not
        # reused for an unrelated value.
        flag = dev.get('regex_flag')
        prefix = f'(?{flag})' if flag else ''
        print(prefix, rewrite(dev['regex']), sep='')
+
+def rewrite(re: str) -> str:
+    from_ = 0
+    out = []
+    it = enumerate(re)
+    escape = False
+    inclass = False
+
+    cont = True
+    while cont and (e := next(it, None)):
+        idx, c = e
+        match c:
+            case '\\' if not escape:
+                escape = True
+                continue
+            case '[' if not escape:
+                inclass = True
+            case ']' if not escape:
+                inclass = False
+            case 'd' if escape:
+                out.append(re[from_:idx-1])
+                from_ = idx+1
+                if inclass:
+                    out.append('0-9')
+                else:
+                    out.append('[0-9]')
+            case 'w' if escape:
+                out.append(re[from_:idx-1])
+                from_ = idx+1
+                if inclass:
+                    out.append('A-Za-z0-9_')
+                else:
+                    out.append('[A-Za-z0-9_]')
+            case '{' if not escape and not inclass:
+                if not idx:
+                    return re
+
+                try:
+                    _, start = next(it)
+                except StopIteration:
+                    continue
+                if start not in '01':
+                    continue
+
+                try:
+                    _, comma = next(it)
+                except StopIteration:
+                    continue
+                else:
+                    if comma != ',':
+                        continue
+
+                digits = 0
+                for ri, rc in it:
+                    match rc:
+                        case c if c in string.digits:
+                            digits += 1
+                        case '}' if digits > 2:
+                            out.append(re[from_:idx])
+                            from_ = ri + 1
+                            out.append('*' if start == '0' else '+')
+                        case _:
+                            break
+            case _:
+                pass
+        escape = False
+
+    if from_ == 0:
+        return re
+    out.append(re[from_:])
+    return ''.join(out)
+
+if __name__ == "__main__":
+    main()