This repo provides a PPX for regular expression-based routing:
`ppx_regexp_extended`
maps to `re` with the conventional last-match extraction into `string`
and `string option`.
Two syntaxes for regular expressions are available:
- `pcre`: The syntax of regular PCRE expressions
- `mikmatch`: Mimics the syntax of the mikmatch tool
This syntax extension turns:
function%pcre
| {|re1|} -> e1
...
| {|reN|} -> eN
| _ -> e0
(or `function%mikmatch`) into suitable invocations of the Re library, and similarly for `match%pcre`/`match%mikmatch`.
It also accepts:
let%pcre var = {| some regex |}
(* and *)
let%mikmatch var = {| some regex |}
Full %mikmatch guide.
URL parsing:
let parse s =
let (scheme, first) =
match s.[4] with
| ':' -> `Http, 7
| 's' -> `Https, 8
| _ -> failwith "parse"
in
let last = String.index_from s first '/' in
let host = String.slice s ~first ~last in
let (host,port) =
match Stre.splitc host ':' with
| exception _ -> host, default_port scheme
| (host,port) -> host, int_of_string port
in
...
(* in mikmatch: *)
let parse s =
match%mikmatch s with
| {|/ "http" ('s' as https)? "://" ([^ '/' ':']+ as host) (":" (digit+ as port : int))? '/'? (_* as rest) /|} ->
let scheme = match https with Some _ -> `Https | None -> `Http in
let port = match port with Some p -> p | None -> default_port scheme in
...
| _ -> failwith "parse"
let rex =
let origins = "csv|pdf|html|xlsv|xml"
Re2.create_exn (sprintf {|^(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z)(?:\.(\d+))?\.(%s)\.(\d+)\.(\d+)$|} origins)
let of_string s =
try
let m = Re2.first_match_exn rex s in
let start = Re2.Match.get_exn ~sub:(`Index 1) m |> U.strptime "%Y-%m-%dT%H:%M:%S%z" |> U.timegm in
let shard = int_of_string (Re2.Match.get_exn ~sub:(`Index 2) m) in
let origin = origin_of_string (Re2.Match.get_exn ~sub:(`Index 3) m) in
let partition = int_of_string (Re2.Match.get_exn ~sub:(`Index 4) m) in
let worker = int_of_string (Re2.Match.get_exn ~sub:(`Index 5) m) in
{ start; shard; origin; partition; worker }
with _ -> invalid_arg (sprintf "error: %s" s)
(* in mikmatch: *)
let%mikmatch origins = {| "csv" | "pdf" | "html" | "xlsv" | "xml" |}
let of_string s =
match%mikmatch s with
| {|/ (digit{4} '-' digit{2} '-' digit{2} 'T' digit{2} ':' digit{2} ':' digit{2} 'Z' as timestamp)
('.' (digit+ as shard : int))?
'.' (origins as origin := origin_of_string)
'.' (digit+ as partition : int)
'.' (digit+ as worker : int) /|} ->
let start = U.strptime "%Y-%m-%dT%H:%M:%S%z" timestamp |> U.timegm in
let shard = match shard with Some s -> s | None -> 0 in
{ start; shard; origin; partition; worker }
| _ -> invalid_arg (sprintf "error: %s" s)
The patterns are plain strings of the form accepted by `Re.Pcre`, with the following additions:
- `(?<var>...)` defines a group and binds whatever it matches as `var`. The type of `var` will be `string` if the match is guaranteed given that the whole pattern matches, and `string option` if the variable is bound to or nested below an optionally matched group.
- `(?&var)` refers to any identifier bound to a typed regular expression of type `'a Re.t` and binds whatever it matches as `var`. The type of `var` will be the same as for `(?<var>...)`.
- `(?&name:var>)` is the same as above but binds whatever it matches as `name` (a shortcut for `(?<name>(?&var))`). The type of `name` will be the same as for `(?<var>...)`.
- `?<var>` at the start of a pattern binds group 0 as `var : string`. This may not be the full string if the pattern is unanchored.
A variable is allowed for the universal case and is bound to the matched string.
You can generate record types from regex patterns:
type url = {%mikmatch|
(("http" | "https") as scheme) "://"
((alnum+ ('.' alnum+)*) as host)
(':' (digit+ as port : int))?
('/' ([^'?' '#']* as path))?
('?' ([^'#']* as query))?
('#' (any* as fragment))?
|}
This generates:
- A record type with fields for each named capture
- `parse_url : string -> url option` - parses strings into the type
- `pp_url : Format.formatter -> url -> unit` - pretty-prints back to string format
Warning
When printing, repetitions are emitted the minimum required number of times; for example, `*` prints nothing.
The pretty-printer intelligently handles alternations and optional fields:
type mode =
[ `A
| `B
| `Other
]
let mk_mode = function "a" -> `A | "b" -> `B | _ -> `Other
let pp_mode fmt mode = Format.fprintf fmt @@ match mode with `A -> "a" | `B -> "b" | `Other -> "other"
let%mikmatch date_format = {| digit{4} '-' digit{2} '-' digit{2} ' ' digit{2} ':' digit{2} ':' digit{2} |}
type log = {%mikmatch|
(date_format as date)
" [" (upper+ as level) "]"
((" pid=" (digit+ as pid : int))? | (" name=" ([a-z]+ as name))?)
' '{2-3}
('a'|'b'|"other" as mode := mk_mode : mode)
": "
(any+ as message)
|}
let input = "2025-06-13 12:42:12 [INFO] pid=123 a: something happened" in
match parse_log input with
| Some log ->
(* Prints: "2025-06-13 12:42:12 [INFO] pid=123 a: something happened" *)
Format.printf "%a@." pp_log log;
(* Change from pid to name variant *)
let log' = { log with pid = None; name = Some "server" } in
(* Prints: "2025-06-13 12:42:12 [INFO] name=server a: something happened" *)
Format.printf "%a@." pp_log log'
The pretty-printer detects which alternation branch to use based on field population: if `pid` is `Some _`, it prints the `pid` branch; if `name` is `Some _`, it prints the `name` branch.
- For function application you are required to pass the return type.
- If the return type is itself an application (e.g. `string list`), then you must provide a type alias.
- For function application with `:=`, the type must have an associated `pp` function. (Notice, in the example, the `mode` type and its associated functions.)
- If the type is provided without a conversion function, then it is assumed that associated `parse` and `pp` functions are in scope. This guarantees compositionality with other types defined with this extension.
The following prints out times and hosts for SMTP connections to the Postfix daemon:
(* Link with re, re.pcre, lwt, lwt.unix.
Preprocess with ppx_regexp_extended.
Adjust to your OS. *)
open Lwt.Infix
let check_line =
(function%pcre
| {|(?<t>.*:\d\d) .* postfix/smtpd\[[0-9]+\]: connect from (?<host>[a-z0-9.-]+)|} ->
Lwt_io.printlf "%s %s" t host
| _ ->
Lwt.return_unit)
let () = Lwt_main.run begin
Lwt_io.printl "SMTP connections from:" >>= fun () ->
Lwt_stream.iter_s check_line (Lwt_io.lines_of_file "/var/log/syslog")
end
(* Link with re, re.pcre, lwt, lwt.unix.
Preprocess with ppx_regexp_extended.
Adjust to your OS. *)
open Lwt.Infix
let%mikmatch host = {| [a-z0-9.-]+ |}
let check_line =
(function%mikmatch
| {|/ (any* ':' digit digit as t) ' ' (any*) ' ' "postfix/smtpd" '[' digit+ ']' ": connect from " (host) /|} ->
Lwt_io.printlf "%s %s" t host
| _ ->
Lwt.return_unit)
let () = Lwt_main.run begin
Lwt_io.printl "SMTP connections from:" >>= fun () ->
Lwt_stream.iter_s check_line (Lwt_io.lines_of_file "/var/log/syslog")
end
The syntax extension will always warn if no catch-all case is provided. No exhaustiveness check is attempted. Doing it right would require reimplementing full regular expression parsing and an algorithm which would ideally produce a counter-example.
The processor is currently new and not well tested. Please break it and
file bug reports in the GitHub issue tracker. Any exception raised by
generated code except for `Match_failure` is a bug.