Skip to content

Commit c2e11de

Browse files
committed
case insensitive match for `function | {%mik_i|some regex|} -> ...`, docs
1 parent 1da0660 commit c2e11de

File tree

4 files changed

+224
-53
lines changed

4 files changed

+224
-53
lines changed

MIK.md

Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
2+
# `%mik` extension
3+
4+
Accepts `mikmatch` syntax, along with some nice to haves.
5+
6+
## Grammar
7+
The grammar accepted by this extension is the following:
8+
9+
```bnf
10+
<main_match_case> ::= "/" <pattern> "/" EOF
11+
12+
<main_let_expr> ::= <pattern> EOF
13+
14+
<pattern> ::= <alt_expr>
15+
| <alt_expr> ">>>" <func_name> "as" IDENT
16+
17+
<alt_expr> ::= <seq_expr>
18+
| <seq_expr> "|" <alt_expr>
19+
20+
<seq_expr> ::= <atom_expr>
21+
| <atom_expr> <seq_expr>
22+
23+
<atom_expr> ::= <basic_atom>
24+
| <basic_atom> "*"
25+
| <basic_atom> "+"
26+
| <basic_atom> "?"
27+
| <basic_atom> "{" INT (n) "}" # match n times
28+
| <basic_atom> "{" INT (n) "-" INT (m) "}" # match at least n times, at most m times
29+
30+
<basic_atom> ::= CHAR_LITERAL
31+
| STRING_LITERAL
32+
| EMPTY_STR
33+
| "_"
34+
| "^"
35+
| PREDEFINED_CLASS
36+
| IDENT
37+
| "[" <char_set> "]" # character class
38+
| "[" "^" <char_set> "]" # negative character class
39+
| "(" <pattern> ")"
40+
| "(" IDENT ")"
41+
| "(" IDENT "as" IDENT ")"
42+
| "(" IDENT "as" IDENT ":" INT_CONVERTER ")"
43+
| "(" IDENT "as" IDENT ":" FLOAT_CONVERTER ")"
44+
| "(" IDENT "as" IDENT ":=" <func_name> ")"
45+
| "(" <pattern> "as" IDENT ")"
46+
| "(" <pattern> "as" IDENT ":" INT_CONVERTER ")"
47+
| "(" <pattern> "as" IDENT ":" FLOAT_CONVERTER ")"
48+
| "(" <pattern> "as" IDENT ":=" <func_name> ")"
49+
50+
<func_name> ::= IDENT
51+
| MOD_IDENT # qualified names
52+
53+
<char_set> ::= <char_set_item>
54+
| <char_set_item> <char_set>
55+
56+
<char_set_item> ::= CHAR_LITERAL
57+
| CHAR_LITERAL "-" CHAR_LITERAL
58+
| STRING_LITERAL
59+
| PREDEFINED_CLASS
60+
| IDENT
61+
```
62+
63+
Where `PREDEFINED_CLASS` is one of:
64+
- **POSIX character classes:** `lower`, `upper`, `alpha`, `digit`, `alnum`, `punct`, `graph`, `print`, `blank`, `space`, `cntrl`, `xdigit`
65+
- **Control sequences:** `eos` (same as `$`), `eol` (end of string or newline), `bnd` (word boundary `\b`), `bos` (same as `^`), `any` (any character except newline)
66+
- **Empty string:** `""`, equivalent to `^$` (or `bos eos`)
67+
68+
## Semantics and Examples
69+
### Variable substitution
70+
```ocaml
71+
let%mik re1 = {| "hello" |}
72+
let%mik re2 = {| re1 "world" |}
73+
74+
let do_something = function%mik
75+
| {|/ ... (re2) ... /|} -> ...
76+
| _ -> ...
77+
78+
(* will expand to *)
79+
let do_something = function%mik
80+
| {|/ ... ("hello" "world") ... /|} -> ...
81+
| _ -> ...
82+
```
83+
84+
### Variable capture
85+
```ocaml
86+
let%mik num = {| digit+ |}
87+
88+
let do_something = function%mik
89+
| {|/ ... (num as n) ... /|} -> ... (* (n : string) available here *)
90+
| _ -> ...
91+
```
92+
93+
Values are also available at the guard level:
94+
95+
```ocaml
96+
let%mik num = {| digit+ |}
97+
98+
let do_something = function%mik
99+
| {|/ ... (num as n) ... /|} when n = "123" -> ...
100+
| _ -> ...
101+
```
102+
103+
#### Type conversion
104+
It is possible to convert variables to `int` or `float` on the fly:
105+
106+
```ocaml
107+
let%mik num = {| digit+ |}
108+
109+
let do_something = function%mik
110+
| {|/ 'd' (num as n : int) ... /|} -> ... (* (n : int) available here *)
111+
| {|/ 'f' (num as n : float) ... /|} -> ... (* (n : float) available here *)
112+
| _ -> ...
113+
```
114+
115+
It is also possible to pass the variables into any `string -> 'a` function:
116+
```ocaml
117+
let%mik ip = {| (digit{1-3} '.'){3} digit{1-3}|}
118+
let parse_ip = String.split_on_char '.'
119+
let get_ip = function%mik
120+
| {|/ ... (ip as ip := parse_ip) ... /|} -> ... (* (ip : string list) available here *)
121+
| _ -> ...
122+
123+
let get_upper_name = function%mik
124+
| {|/ ... (['a'-'z'] as name := String.uppercase_ascii) ... /|} -> ... (* (name : string) available here *)
125+
| _ -> ...
126+
```
127+
128+
#### Piping to a catch all function
129+
130+
Using the `>>>` syntax extension, you can pipe all bound named variables into a single function, and name its return value.
131+
132+
```ocaml
133+
type example = {
134+
name : string;
135+
num : int;
136+
mode : [ `A | `B | `Default ];
137+
}
138+
139+
let mk_example name num mode = match mode with
140+
| Some 'a' -> { name; num; mode = `A}
141+
| Some 'b' -> { name; num; mode = `B}
142+
| Some _ | None -> { name; num; mode = `Default}
143+
144+
let mk_example_re = function%mik
145+
| {|/ (['a'-'z'] as name := String.capitalize_ascii) ' ' (digit+ as num : int) ' ' ('a'|'b' as mode)? >>> mk_example as res /|} -> (* (res : example) available here, and all other bound variables *)
146+
| _ -> ...
147+
```
148+
149+
## Case Insensitive Match
150+
151+
You can use `%mik_i`: `match%mik_i` and `function%mik_i`. (not available at the variable level)
152+
153+
## Alternatives
154+
### Defining variables
155+
You have a choice between:
156+
```ocaml
157+
let%mik re = {|some regex|}
158+
(* and *)
159+
let re = {%mik|some regex|}
160+
```
161+
162+
No `/` delimiters are needed here.
163+
164+
### Matching:
165+
#### `match%mik` and `function%mik`
166+
167+
```ocaml
168+
function%mik
169+
| {|/ some regex /|} -> ...
170+
| {|/ another regex /|} -> ...
171+
...
172+
| _ -> ...
173+
```
174+
175+
This match expression will compile all of the REs in the branches into one, and use marks to find which branch was executed.
176+
Efficient if you have multiple branches.
177+
178+
#### General match/function
179+
180+
```ocaml
181+
function
182+
| "some string" -> ...
183+
| {%mik|/ some regex /|} -> ...
184+
...
185+
| "another string" -> ...
186+
| {%mik_i|/ another regex /|} -> ... (* case insensitive *)
187+
| _ -> ...
188+
```
189+
190+
This match expression will compile all of the REs **individually**, and test each one in sequence.
191+
Recommended if you are only matching one RE. It is less efficient than the first option for more than one RE, but allows raw string matching.
192+
193+
It keeps all of the features of the previous extension, explored in [Semantics and Examples](#semantics-and-examples).

README.md

Lines changed: 8 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
1-
[![Build Status][ci-build-status]][ci]
1+
# PPXes for Working with Regular Expressions
22

3-
# Two PPXes for Working with Regular Expressions
4-
5-
This repo provides two PPXes providing regular expression-based routing:
3+
This repo provides PPXes for regular expression-based routing:
64

75
- `ppx_regexp_extended` maps to [re][] with the conventional last-match extraction
86
into `string` and `string option`. Two syntaxes for regular expressions available:
@@ -62,35 +60,12 @@ string.
6260

6361
### `%mik`
6462

65-
The syntax that this extension accepts is as follows:
66-
67-
- `char-literal`: Match the given character (priority 0).
68-
- `_` (underscore): Match any character (priority 0).
69-
- `string-literal`: Match the given sequence of characters (priority 0).
70-
- `[set-of-characters]`: Character class, match one of the characters given by set-of-characters (priority 0). The grammar for set-of-characters is the following:
71-
- `char-literal``char-literal`: defines a range of characters according to the iso-8859-1 encoding (includes ASCII).
72-
- `char-literal`: defines a singleton (a set containing just this character).
73-
- `string-literal`: defines a set that contains all the characters present in the given string.
74-
- `lowercase-identifier`: is replaced by the corresponding predefined regular expression; this regular expression must be exactly of length 1 and therefore represents a set of characters.
75-
- `set-of-characters`: set-of-characters defines the union of two sets of characters.
76-
- `[^set-of-characters]`: Negative character class
77-
- `regexp *`: Match the pattern given by regexp 0 time or more (priority 0).
78-
- `regexp +`: Match the pattern given by regexp 1 time or more (priority 0).
79-
- `regexp ?`: Match the pattern given by regexp at most once (priority 0).
80-
- `regexp{m−n}`: Match regexp at least `m` times and up to `n` times. `m` and `n` must be integer literals (priority 0).
81-
- `regexp{n}`: Same as regexp{n−n} (priority 0).
82-
- `( regexp )`: Match regexp (priority 0).
83-
- `regexp regexp`: Match the first regular expressions and then the second one (priority 1).
84-
- `regexp | regexp`: Match one of these two regular expressions (priority 2).
85-
- `regexp as lowercase-identifier`: Give a name to the substring that will be matched by the given pattern. This string becomes available under this name (priority 3).
86-
In-place conversions of the matched substring can be performed using one these three mechanisms:
87-
- `regexp as lowercase-identifier : int`: `int` behaves as `int_of_string`
88-
- `regexp as lowercase-identifier : float`: `float` behaves as `float_of_string`
89-
- `regexp as lowercase-identifier := converter`: where `converter` is any function which converts a string into something else.
90-
91-
In addition, the following predefined character classes are available:
92-
- **POSIX character classes:** `lower`, `upper`, `alpha`, `digit`, `alnum`, `punct`, `graph`, `print`, `blank`, `space`, `cntrl`, `xdigit`, `word`
93-
- **Control sequences:** `eos` (same as `$`), `eol` (end of string or newline), `bnd` (word boundary `\b`), `bos` (same as `^`), `any` (any character except newline)
63+
Full [%mik guide](./MIK.md)
64+
65+
#### Quick Links
66+
- [Variable capture](./MIK.md#variable-capture)
67+
- [Type conversion](./MIK.md#type-conversion)
68+
- [Different extensions](./MIK.md#alternatives)
9469

9570
### Example
9671

ppx_regexp/ppx_regexp.ml

Lines changed: 11 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,7 @@ let transformation ctx =
2525
method! structure_item item acc =
2626
match item.pstr_desc with
2727
(* let%mik/%pcre x = {|some regex|}*)
28-
| Pstr_extension
29-
(({ txt = ("pcre" | "mik") as ext; _ }, PStr [ { pstr_desc = Pstr_value (rec_flag, vbs); _ } ]), _) ->
28+
| Pstr_extension (({ txt = ("pcre" | "mik") as ext; _ }, PStr [ { pstr_desc = Pstr_value (rec_flag, vbs); _ } ]), _) ->
3029
let mode = if ext = "pcre" then `Pcre else `Mik in
3130
let bindings = Transformations.transform_let ~mode ~ctx vbs in
3231
let new_item = { item with pstr_desc = Pstr_value (rec_flag, bindings) } in
@@ -65,33 +64,32 @@ let transformation ctx =
6564
let _ppx_regexp_v = [%e e] in
6665
[%e cases]],
6766
bindings @ acc )
68-
| _ ->
69-
Util.error ~loc "[%%pcre] and [%%mik] only apply to match, function and global let declarations of strings."
67+
| _ -> Util.error ~loc "[%%pcre] and [%%mik] only apply to match, function and global let declarations of strings."
7068
in
7169
match e_ext.pexp_desc with
7270
(* match%mik/match%pcre and function%mik/function%pcre*)
73-
| Pexp_extension
74-
({ txt = ("pcre" | "mik" | "pcre_i" | "mik_i") as ext; _ }, PStr [ { pstr_desc = Pstr_eval (e, _); _ } ]) ->
71+
| Pexp_extension ({ txt = ("pcre" | "mik" | "pcre_i" | "mik_i") as ext; _ }, PStr [ { pstr_desc = Pstr_eval (e, _); _ } ]) ->
7572
let mode = if String.starts_with ~prefix:"pcre" ext then `Pcre else `Mik in
7673
let opts = if String.ends_with ~suffix:"_i" ext then [ `Caseless ] else [] in
7774
let loc = e.pexp_loc in
7875
make_transformations ~mode ~opts ~loc e.pexp_desc
7976
(* match smth with | {%mik|some regex|} -> ...*)
8077
| Pexp_match (matched_expr, cases) ->
81-
let has_mik_case =
78+
let has_ext_case =
8279
List.exists
83-
(fun case -> match case.pc_lhs.ppat_desc with Ppat_extension ({ txt = "mik"; _ }, _) -> true | _ -> false)
80+
(fun case ->
81+
match case.pc_lhs.ppat_desc with Ppat_extension ({ txt = "pcre" | "mik" | "pcre_i" | "mik_i"; _ }, _) -> true | _ -> false)
8482
cases
8583
in
86-
if has_mik_case then Transformations.transform_mixed_match ~loc:e_ext.pexp_loc ~ctx ~matched_expr cases acc
87-
else e_ext, acc
84+
if has_ext_case then Transformations.transform_mixed_match ~loc:e_ext.pexp_loc ~ctx ~matched_expr cases acc else e_ext, acc
8885
| Pexp_function cases ->
89-
let has_mik_case =
86+
let has_ext_case =
9087
List.exists
91-
(fun case -> match case.pc_lhs.ppat_desc with Ppat_extension ({ txt = "mik"; _ }, _) -> true | _ -> false)
88+
(fun case ->
89+
match case.pc_lhs.ppat_desc with Ppat_extension ({ txt = "pcre" | "mik" | "pcre_i" | "mik_i"; _ }, _) -> true | _ -> false)
9290
cases
9391
in
94-
if has_mik_case then Transformations.transform_mixed_match ~loc:e_ext.pexp_loc ~ctx cases acc else e_ext, acc
92+
if has_ext_case then Transformations.transform_mixed_match ~loc:e_ext.pexp_loc ~ctx cases acc else e_ext, acc
9593
| _ -> e_ext, acc
9694
end
9795

ppx_regexp/transformations.ml

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -318,12 +318,14 @@ let transform_mixed_match ~loc ~ctx ?matched_expr cases acc =
318318
let aux case =
319319
match case.pc_lhs.ppat_desc with
320320
| Ppat_extension
321-
({ txt = "mik"; _ }, PStr [ { pstr_desc = Pstr_eval ({ pexp_desc = Pexp_constant (Pconst_string (pat, str_loc, _)); _ }, _); _ } ])
322-
->
321+
( { txt = ("pcre" | "mik" | "pcre_i" | "mik_i") as ext; _ },
322+
PStr [ { pstr_desc = Pstr_eval ({ pexp_desc = Pexp_constant (Pconst_string (pat, str_loc, _)); _ }, _); _ } ] ) ->
323323
let pos = str_loc.loc_start in
324-
let parser = Regexp.parse_mik_exn ~target:`Match in
324+
let mode = if String.starts_with ~prefix:"pcre" ext then `Pcre else `Mik in
325+
let opts = if String.ends_with ~suffix:"_i" ext then [ `Caseless ] else [] in
326+
let parser = match mode with `Pcre -> Regexp.parse_exn ~target:`Match | `Mik -> Regexp.parse_mik_exn ~target:`Match in
325327
let re, bs, nG = extract_bindings ~parser ~pos ~ctx pat in
326-
`Mik (re, nG, bs, case.pc_rhs, case.pc_guard)
328+
`Mik (opts, re, nG, bs, case.pc_rhs, case.pc_guard)
327329
| _ -> `Regular case
328330
in
329331

@@ -340,9 +342,12 @@ let transform_mixed_match ~loc ~ctx ?matched_expr cases acc =
340342
begin
341343
fun i case ->
342344
match case with
343-
| `Mik (re, _, _, _, _) ->
345+
| `Mik (opts, re, _, _, _, _) ->
344346
let comp_var = Util.fresh_var () in
345-
let comp_expr = [%expr Re.compile (Re.Perl.re [%e re])] in
347+
let opts_expr =
348+
match opts with [] -> [%expr []] | [ `Caseless ] -> [%expr [ `Caseless ]] | _ -> failwith "Unknown option"
349+
in
350+
let comp_expr = [%expr Re.compile (Re.Perl.re ~opts:[%e opts_expr] [%e re])] in
346351
let binding = value_binding ~loc ~pat:(ppat_var ~loc { txt = comp_var; loc }) ~expr:comp_expr in
347352
Some (i, comp_var, binding)
348353
| _ -> None
@@ -363,7 +368,7 @@ let transform_mixed_match ~loc ~ctx ?matched_expr cases acc =
363368
match [%e input_var] with
364369
| [%p case.pc_lhs] when [%e Option.value case.pc_guard ~default:[%expr true]] -> [%e case.pc_rhs]
365370
| _ -> [%e build_ordered_match input_var (case_idx + 1) rest mik_comps]]
366-
| `Mik (_, _, bs, rhs, guard) :: rest, (idx, comp_var, _) :: rest_comps when idx = case_idx ->
371+
| `Mik (_, _, _, bs, rhs, guard) :: rest, (idx, comp_var, _) :: rest_comps when idx = case_idx ->
367372
let comp_ident = pexp_ident ~loc { txt = Lident comp_var; loc } in
368373
[%expr
369374
match Re.exec_opt [%e comp_ident] [%e input_var] with

0 commit comments

Comments
 (0)