Skip to content

Commit d97007d

Browse files
committed
feature: variable substitution
1 parent 18ca331 commit d97007d

File tree

3 files changed

+130
-49
lines changed

3 files changed

+130
-49
lines changed

ppx_regexp.opam

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ bug-reports: "https://github.com/paurkedal/ppx_regexp/issues"
1010
depends: [
1111
"ocaml" {>= "4.02.3"}
1212
"dune" {>= "1.11"}
13-
"ppxlib" {>= "0.9.0"}
13+
"ppxlib" {>= "0.9.0" & <= "0.35.0"}
1414
"re" {>= "1.7.2"}
1515
"qcheck" {with-test}
1616
]

ppx_regexp/ppx_regexp.ml

Lines changed: 102 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -51,46 +51,75 @@ module Regexp = struct
5151
| Capture_as (idr, e) ->
5252
fun (nG, bs) ->
5353
recurse must_match e (nG + 1, (idr, Some nG, must_match) :: bs)
54-
| Call _ -> error ~loc "(&...) is not implemented for %%pcre.")
54+
| Call _ -> fun (nG, bs) -> (nG + 1, bs))
5555
in
5656
(function
5757
| {Location.txt = Capture_as (idr, e); _} ->
58-
recurse true e (0, [idr, None, true])
58+
recurse true e (1, [idr, Some 0, true])
5959
| e ->
6060
recurse true e (0, []))
6161

62-
let to_string =
63-
let p_alt, p_seq, p_suffix, p_atom = 0, 1, 2, 3 in
64-
let delimit_if b s = if b then "(?:" ^ s ^ ")" else s in
65-
let rec recurse p (e' : _ Location.loc) =
66-
let loc = e'.Location.loc in
67-
(match e'.Location.txt with
68-
| Code s ->
69-
(* Delimiters not needed as Regexp.parse_exn only returns single
70-
* chars, csets, and escape sequences. *)
71-
s
72-
| Seq es ->
73-
delimit_if (p > p_seq)
74-
(String.concat "" (List.map (recurse p_seq) es))
75-
| Alt es ->
76-
delimit_if (p > p_alt)
77-
(String.concat "|" (List.map (recurse p_alt) es))
78-
| Opt e ->
79-
delimit_if (p > p_suffix) (recurse p_atom e ^ "?")
80-
| Repeat ({Location.txt = (i, j_opt); _}, e) ->
81-
let j_str = match j_opt with None -> "" | Some j -> string_of_int j in
82-
delimit_if (p > p_suffix)
83-
(Printf.sprintf "%s{%d,%s}" (recurse p_atom e) i j_str)
84-
| Nongreedy e -> recurse p_suffix e ^ "?"
85-
| Capture _ -> error ~loc "Unnamed capture is not allowed for %%pcre."
86-
| Capture_as (_, e) -> "(" ^ recurse p_alt e ^ ")"
87-
| Call _ -> error ~loc "(&...) is not implemented for %%pcre.")
62+
let rec to_re_expr ~loc ~in_let (e : _ Location.loc) =
63+
let open Ast_builder.Default in
64+
match e.Location.txt with
65+
| Code s ->
66+
[%expr Re.Perl.re [%e estring ~loc s]]
67+
| Seq es ->
68+
let exprs = List.map (to_re_expr ~loc ~in_let) es in
69+
[%expr Re.seq [%e elist ~loc exprs]]
70+
| Alt es ->
71+
let exprs = List.map (to_re_expr ~loc ~in_let) es in
72+
[%expr Re.alt [%e elist ~loc exprs]]
73+
| Opt e ->
74+
[%expr Re.opt [%e to_re_expr ~loc ~in_let e]]
75+
| Repeat ({Location.txt = (i, j_opt); _}, e) ->
76+
let e_i = eint ~loc i in
77+
let e_j = match j_opt with
78+
| None -> [%expr None]
79+
| Some j -> [%expr Some [%e eint ~loc j]]
80+
in
81+
[%expr Re.repn [%e to_re_expr ~loc ~in_let e] [%e e_i] [%e e_j]]
82+
| Nongreedy e ->
83+
[%expr Re.non_greedy [%e to_re_expr ~loc ~in_let e]]
84+
| Capture e ->
85+
[%expr Re.group [%e to_re_expr ~loc ~in_let e]]
86+
| Capture_as (_, e) ->
87+
[%expr Re.group [%e to_re_expr ~loc ~in_let e]]
88+
| Call lid ->
89+
if in_let then pexp_ident ~loc lid else
90+
[%expr Re.group [%e pexp_ident ~loc lid]]
91+
92+
let rec squash_codes (e : _ Location.loc) : _ Location.loc =
93+
let open Location in
94+
let rec combine (nodes : _ Location.loc list) =
95+
match nodes with
96+
| [] -> []
97+
| {Location.txt = Code s1; loc = loc1} :: {Location.txt = Code s2; loc = loc2} :: rest ->
98+
let combined_loc =
99+
if loc1 = Location.none || loc2 = Location.none then Location.none
100+
else Location.{
101+
loc_start = loc1.loc_start;
102+
loc_end = loc2.loc_end;
103+
loc_ghost = false;
104+
}
105+
in
106+
combine ({Location.txt = Code (s1 ^ s2); loc = combined_loc} :: rest)
107+
| node :: rest -> node :: combine rest
88108
in
89-
(function
90-
| {Location.txt = Capture_as (_, e); _} ->
91-
recurse 0 e
92-
| e ->
93-
recurse 0 e)
109+
match e.txt with
110+
| Code _ -> e
111+
| Seq es ->
112+
let es = List.map squash_codes es in
113+
{e with txt = Seq (combine es)}
114+
| Alt es ->
115+
let es = List.map squash_codes es in
116+
{e with txt = Alt es}
117+
| Opt e' -> {e with txt = Opt (squash_codes e')}
118+
| Repeat (range, e') -> {e with txt = Repeat (range, squash_codes e')}
119+
| Nongreedy e' -> {e with txt = Nongreedy (squash_codes e')}
120+
| Capture e' -> {e with txt = Capture (squash_codes e')}
121+
| Capture_as (name, e') -> {e with txt = Capture_as (name, squash_codes e')}
122+
| Call _ -> e
94123
end
95124

96125
let fresh_var =
@@ -114,11 +143,11 @@ let rec must_match p i =
114143
true
115144

116145
let extract_bindings ~pos s =
117-
let r = Regexp.parse_exn ~pos s in
146+
let r = Regexp.(squash_codes @@ parse_exn ~pos s) in
118147
let nG, bs = Regexp.bindings r in
119-
let re_str = Regexp.to_string r in
120148
let loc = Location.none in
121-
(estring ~loc re_str, bs, nG)
149+
let re_expr = Regexp.to_re_expr ~loc ~in_let:false r in
150+
(re_expr, bs, nG)
122151

123152
let rec wrap_group_bindings ~loc rhs offG = function
124153
| [] -> rhs
@@ -137,6 +166,19 @@ let rec wrap_group_bindings ~loc rhs offG = function
137166
let [%p ppat_var ~loc varG] = [%e eG] in
138167
[%e wrap_group_bindings ~loc rhs offG bs]]
139168

169+
let transform_let =
170+
List.map
171+
begin
172+
fun vb ->
173+
match vb.pvb_pat.ppat_desc, vb.pvb_expr.pexp_desc with
174+
| Ppat_var { txt = _; loc }, Pexp_constant (Pconst_string (value, _, _)) ->
175+
let parsed = Regexp.(squash_codes @@ parse_exn value) in
176+
let re_expr = Regexp.to_re_expr ~loc ~in_let:true parsed in
177+
let expr = [%expr [%e re_expr]] in
178+
{ vb with pvb_expr = expr }
179+
| _ -> vb
180+
end
181+
140182
let transform_cases ~loc cases =
141183
let aux case =
142184
if case.pc_guard <> None then
@@ -173,7 +215,7 @@ let transform_cases ~loc cases =
173215
let cases = List.rev_map aux cases in
174216
let res = pexp_array ~loc (List.map (fun (re, _, _, _) -> re) cases) in
175217
let comp = [%expr
176-
let a = Array.map (fun s -> Re.mark (Re.Perl.re s)) [%e res] in
218+
let a = Array.map (fun re -> Re.mark re) [%e res] in
177219
let marks = Array.map fst a in
178220
let re = Re.compile (Re.alt (Array.to_list (Array.map snd a))) in
179221
(re, marks)
@@ -202,33 +244,46 @@ let transform_cases ~loc cases =
202244
(cases, re_binding)
203245

204246
let transformation = object
205-
inherit [value_binding list] Ast_traverse.fold_map as super
247+
inherit [value_binding list * value_binding list] Ast_traverse.fold_map as super
248+
249+
method! structure_item item (acc, let_acc) =
250+
match item.pstr_desc with
251+
(* let%pcre x = {|some regex|}*)
252+
| Pstr_extension (({ txt = "pcre"; loc }, PStr [ { pstr_desc = Pstr_value (Nonrecursive, vbs); _ } ]), _) ->
253+
let bindings = transform_let vbs in
254+
let dummy = {item with pstr_desc = Pstr_eval ([%expr ()], [])} in
255+
dummy, (acc, bindings @ let_acc)
256+
| _ -> super#structure_item item (acc, let_acc)
206257

207258
method! expression e_ext acc =
208-
let e_ext, acc = super#expression e_ext acc in
259+
let e_ext, (acc, let_acc) = super#expression e_ext acc in
209260
(match e_ext.pexp_desc with
210261
| Pexp_extension
211262
({txt = "pcre"; _}, PStr [{pstr_desc = Pstr_eval (e, _); _}]) ->
212263
let loc = e.pexp_loc in
213264
(match e.pexp_desc with
214265
| Pexp_match (e, cases) ->
215266
let cases, binding = transform_cases ~loc cases in
216-
([%expr let _ppx_regexp_v = [%e e] in [%e cases]], binding :: acc)
267+
([%expr let _ppx_regexp_v = [%e e] in [%e cases]], (binding :: acc, let_acc))
217268
| Pexp_function (cases) ->
218269
let cases, binding = transform_cases ~loc cases in
219-
([%expr fun _ppx_regexp_v -> [%e cases]], binding :: acc)
270+
([%expr fun _ppx_regexp_v -> [%e cases]], (binding :: acc, let_acc))
220271
| _ ->
221272
error ~loc "[%%pcre] only applies to match an function.")
222-
| _ -> (e_ext, acc))
273+
| _ -> (e_ext, (acc, let_acc)))
223274
end
224275

225276
let impl str =
226-
let str, rev_bindings = transformation#structure str [] in
277+
let str, (rev_bindings, let_bindings) = transformation#structure str ([], []) in
227278
if rev_bindings = [] then str else
228-
let re_str =
229279
let loc = Location.none in
230-
[%str open (struct [%%i pstr_value ~loc Nonrecursive rev_bindings] end)]
231-
in
232-
re_str @ str
280+
let all_bindings = List.rev let_bindings @ rev_bindings in
281+
let struct_items =
282+
List.fold_left (fun acc binding ->
283+
acc @ [%str let [%p binding.pvb_pat] = [%e binding.pvb_expr]]
284+
) [] all_bindings
285+
in
286+
let mod_expr = pmod_structure ~loc struct_items in
287+
[%str open [%m mod_expr]] @ str
233288

234289
let () = Driver.register_transformation ~impl "ppx_regexp"

tests/test_ppx_regexp.ml

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,23 @@ let test5 = function%pcre
7676
| _ -> assert false)
7777
| _ -> assert false
7878

79+
let%pcre digit = {|[0-9]|}
80+
let%pcre word = {|[a-zA-Z]+|}
81+
let%pcre sep = {|[,;]|}
82+
let%pcre sep_spc = {|(?&sep)| |}
83+
84+
let test6 = function%pcre
85+
| {|^(?&digit)+$|} -> `AllDigits
86+
| {|^(?&word)(?&sep_spc)(?&word)$|} -> `TwoWords
87+
| {|^(?<first>(?&digit)+)-(?<second>(?&digit)+)$|} -> `Range (first, second)
88+
| _ -> `Unknown
89+
90+
let test7 = function%pcre
91+
| {|^(?&num:digit)+$|} -> `Digit num
92+
| {|^(?&a:digit){2}-(?&b:digit){3}$|} -> (* repetitions after subst capture the last match *) `Code (a, b)
93+
| {|^(?&w1:word)(?&sep_spc)(?&w2:word)$|} -> `Words (w1, w2)
94+
| _ -> `Unknown
95+
7996
let () =
8097
test2 "<>";
8198
test2 "<a>";
@@ -92,7 +109,16 @@ let () =
92109
test3 "catch-all";
93110
assert (test4 "::123.456::" = ["123.456"]);
94111
assert (test4 "::abc xyz::" = ["abc"; "xyz"]);
95-
assert (test5 "abcd" = ("bcd", "cd", "d"))
112+
assert (test5 "abcd" = ("bcd", "cd", "d"));
113+
assert (test6 "12345" = `AllDigits);
114+
assert (test6 "hello world" = `TwoWords);
115+
assert (test6 "hello,world" = `TwoWords);
116+
assert (test6 "123-456" = `Range ("123", "456"));
117+
assert (test6 "abc123" = `Unknown);
118+
assert (test7 "999" = `Digit "9");
119+
assert (test7 "hello world" = `Words ("hello", "world"));
120+
assert (test7 "12-345" = `Code ("2", "5"));
121+
assert (test7 "xyz" = `Unknown)
96122

97123
(* It should work in a functor, and Re_pcre.regxp should be lifted to the
98124
* top-level. *)

0 commit comments

Comments
 (0)