Skip to content

Commit 9f99ae8

Browse files
committed
fix: new code generation for correct pattern guard semantics
1 parent 3c13d68 commit 9f99ae8

File tree

2 files changed

+211
-105
lines changed

2 files changed

+211
-105
lines changed

ppx_regexp/ppx_regexp.ml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,13 +56,13 @@ let transformation ctx =
5656
let make_transformations ~mode ~opts ~loc = function
5757
| Pexp_function cases ->
5858
let cases, binding = Transformations.transform_cases ~mode ~opts ~loc ~ctx cases in
59-
[%expr fun _ppx_regexp_v -> [%e cases]], binding :: acc
59+
[%expr fun _ppx_regexp_v -> [%e cases]], binding @ acc
6060
| Pexp_match (e, cases) ->
6161
let cases, binding = Transformations.transform_cases ~mode ~opts ~loc ~ctx cases in
6262
( [%expr
6363
let _ppx_regexp_v = [%e e] in
6464
[%e cases]],
65-
binding :: acc )
65+
binding @ acc )
6666
| _ -> Util.error ~loc "[%%pcre] and [%%mik] only apply to match, function and global let declarations of strings."
6767
in
6868
match e_ext.pexp_desc with

ppx_regexp/transformations.ml

Lines changed: 209 additions & 103 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,41 @@ module Regexp = struct
2828
in
2929
function { Location.txt = Capture_as (idr, conv, e); _ } -> recurse true e (0, [ idr, None, conv, true ]) | e -> recurse true e (0, [])
3030

31+
let to_re_expr ~ctx =
32+
let rec recurse ~ctx (e' : _ Location.loc) =
33+
let loc = e'.Location.loc in
34+
match e'.Location.txt with
35+
| Code s -> [%expr Re.Perl.re [%e estring ~loc s]]
36+
| Seq es ->
37+
let exprs = List.map (recurse ~ctx) es in
38+
[%expr Re.seq [%e elist ~loc exprs]]
39+
| Alt es ->
40+
let exprs = List.map (recurse ~ctx) es in
41+
[%expr Re.alt [%e elist ~loc exprs]]
42+
| Opt e -> [%expr Re.opt [%e recurse ~ctx e]]
43+
| Repeat ({ Location.txt = i, j_opt; _ }, e) ->
44+
let e_i = eint ~loc i in
45+
let e_j = match j_opt with None -> [%expr None] | Some j -> [%expr Some [%e eint ~loc j]] in
46+
[%expr Re.repn [%e recurse ~ctx e] [%e e_i] [%e e_j]]
47+
| Nongreedy e -> [%expr Re.non_greedy [%e recurse ~ctx e]]
48+
| Capture _ -> Util.error ~loc "Unnamed capture is not allowed for %%pcre and %%mikmatch."
49+
| Capture_as (_, _, e) -> [%expr Re.group [%e recurse ~ctx e]]
50+
| Named_subs (idr, _, _, _) ->
51+
let content = get_substitution ~loc ~ctx idr in
52+
[%expr Re.group [%e recurse ~ctx content]]
53+
| Unnamed_subs (idr, _) ->
54+
let content = get_substitution ~loc ~ctx idr in
55+
recurse ~ctx content
56+
| Pipe_all (_, _, e) -> recurse ~ctx e
57+
| Call _ -> Util.error ~loc "Call is not allowed for %%pcre and %%mikmatch."
58+
and get_substitution ~loc ~ctx idr =
59+
let var_name = idr.txt in
60+
match Util.Ctx.find var_name ctx with
61+
| Some value -> value
62+
| None -> Util.error ~loc "Variable '%s' not found. %%pcre and %%mikmatch only support global let bindings for substitution." var_name
63+
in
64+
function { Location.txt = Capture_as (_, _, e); _ } -> recurse ~ctx e | e -> recurse ~ctx e
65+
3166
let to_string ~ctx =
3267
let p_alt, p_seq, p_suffix, p_atom = 0, 1, 2, 3 in
3368
let delimit_if b s = if b then "(?:" ^ s ^ ")" else s in
@@ -72,6 +107,14 @@ module Regexp = struct
72107
function { Location.txt = Capture_as (_, _, e); _ } -> recurse 0 e | e -> recurse 0 e
73108
end
74109

110+
let apply_re_opts ~loc re_expr opts =
111+
let rec apply re = function
112+
| [] -> re
113+
| `Caseless :: rest -> apply [%expr Re.no_case [%e re]] rest
114+
| `Anchored :: rest -> apply [%expr Re.whole_string [%e re]] rest
115+
in
116+
apply re_expr opts
117+
75118
let rec wrap_group_bindings ~loc ~captured_acc rhs offG = function
76119
| [] -> rhs
77120
| [ (varG, _, Some (Regexp_types.Pipe_all_func func_name), _) ] ->
@@ -102,28 +145,6 @@ let rec wrap_group_bindings ~loc ~captured_acc rhs offG = function
102145
let [%p pat] = [%e eG] in
103146
[%e wrap_group_bindings ~loc ~captured_acc:(pat_ident :: captured_acc) rhs offG bs]]
104147

105-
let guards_equal g1 g2 =
106-
match g1, g2 with
107-
| None, None -> true
108-
| Some e1, Some e2 ->
109-
(* TODO: simplified, should use a more sophisticated AST comparison *)
110-
Pprintast.string_of_expression e1 = Pprintast.string_of_expression e2
111-
| _ -> false
112-
113-
let group_by_guard cases =
114-
List.fold_left
115-
begin
116-
fun groups case ->
117-
let guard = case.pc_guard in
118-
let rec add_to_groups = function
119-
| [] -> [ guard, [ case ] ]
120-
| (g, cases) :: rest -> if guards_equal g guard then (g, case :: cases) :: rest else (g, cases) :: add_to_groups rest
121-
in
122-
add_to_groups groups
123-
end
124-
[] cases
125-
|> List.map (fun (g, cases) -> g, cases)
126-
127148
let rec separate_defaults acc = function
128149
| [] -> List.rev acc, []
129150
| ({ pc_lhs = { ppat_desc = Ppat_any; _ }; _ } as case) :: rest -> acc, case :: rest
@@ -138,9 +159,8 @@ let rec create_opts ~loc = function
138159
let extract_bindings ~(parser : ?pos:position -> string -> string Regexp_types.t) ~ctx ~pos s =
139160
let r = parser ~pos s in
140161
let nG, bs = Regexp.bindings r in
141-
let re_str = Regexp.to_string ~ctx r in
142-
let loc = Location.none in
143-
estring ~loc re_str, bs, nG
162+
let re = Regexp.to_re_expr ~ctx r in
163+
re, bs, nG
144164

145165
let make_default_rhs ~loc = function
146166
| [] ->
@@ -208,102 +228,188 @@ let transform_cases ~mode ~opts ~loc ~ctx cases =
208228
re, nG, bs, case.pc_rhs, case.pc_guard
209229
end
210230
in
211-
let group_by_pattern cases =
212-
List.fold_left
213-
begin
214-
fun acc (re, nG, bs, rhs, guard) ->
215-
let found, groups =
216-
List.fold_left
217-
begin
218-
fun (found, acc_groups) (re', cases) ->
219-
if found then found, (re', cases) :: acc_groups
220-
else if re = re' then true, (re', (nG, bs, rhs, guard) :: cases) :: acc_groups
221-
else false, (re', cases) :: acc_groups
222-
end
223-
(false, []) acc
231+
232+
let group_by_guard_and_re cases =
233+
let rec group acc current_group = function
234+
| [] -> if current_group = [] then acc else current_group :: acc
235+
| ((re, _, _, _, guard) as case) :: rest ->
236+
(match current_group with
237+
| [] -> group acc [ case ] rest
238+
| cases_in_group ->
239+
let can_merge =
240+
match guard with
241+
| None -> List.for_all (fun (_, _, _, _, g) -> g = None) cases_in_group
242+
| Some _ -> List.exists (fun (re', _, _, _, _) -> re = re') cases_in_group
224243
in
225-
if found then groups else (re, [ nG, bs, rhs, guard ]) :: groups
226-
end
227-
[] cases
244+
if can_merge then group acc (case :: current_group) rest else group (List.rev current_group :: acc) [ case ] rest)
245+
in
246+
group [] [] cases
228247
in
229248

230-
let compute_offsets l =
231-
let result, _ =
232-
List.fold_left
233-
begin
234-
fun (acc, offG) (re, case_group) ->
235-
let nG =
236-
let n, _, _, _ = List.hd (List.rev case_group) in
237-
n
238-
in
239-
(re, case_group, offG) :: acc, offG + nG
240-
end
241-
([], 0) l
249+
let compile_group group_idx group_cases =
250+
(* 1: group cases by their regex pattern *)
251+
let group_by_pattern cases =
252+
let add_case_to_groups (re, nG, bs, rhs, guard) groups =
253+
let rec update_groups = function
254+
| [] -> [ re, [ nG, bs, rhs, guard ] ]
255+
| (re', cases) :: rest when re = re' -> (re', (nG, bs, rhs, guard) :: cases) :: rest
256+
| group :: rest -> group :: update_groups rest
257+
in
258+
update_groups groups
259+
in
260+
List.fold_left (fun acc case -> add_case_to_groups case acc) [] cases |> List.rev
261+
in
262+
263+
(* 2: calculate offsets for capture groups *)
264+
let calculate_offsets pattern_groups =
265+
let rec calc acc offset = function
266+
| [] -> List.rev acc
267+
| (re, handlers) :: rest ->
268+
let max_captures = handlers |> List.map (fun (n, _, _, _) -> n) |> List.fold_left max 0 in
269+
calc ((re, handlers, offset) :: acc) (offset + max_captures) rest
270+
in
271+
calc [] 0 pattern_groups
242272
in
243-
List.rev result
273+
274+
(* 3: create handler function for cases *)
275+
let create_handler handler_name case_handlers offset =
276+
let rec make_guard_chain = function
277+
| [] -> [%expr None]
278+
| (_, bs, rhs, None) :: _ ->
279+
(* no guard - directly return result *)
280+
let bs_rev = List.rev bs in
281+
[%expr Some [%e wrap_group_bindings ~captured_acc:[] ~loc rhs offset bs_rev]]
282+
| (_, bs, rhs, Some guard_expr) :: rest ->
283+
(* has guard - check it and continue if fails *)
284+
let bs_rev = List.rev bs in
285+
let guarded = [%expr if [%e guard_expr] then Some [%e rhs] else [%e make_guard_chain rest]] in
286+
wrap_group_bindings ~captured_acc:[] ~loc guarded offset bs_rev
287+
in
288+
let body = [%expr fun _g -> [%e make_guard_chain (List.rev case_handlers)]] in
289+
handler_name, body
290+
in
291+
292+
let pattern_groups = group_by_pattern group_cases in
293+
let patterns_with_offsets = calculate_offsets pattern_groups in
294+
295+
let re_array = patterns_with_offsets |> List.map (fun (re, _, _) -> re) |> pexp_array ~loc in
296+
297+
let handlers =
298+
patterns_with_offsets
299+
|> List.mapi (fun i (_, case_handlers, offset) ->
300+
let handler_name = Printf.sprintf "_group%d_case_%d" group_idx i in
301+
create_handler handler_name case_handlers offset)
302+
in
303+
304+
re_array, handlers
244305
in
245306

246-
let cases, default_cases = separate_defaults [] cases in
247-
let default_rhs = make_default_rhs ~loc default_cases in
248-
let processed_cases = List.map aux cases |> group_by_pattern |> compute_offsets in
307+
let build_group_match_expr ~loc ~idx ~re_var_name ~handlers ~has_guards ~is_single_pattern =
308+
let re_var = pexp_ident ~loc { txt = Lident re_var_name; loc } in
309+
let continue_next = [%expr __ppx_regexp_try_next ([%e eint ~loc idx] + 1)] in
249310

250-
let res = pexp_array ~loc @@ List.map (fun (re, _, _) -> re) processed_cases in
311+
let build_exec_match ~on_match =
312+
[%expr match Re.exec_opt (fst [%e re_var]) _ppx_regexp_v with None -> [%e continue_next] | Some _g -> [%e on_match]]
313+
in
251314

252-
let opts_expr = create_opts ~loc opts in
315+
if is_single_pattern then (
316+
(* single pattern: direct handler call, no dispatcher *)
317+
let handler_name = fst (List.hd handlers) in
318+
let handler = pexp_ident ~loc { txt = Lident handler_name; loc } in
253319

254-
let comp =
255-
[%expr
256-
let a = Array.map (fun s -> Re.mark (Re.Perl.re ~opts:[%e opts_expr] s)) [%e res] in
257-
let marks = Array.map fst a in
258-
let re = Re.compile (Re.alt (Array.to_list (Array.map snd a))) in
259-
re, marks]
320+
build_exec_match ~on_match:[%expr match [%e handler] _g with Some result -> result | None -> [%e continue_next]])
321+
else (
322+
(* multiple patterns: use dispatcher *)
323+
let handlers_array = handlers |> List.map (fun (name, _) -> pexp_ident ~loc { txt = Lident name; loc }) |> pexp_array ~loc in
324+
325+
let dispatch_call = [%expr __ppx_regexp_dispatch (snd [%e re_var]) [%e handlers_array] _g] in
326+
327+
build_exec_match
328+
~on_match:
329+
(if has_guards then [%expr match [%e dispatch_call] with Some result -> result | None -> [%e continue_next]]
330+
else [%expr match [%e dispatch_call] with Some result -> result | None -> assert false]))
260331
in
261332

262-
let var = Util.fresh_var () in
263-
let re_binding = value_binding ~loc ~pat:(ppat_var ~loc { txt = var; loc }) ~expr:comp in
264-
let e_comp = pexp_ident ~loc { txt = Lident var; loc } in
333+
let cases, default_cases = separate_defaults [] cases in
334+
let default_rhs = make_default_rhs ~loc default_cases in
335+
let processed_cases = List.map aux cases in
336+
let case_groups = group_by_guard_and_re processed_cases in
265337

266-
let case_bindings =
338+
let compiled_groups =
267339
List.mapi
268-
begin
269-
fun i (_, case_group, offG) ->
270-
let handler_name = Printf.sprintf "_case_%d" i in
271-
let handler_body =
272-
let rec mk_guard_chains = function
273-
| [] -> [%expr None]
274-
| (_, bs, rhs, guard) :: rest ->
275-
let bs = List.rev bs in
276-
begin
277-
match guard with
278-
| None -> [%expr Some [%e wrap_group_bindings ~captured_acc:[] ~loc rhs offG bs]]
279-
| Some guard_expr ->
280-
let guarded = [%expr if [%e guard_expr] then Some [%e rhs] else [%e mk_guard_chains rest]] in
281-
wrap_group_bindings ~captured_acc:[] ~loc guarded offG bs
282-
end
340+
(fun i group_cases ->
341+
let re_var_name = Util.fresh_var () in
342+
let re_array, handlers = compile_group i group_cases in
343+
re_var_name, re_array, handlers)
344+
case_groups
345+
in
346+
347+
let re_bindings =
348+
List.map
349+
(fun (var_name, re_array, _) ->
350+
let is_single = match re_array.pexp_desc with Pexp_array [ _ ] -> true | _ -> false in
351+
352+
let comp_expr =
353+
if is_single then (
354+
(* single pattern - no marks needed *)
355+
let single_re = match re_array.pexp_desc with Pexp_array [ re ] -> re | _ -> assert false in
356+
[%expr
357+
let re = Re.compile [%e apply_re_opts ~loc single_re opts] in
358+
re, [||]])
359+
else (
360+
(* multiple patterns - mark each one *)
361+
let res_with_opts =
362+
match re_array.pexp_desc with
363+
| Pexp_array res -> res |> List.map (fun re -> [%expr Re.mark [%e apply_re_opts ~loc re opts]])
364+
| _ -> assert false
283365
in
284-
[%expr fun _g -> [%e mk_guard_chains (List.rev case_group)]]
285-
in
286-
value_binding ~loc ~pat:(ppat_var ~loc { txt = handler_name; loc }) ~expr:handler_body
287-
end
288-
processed_cases
366+
[%expr
367+
let a = [%e pexp_array ~loc res_with_opts] in
368+
let marks = Array.map fst a in
369+
let re = Re.compile (Re.alt (Array.to_list (Array.map snd a))) in
370+
re, marks])
371+
in
372+
value_binding ~loc ~pat:(ppat_var ~loc { txt = var_name; loc }) ~expr:comp_expr)
373+
compiled_groups
289374
in
290375

291-
let handlers_array =
292-
pexp_array ~loc @@ List.mapi (fun i _ -> pexp_ident ~loc { txt = Lident (Printf.sprintf "_case_%d" i); loc }) processed_cases
376+
let handler_bindings =
377+
compiled_groups
378+
|> List.concat_map (fun (_, _, handlers) ->
379+
handlers |> List.map (fun (name, body) -> value_binding ~loc ~pat:(ppat_var ~loc { txt = name; loc }) ~expr:body))
293380
in
294381

295-
let match_expr =
382+
let build_match_cascade groups =
383+
let groups_with_info =
384+
List.mapi
385+
(fun i (re_var_name, re_array, handlers) ->
386+
let group_cases = List.nth case_groups i in
387+
let has_guards = List.exists (fun (_, _, _, _, g) -> g <> None) group_cases in
388+
let is_single_pattern = match re_array.pexp_desc with Pexp_array [ _ ] -> true | _ -> false in
389+
i, re_var_name, handlers, has_guards, is_single_pattern)
390+
groups
391+
in
392+
393+
let match_cases =
394+
groups_with_info
395+
|> List.map (fun (idx, re_var_name, handlers, has_guards, is_single_pattern) ->
396+
let match_expr = build_group_match_expr ~loc ~idx ~re_var_name ~handlers ~has_guards ~is_single_pattern in
397+
case ~lhs:(ppat_constant ~loc (Pconst_integer (string_of_int idx, None))) ~guard:None ~rhs:match_expr)
398+
in
399+
400+
let default_case = case ~lhs:(ppat_any ~loc) ~guard:None ~rhs:default_rhs in
401+
296402
[%expr
297-
match Re.exec_opt (fst [%e e_comp]) _ppx_regexp_v with
298-
| None -> [%e default_rhs]
299-
| Some _g ->
300-
[%e
301-
pexp_let ~loc Nonrecursive case_bindings
302-
[%expr
303-
let handlers = [%e handlers_array] in
304-
match __ppx_regexp_dispatch (snd [%e e_comp]) handlers _g with Some result -> result | None -> [%e default_rhs]]]]
403+
let rec __ppx_regexp_try_next group_idx = [%e pexp_match ~loc [%expr group_idx] (match_cases @ [ default_case ])] in
404+
__ppx_regexp_try_next 0]
305405
in
306-
match_expr, re_binding
406+
407+
let match_cascade = build_match_cascade compiled_groups in
408+
409+
(* Add handler bindings if needed *)
410+
let match_expr = if handler_bindings = [] then match_cascade else pexp_let ~loc Nonrecursive handler_bindings match_cascade in
411+
412+
match_expr, re_bindings
307413

308414
(* processes each case individually instead of combining them into one RE *)
309415
let transform_mixed_match ~loc ~ctx ?matched_expr cases acc =
@@ -337,8 +443,8 @@ let transform_mixed_match ~loc ~ctx ?matched_expr cases acc =
337443
match case with
338444
| `Ext (opts, re, _, _, _, _) ->
339445
let comp_var = Util.fresh_var () in
340-
let opts_expr = create_opts ~loc opts in
341-
let comp_expr = [%expr Re.compile (Re.Perl.re ~opts:[%e opts_expr] [%e re])] in
446+
let re_with_opts = apply_re_opts ~loc re opts in
447+
let comp_expr = [%expr Re.compile [%e re_with_opts]] in
342448
let binding = value_binding ~loc ~pat:(ppat_var ~loc { txt = comp_var; loc }) ~expr:comp_expr in
343449
Some (i, comp_var, binding)
344450
| _ -> None

0 commit comments

Comments
 (0)