Skip to content

Commit 51d4ceb

Browse files
committed
simplify + fix error locations
1 parent 0bfb41d commit 51d4ceb

File tree

5 files changed

+76
-100
lines changed

5 files changed

+76
-100
lines changed

lib/mik_parser.mly

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,9 @@ basic_atom:
202202
| LBRACKET char_set { unclosed_error "character set (missing ']')" $startpos($1) $endpos }
203203
| LBRACKET error { syntax_error "Invalid character set" $startpos $endpos }
204204

205+
(* VARIABLE SUBSTITUTION *)
206+
207+
(* Simple identifier captures, no name capture *)
205208
| LPAREN id = IDENT RPAREN
206209
| LPAREN id = MOD_IDENT RPAREN {
207210
(* (word) -> captures the result of calling 'word' pattern *)
@@ -210,6 +213,8 @@ basic_atom:
210213
let name_loc = wrap_loc $startpos(id) $endpos(id) (last_component id) in
211214
wrap_loc $startpos $endpos (Capture_as (name_loc, None, call_node))
212215
}
216+
217+
(* Simple named captures *)
213218
| LPAREN IDENT AS RPAREN | LPAREN MOD_IDENT AS RPAREN { missing_error "name after 'as'" $startpos($3) $endpos($4) }
214219
| LPAREN id = IDENT AS name = IDENT RPAREN
215220
| LPAREN id = MOD_IDENT AS name = IDENT RPAREN {
@@ -219,10 +224,15 @@ basic_atom:
219224
let name_loc = wrap_loc $startpos(name) $endpos(name) name in
220225
wrap_loc $startpos $endpos (Capture_as (name_loc, None, call_node))
221226
}
227+
228+
(* Named captures with type conversion + function app *)
222229
| LPAREN IDENT AS IDENT COLON RPAREN
223-
| LPAREN IDENT AS MOD_IDENT COLON RPAREN {
230+
| LPAREN IDENT AS MOD_IDENT COLON RPAREN
231+
| LPAREN MOD_IDENT AS IDENT COLON RPAREN
232+
| LPAREN MOD_IDENT AS MOD_IDENT COLON RPAREN {
224233
missing_error "type converter after ':'" $startpos($5) $endpos($6)
225234
}
235+
226236
| LPAREN id = IDENT AS name = IDENT COLON INT_CONVERTER RPAREN
227237
| LPAREN id = MOD_IDENT AS name = IDENT COLON INT_CONVERTER RPAREN {
228238
(* (digits as n : int) -> captures 'digits' pattern as 'n' converted to int *)
@@ -258,6 +268,8 @@ basic_atom:
258268
let typ = string_to_longident typ in
259269
wrap_loc $startpos $endpos (Capture_as (name_loc, Some (Func (func, Some typ)), call_node))
260270
}
271+
272+
(* Unclosed parentheses error cases for identifiers *)
261273
| LPAREN IDENT AS IDENT EOF?
262274
| LPAREN MOD_IDENT AS IDENT EOF? {
263275
unclosed_error "parentheses (missing ')')" $startpos($1) $endpos($4)
@@ -271,23 +283,46 @@ basic_atom:
271283
unclosed_error "parentheses (missing ')')" $startpos($1) $endpos($6)
272284
}
273285

286+
(* GENERAL PATTERNS *)
287+
288+
(* Simple, no capture *)
274289
| LPAREN pattern RPAREN {
275290
$2
276291
}
277292
| LPAREN RPAREN { missing_error "pattern inside parentheses" $startpos $endpos }
278293
| LPAREN pattern EOF? { unclosed_error "parentheses (missing ')')" $startpos($1) $endpos($2) }
279294

295+
(* Simple named capture *)
280296
| LPAREN pattern AS RPAREN { missing_error "capture name after 'as'" $startpos($3) $endpos($4) }
297+
| LPAREN pattern AS COLON {
298+
missing_error "capture name between 'as' and ':'" $startpos($3) $endpos($4)
299+
}
300+
| LPAREN pattern AS COLON INT_CONVERTER RPAREN {
301+
missing_error "capture name between 'as' and ':'" $startpos($3) $endpos($4)
302+
}
303+
| LPAREN pattern AS COLON FLOAT_CONVERTER RPAREN {
304+
missing_error "capture name between 'as' and ':'" $startpos($3) $endpos($4)
305+
}
306+
| LPAREN pattern AS COLON EQUAL ident RPAREN {
307+
missing_error "capture name between 'as' and ':='" $startpos($3) $endpos($5)
308+
}
309+
| LPAREN pattern AS COLON EQUAL ident COLON ident RPAREN {
310+
missing_error "capture name between 'as' and ':='" $startpos($3) $endpos($5)
311+
}
312+
281313
| LPAREN pattern AS name = IDENT RPAREN {
282314
let name_loc = wrap_loc $startpos(name) $endpos(name) name in
283315
wrap_loc $startpos $endpos (Capture_as (name_loc, None, $2))
284316
}
317+
318+
(* Named capture with type conversion + function app *)
285319
| LPAREN pattern AS IDENT COLON RPAREN {
286320
missing_error "type converter after ':'" $startpos($5) $endpos($6)
287321
}
288322
| LPAREN pattern AS IDENT COLON EQUAL RPAREN {
289323
missing_error "function name after ':='" $startpos($5) $endpos($6)
290324
}
325+
291326
| LPAREN pattern AS name = IDENT COLON INT_CONVERTER RPAREN {
292327
let name_loc = wrap_loc $startpos(name) $endpos(name) name in
293328
wrap_loc $startpos $endpos (Capture_as (name_loc, Some Int, $2))
@@ -307,6 +342,7 @@ basic_atom:
307342
let typ = string_to_longident typ in
308343
wrap_loc $startpos $endpos (Capture_as (name_loc, Some (Func (func, Some typ)), $2))
309344
}
345+
310346
| LPAREN pattern AS IDENT EOF? {
311347
unclosed_error "parentheses (missing ')')" $startpos($1) $endpos($4)
312348
}
@@ -317,6 +353,7 @@ basic_atom:
317353
unclosed_error "parentheses (missing ')')" $startpos($1) $endpos($6)
318354
}
319355

356+
(* Generic error case *)
320357
| LPAREN error { syntax_error "Invalid expression in parentheses" $startpos($2) $endpos }
321358

322359
ident:

lib/regexp.ml

Lines changed: 7 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -266,45 +266,20 @@ let parse_exn ~target:_ ?(pos = Lexing.dummy_pos) s =
266266

267267
let parse_mik_exn ~target ?(pos = Lexing.dummy_pos) s =
268268
let lexbuf = Lexing.from_string s in
269+
lexbuf.lex_curr_p <- pos;
270+
lexbuf.lex_start_p <- pos;
271+
lexbuf.lex_abs_pos <- pos.pos_cnum;
269272
let mk_loc ?loc pos lexbuf =
270273
let open Lexing in
271274
let open Location in
272275
match loc with
273-
| Some loc ->
274-
{
275-
loc_ghost = false;
276-
loc_start =
277-
{
278-
pos_fname = pos.pos_fname;
279-
pos_lnum = pos.pos_lnum + (loc.loc_start.pos_lnum - 1);
280-
pos_bol = pos.pos_bol;
281-
pos_cnum = pos.pos_cnum + loc.loc_start.pos_cnum;
282-
};
283-
loc_end =
284-
{
285-
pos_fname = pos.pos_fname;
286-
pos_lnum = pos.pos_lnum + (loc.loc_end.pos_lnum - 1);
287-
pos_bol = pos.pos_bol;
288-
pos_cnum = pos.pos_cnum + loc.loc_end.pos_cnum;
289-
};
290-
}
276+
| Some loc -> loc
291277
| None ->
278+
(* no location from parser, use lexbuf positions *)
292279
{
293280
loc_ghost = false;
294-
loc_start =
295-
{
296-
pos_fname = pos.pos_fname;
297-
pos_lnum = pos.pos_lnum;
298-
pos_bol = pos.pos_bol;
299-
pos_cnum = pos.pos_cnum + lexbuf.lex_start_p.pos_cnum;
300-
};
301-
loc_end =
302-
{
303-
pos_fname = pos.pos_fname;
304-
pos_lnum = pos.pos_lnum;
305-
pos_bol = pos.pos_bol;
306-
pos_cnum = pos.pos_cnum + lexbuf.lex_curr_p.pos_cnum;
307-
};
281+
loc_start = { pos with pos_cnum = pos.pos_cnum + lexbuf.lex_start_p.pos_cnum };
282+
loc_end = { pos with pos_cnum = pos.pos_cnum + lexbuf.lex_curr_p.pos_cnum };
308283
}
309284
in
310285
let main = match target with `Match -> Mik_parser.main_match_case | `Let -> Mik_parser.main_let_expr in

ppx_regexp/ppx_regexp.ml

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,9 @@ let transformation =
5555
{
5656
ptyp_desc =
5757
Ptyp_extension
58-
( { txt = ("pcre" | "mikmatch") as ext; loc },
59-
PStr [ { pstr_desc = Pstr_eval ({ pexp_desc = Pexp_constant (Pconst_string (pattern_str, _, _)); _ }, _); _ } ]
58+
( { txt = ("pcre" | "mikmatch") as ext; _ },
59+
PStr
60+
[ { pstr_desc = Pstr_eval ({ pexp_desc = Pexp_constant (Pconst_string (pattern_str, loc, _)); _ }, _); _ } ]
6061
);
6162
_;
6263
} ->
@@ -80,17 +81,15 @@ let transformation =
8081
fun (vbs_acc, bindings_acc) vb ->
8182
match vb.pvb_pat.ppat_desc, vb.pvb_expr.pexp_desc with
8283
(* pattern definition - let%mikmatch/%pcre name = {|/regex/|} *)
83-
| Ppat_var { txt = var_name; loc }, Pexp_constant (Pconst_string (_, _, _)) ->
84-
let binding = Transformations.transform_let ~mode vb in
84+
| Ppat_var { txt = var_name; _ }, Pexp_constant (Pconst_string (_, loc, _)) ->
85+
let binding = Transformations.transform_let ~loc ~mode vb in
8586
let alias = make_alias_binding ~loc ~var_name in
8687
alias :: vbs_acc, binding :: bindings_acc
8788
(* destructuring - let%mikmatch {|/pattern/|} = expr *)
8889
| Ppat_constant (Pconst_string (pattern_str, _, _)), _ ->
8990
let new_vb, new_bindings = Transformations.transform_destructuring_let ~mode ~loc:vb.pvb_loc pattern_str vb.pvb_expr in
9091
new_vb :: vbs_acc, new_bindings @ bindings_acc
91-
| _ ->
92-
let binding = Transformations.transform_let ~mode vb in
93-
binding :: vbs_acc, binding :: bindings_acc
92+
| _ -> vbs_acc, bindings_acc
9493
end
9594
([], acc) vbs
9695
in
@@ -103,11 +102,11 @@ let transformation =
103102
List.fold_left
104103
(fun (vbs_acc, bindings_acc) vb ->
105104
match vb.pvb_expr.pexp_desc with
106-
| Pexp_extension ({ txt = ("pcre" | "mikmatch") as ext; _ }, PStr [ { pstr_desc = Pstr_eval (expr, _); _ } ])
105+
| Pexp_extension ({ txt = ("pcre" | "mikmatch") as ext; loc }, PStr [ { pstr_desc = Pstr_eval (expr, _); _ } ])
107106
when match expr.pexp_desc with Pexp_constant (Pconst_string _) -> true | _ -> false ->
108107
let mode = if ext = "pcre" then `Pcre else `Mik in
109108
let new_vb = { vb with pvb_expr = expr } in
110-
let binding = Transformations.transform_let ~mode new_vb in
109+
let binding = Transformations.transform_let ~loc ~mode new_vb in
111110
let alias =
112111
match vb.pvb_pat.ppat_desc with Ppat_var { txt = var_name; loc } -> make_alias_binding ~loc ~var_name | _ -> new_vb
113112
in

ppx_regexp/transformations.ml

Lines changed: 22 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -107,9 +107,8 @@ module Regexp = struct
107107
| Pipe_all (_, _, e) -> recurse e
108108
| Call lid ->
109109
(* Use the Call node's own location, not the parent's location *)
110-
let call_loc = lid.loc in
111-
let ld = { txt = lid.txt; loc = call_loc } in
112-
if in_let then pexp_ident ~loc:call_loc ld else [%expr Re.group [%e pexp_ident ~loc:call_loc ld]]
110+
let ld = { txt = lid.txt; loc = lid.loc } in
111+
if in_let then pexp_ident ~loc:lid.loc ld else [%expr Re.group [%e pexp_ident ~loc:lid.loc ld]]
113112
in
114113
function { Location.txt = Capture_as (_, _, e); _ } -> recurse e | e -> recurse e
115114

@@ -143,51 +142,6 @@ module Regexp = struct
143142
| Pipe_all (r, f, e') -> { e with txt = Pipe_all (r, f, squash_codes e') }
144143
| Call _ -> e
145144

146-
let relocate ~pos e =
147-
let open Location in
148-
let adjust_loc loc =
149-
if loc = Location.none then loc
150-
else
151-
{
152-
loc with
153-
loc_start =
154-
{
155-
pos_fname = pos.pos_fname;
156-
pos_lnum = pos.pos_lnum + loc.loc_start.pos_lnum - 1;
157-
(* -1 because parser starts at line 1 *)
158-
pos_cnum = pos.pos_cnum + loc.loc_start.pos_cnum;
159-
pos_bol = pos.pos_bol;
160-
};
161-
loc_end =
162-
{
163-
pos_fname = pos.pos_fname;
164-
pos_lnum = pos.pos_lnum + loc.loc_end.pos_lnum - 1;
165-
pos_cnum = pos.pos_cnum + loc.loc_end.pos_cnum;
166-
pos_bol = pos.pos_bol;
167-
};
168-
}
169-
in
170-
171-
let rec recurse (node : _ Location.loc) =
172-
let new_loc = adjust_loc node.loc in
173-
let new_txt =
174-
match node.txt with
175-
| Code s -> Code s
176-
| Seq es -> Seq (List.map recurse es)
177-
| Alt es -> Alt (List.map recurse es)
178-
| Opt e -> Opt (recurse e)
179-
| Repeat (range, e) -> Repeat ({ range with loc = adjust_loc range.loc }, recurse e)
180-
| Nongreedy e -> Nongreedy (recurse e)
181-
| Caseless e -> Caseless (recurse e)
182-
| Capture e -> Capture (recurse e)
183-
| Capture_as (name, conv, e) -> Capture_as ({ name with loc = adjust_loc name.loc }, conv, recurse e)
184-
| Pipe_all (name, func, e) -> Pipe_all ({ name with loc = adjust_loc name.loc }, func, recurse e)
185-
| Call lid -> Call { lid with loc = adjust_loc lid.loc }
186-
in
187-
{ txt = new_txt; loc = new_loc }
188-
in
189-
recurse e
190-
191145
let check_alternation_captures ~loc pattern =
192146
let rec get_alt_captures = function
193147
| { Location.txt = Regexp_types.Alt branches; _ } -> Some (List.map get_branch_captures branches)
@@ -236,11 +190,23 @@ module Regexp = struct
236190
end
237191

238192
module Parser = struct
193+
let calculate_pattern_pos ~loc ~pattern_str =
194+
let open Lexing in
195+
match loc with
196+
| { Location.loc_start; loc_end; _ } ->
197+
let total_len = loc_end.pos_cnum - loc_start.pos_cnum in
198+
let pattern_len = String.length pattern_str in
199+
200+
let delimiter_overhead = total_len - pattern_len in
201+
let start_delimiter_size = delimiter_overhead / 2 in
202+
203+
{ loc_start with pos_cnum = loc_start.pos_cnum + start_delimiter_size }
204+
239205
let get_parser ~mode ~target ~pos = match mode with `Pcre -> Regexp.parse_exn ~target ~pos | `Mik -> Regexp.parse_mik_exn ~target ~pos
240206

241-
let run ~parser ~target ~pos s =
207+
let run ~parser ~target ~pos:_ s =
242208
let r, flags = parser s in
243-
let r = Regexp.(relocate ~pos @@ squash_codes r) in
209+
let r = Regexp.squash_codes r in
244210
let nG, bs = Regexp.bindings r in
245211
let re = Regexp.to_re_expr ~in_let:(target = `Let) r in
246212
r, re, bs, nG, flags
@@ -311,11 +277,11 @@ let build_exec_match ~loc ~re_var ~continue_next ~on_match =
311277

312278
(* Transformations *)
313279

314-
let transform_let ~mode vb =
280+
let transform_let ~loc ~mode vb =
315281
let parser = Parser.get_parser ~mode ~target:`Let in
316282
match vb.pvb_pat.ppat_desc, vb.pvb_expr.pexp_desc with
317283
| Ppat_var { txt = _; _ }, Pexp_constant (Pconst_string (value, _, _)) ->
318-
let pos = vb.pvb_loc.loc_start in
284+
let pos = loc.Location.loc_start in
319285
let parsed, _flags = parser ~pos value in
320286
let parsed = Regexp.squash_codes parsed in
321287
let re_expr = Regexp.to_re_expr ~in_let:true parsed in
@@ -325,7 +291,7 @@ let transform_let ~mode vb =
325291

326292
let transform_destructuring_let ~mode ~loc pattern_str expr =
327293
let target = `Match in
328-
let pos = loc.loc_start in
294+
let pos = Parser.calculate_pattern_pos ~loc ~pattern_str in
329295
let parser = Parser.get_parser ~mode ~target ~pos in
330296
let r, re, bs, _, flags = Parser.run ~parser ~target ~pos pattern_str in
331297
let capture_names = List.map (fun (name, _, _, _) -> name) (List.rev bs) in
@@ -389,9 +355,8 @@ let transform_cases ~mode ~loc cases =
389355
in
390356

391357
let parse_pattern ~mode case =
392-
Ast_pattern.(parse (pstring __')) case.pc_lhs.ppat_loc case.pc_lhs (fun { txt = re_src; loc = { loc_start; loc_end; _ } } ->
393-
let re_offset = (loc_end.pos_cnum - loc_start.pos_cnum - String.length re_src) / 2 in
394-
let pos = { loc_start with pos_cnum = loc_start.pos_cnum + re_offset } in
358+
Ast_pattern.(parse (pstring __')) case.pc_lhs.ppat_loc case.pc_lhs (fun { txt = re_src; loc } ->
359+
let pos = Parser.calculate_pattern_pos ~loc ~pattern_str:re_src in
395360
let parser = Parser.get_parser ~mode ~target ~pos in
396361
let _, re, bs, nG, flags = Parser.run ~parser ~target ~pos re_src in
397362
let re_str = Pprintast.string_of_expression re in
@@ -600,7 +565,7 @@ let transform_mixed_match ~loc ?matched_expr cases acc =
600565
end
601566

602567
let transform_type ~mode ~loc rec_flag type_name pattern_str _td =
603-
let pos = loc.Location.loc_start in
568+
let pos = Parser.calculate_pattern_pos ~loc ~pattern_str in
604569
let parser = Parser.get_parser ~mode ~target:`Let ~pos in
605570
let r, re, bs, _nG, flags = Parser.run ~parser ~target:`Let ~pos pattern_str in
606571

tests/test_ppx_regexp.ml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -582,7 +582,7 @@ let test_parse_with_neither _ =
582582
assert_equal (Format.asprintf "%a" pp_log log) input
583583

584584
type url =
585-
{%mikmatch|
585+
{%mikmatch|
586586
(("http" | "https") as scheme) "://"
587587
((alnum+ ('.' alnum+)*) as host)
588588
(':' (digit+ as port : int))?

0 commit comments

Comments
 (0)