@@ -13,7 +13,6 @@ module Cset = Sedlex_cset
1313(* Decision tree for partitions *)
1414
1515let default_loc = Location. none
16- let lident_loc ~loc s = { loc; txt = lident s }
1716
1817type decision_tree =
1918 | Lte of int * decision_tree * decision_tree
@@ -286,48 +285,86 @@ let codepoint i =
286285 failwith (Printf. sprintf " Invalid Unicode code point: %i" i);
287286 i
288287
289- let regexp_for_char c = Sedlex. chars (Cset. singleton (Char. code c))
290-
291- let regexp_for_string s =
292- let rec aux n =
293- if n = String. length s then Sedlex. eps
294- else Sedlex. seq (regexp_for_char s.[n]) (aux (succ n))
295- in
296- aux 0
297-
298- let err loc s =
299- raise (Location. Error (Location.Error. createf ~loc " Sedlex: %s" s))
288+ let char c = Cset. singleton (Char. code c)
289+ let uchar c = Cset. singleton (Uchar. to_int c)
290+
291+ let err loc fmt =
292+ Printf. ksprintf
293+ (fun s ->
294+ raise (Location. Error (Location.Error. createf ~loc " Sedlex: %s" s)))
295+ fmt
296+
297+ type encoding = Utf8 | Latin1 | Ascii
298+
299+ let string_of_encoding = function
300+ | Utf8 -> " UTF-8"
301+ | Latin1 -> " Latin-1"
302+ | Ascii -> " ASCII"
303+
304+ let rev_csets_of_string ~loc ~encoding s =
305+ match encoding with
306+ | Utf8 ->
307+ Utf8. fold
308+ ~f: (fun acc _ x ->
309+ match x with
310+ | `Malformed _ ->
311+ err loc " Malformed %s string" (string_of_encoding encoding)
312+ | `Uchar c -> uchar c :: acc)
313+ [] s
314+ | Latin1 ->
315+ let l = ref [] in
316+ for i = 0 to String. length s - 1 do
317+ l := char s.[i] :: ! l
318+ done ;
319+ ! l
320+ | Ascii ->
321+ let l = ref [] in
322+ for i = 0 to String. length s - 1 do
323+ match s.[i] with
324+ | '\x00' .. '\x7F' as c -> l := char c :: ! l
325+ | _ -> err loc " Malformed %s string" (string_of_encoding encoding)
326+ done ;
327+ ! l
300328
301329let rec repeat r = function
302330 | 0 , 0 -> Sedlex. eps
303331 | 0 , m -> Sedlex. alt Sedlex. eps (Sedlex. seq r (repeat r (0 , m - 1 )))
304332 | n , m -> Sedlex. seq r (repeat r (n - 1 , m - 1 ))
305333
306334let regexp_of_pattern env =
307- let rec char_pair_op func name p tuple =
335+ let rec char_pair_op func name ~ encoding p tuple =
308336 (* Construct something like Sub(a,b) *)
309337 match tuple with
310338 | Some { ppat_desc = Ppat_tuple [p0; p1] } -> begin
311- match func (aux p0) (aux p1) with
339+ match func (aux ~encoding p0) (aux ~encoding p1) with
312340 | Some r -> r
313341 | None ->
314- err p.ppat_loc @@ " the " ^ name
315- ^ " operator can only applied to single-character length \
316- regexps"
342+ err p.ppat_loc
343+ " the %s operator can only applied to single-character \
344+ length regexps"
345+ name
317346 end
318347 | _ ->
319- err p.ppat_loc @@ " the " ^ name
320- ^ " operator requires two arguments, like " ^ name ^ " (a,b) "
321- and aux p =
348+ err p.ppat_loc " the %s operator requires two arguments, like %s(a,b) "
349+ name name
350+ and aux ~ encoding p =
322351 (* interpret one pattern node *)
323352 match p.ppat_desc with
324- | Ppat_or (p1 , p2 ) -> Sedlex. alt (aux p1) (aux p2)
353+ | Ppat_or (p1 , p2 ) -> Sedlex. alt (aux ~encoding p1) (aux ~encoding p2)
325354 | Ppat_tuple (p :: pl ) ->
326- List. fold_left (fun r p -> Sedlex. seq r (aux p)) (aux p) pl
355+ List. fold_left
356+ (fun r p -> Sedlex. seq r (aux ~encoding p))
357+ (aux ~encoding p) pl
327358 | Ppat_construct ({ txt = Lident "Star" } , Some (_ , p )) ->
328- Sedlex. rep (aux p)
359+ Sedlex. rep (aux ~encoding p)
329360 | Ppat_construct ({ txt = Lident "Plus" } , Some (_ , p )) ->
330- Sedlex. plus (aux p)
361+ Sedlex. plus (aux ~encoding p)
362+ | Ppat_construct ({ txt = Lident "Utf8" } , Some (_ , p )) ->
363+ aux ~encoding: Utf8 p
364+ | Ppat_construct ({ txt = Lident "Latin1" } , Some (_ , p )) ->
365+ aux ~encoding: Latin1 p
366+ | Ppat_construct ({ txt = Lident "Ascii" } , Some (_ , p )) ->
367+ aux ~encoding: Ascii p
331368 | Ppat_construct
332369 ( { txt = Lident " Rep" },
333370 Some
@@ -347,19 +384,19 @@ let regexp_of_pattern env =
347384 | Pconst_integer (i1 , _ ), Pconst_integer (i2 , _ ) ->
348385 let i1 = int_of_string i1 in
349386 let i2 = int_of_string i2 in
350- if 0 < = i1 && i1 < = i2 then repeat (aux p0) (i1, i2)
387+ if 0 < = i1 && i1 < = i2 then repeat (aux ~encoding p0) (i1, i2)
351388 else err p.ppat_loc " Invalid range for Rep operator"
352389 | _ ->
353390 err p.ppat_loc " Rep must take an integer constant or interval"
354391 end
355392 | Ppat_construct ({ txt = Lident "Rep" } , _ ) ->
356393 err p.ppat_loc " the Rep operator takes 2 arguments"
357394 | Ppat_construct ({ txt = Lident "Opt" } , Some (_ , p )) ->
358- Sedlex. alt Sedlex. eps (aux p)
395+ Sedlex. alt Sedlex. eps (aux ~encoding p)
359396 | Ppat_construct ({ txt = Lident "Compl" } , arg ) -> begin
360397 match arg with
361398 | Some (_ , p0 ) -> begin
362- match Sedlex. compl (aux p0) with
399+ match Sedlex. compl (aux ~encoding p0) with
363400 | Some r -> r
364401 | None ->
365402 err p.ppat_loc
@@ -369,10 +406,10 @@ let regexp_of_pattern env =
369406 | _ -> err p.ppat_loc " the Compl operator requires an argument"
370407 end
371408 | Ppat_construct ({ txt = Lident "Sub" } , arg ) ->
372- char_pair_op Sedlex. subtract " Sub" p
409+ char_pair_op ~encoding Sedlex. subtract " Sub" p
373410 (Option. map (fun (_ , arg ) -> arg) arg)
374411 | Ppat_construct ({ txt = Lident "Intersect" } , arg ) ->
375- char_pair_op Sedlex. intersection " Intersect" p
412+ char_pair_op ~encoding Sedlex. intersection " Intersect" p
376413 (Option. map (fun (_ , arg ) -> arg) arg)
377414 | Ppat_construct ({ txt = Lident "Chars" } , arg ) -> (
378415 let const =
@@ -382,16 +419,26 @@ let regexp_of_pattern env =
382419 in
383420 match const with
384421 | Some (Pconst_string (s , _ , _ )) ->
385- let c = ref Cset. empty in
386- for i = 0 to String. length s - 1 do
387- c := Cset. union ! c (Cset. singleton (Char. code s.[i]))
388- done ;
389- Sedlex. chars ! c
422+ let l = rev_csets_of_string ~loc: p.ppat_loc ~encoding s in
423+ let chars = List. fold_left Cset. union Cset. empty l in
424+ Sedlex. chars chars
390425 | _ ->
391426 err p.ppat_loc " the Chars operator requires a string argument" )
392427 | Ppat_interval (i_start , i_end ) -> begin
393428 match (i_start, i_end) with
394429 | Pconst_char c1 , Pconst_char c2 ->
430+ let valid =
431+ match encoding with
432+ (* utf8 char interval can only match ascii because
433+ of the OCaml lexer. *)
434+ | Ascii | Utf8 -> (
435+ function '\x00' .. '\x7f' -> true | _ -> false )
436+ | Latin1 -> ( function _ -> true )
437+ in
438+ if not (valid c1 && valid c2) then
439+ err p.ppat_loc
440+ " this pattern is not a valid %s interval regexp"
441+ (string_of_encoding encoding);
395442 Sedlex. chars (Cset. interval (Char. code c1) (Char. code c2))
396443 | Pconst_integer (i1 , _ ), Pconst_integer (i2 , _ ) ->
397444 Sedlex. chars
@@ -402,20 +449,23 @@ let regexp_of_pattern env =
402449 end
403450 | Ppat_constant const -> begin
404451 match const with
405- | Pconst_string (s , _ , _ ) -> regexp_for_string s
406- | Pconst_char c -> regexp_for_char c
452+ | Pconst_string (s , _ , _ ) ->
453+ let rev_l = rev_csets_of_string s ~loc: p.ppat_loc ~encoding in
454+ List. fold_left
455+ (fun acc cset -> Sedlex. seq (Sedlex. chars cset) acc)
456+ Sedlex. eps rev_l
457+ | Pconst_char c -> Sedlex. chars (char c)
407458 | Pconst_integer (i , _ ) ->
408459 Sedlex. chars (Cset. singleton (codepoint (int_of_string i)))
409460 | _ -> err p.ppat_loc " this pattern is not a valid regexp"
410461 end
411462 | Ppat_var { txt = x } -> begin
412463 try StringMap. find x env
413- with Not_found ->
414- err p.ppat_loc (Printf. sprintf " unbound regexp %s" x)
464+ with Not_found -> err p.ppat_loc " unbound regexp %s" x
415465 end
416466 | _ -> err p.ppat_loc " this pattern is not a valid regexp"
417467 in
418- aux
468+ aux ~encoding: Ascii
419469
420470let previous = ref []
421471let regexps = ref []
@@ -468,7 +518,7 @@ let mapper =
468518 (this#define_regexp name p)#expression body
469519 | [% expr [% sedlex [% e? _]]] ->
470520 err e.pexp_loc
471- " the %sedlex extension is only recognized on match expressions"
521+ " the %% sedlex extension is only recognized on match expressions"
472522 | _ -> super#expression e
473523
474524 val toplevel = true
0 commit comments