Skip to content

Commit c7605da

Browse files
authored
Add utf8 support for string literal (#127)
1 parent d0269f8 commit c7605da

File tree

7 files changed

+192
-43
lines changed

7 files changed

+192
-43
lines changed

CHANGES.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# dev
22
- Implement Corrigendum #1: UTF-8 Shortest Form
3+
- Add utf8 support for string literal (#127)
34

45
# 3.4 (2025-03-28)
56
- Make the library compatibility with ppxlib.0.36 (#166)

README.md

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ The actions can call functions from the Sedlexing module to extract
8484

8585
Regular expressions are syntactically OCaml patterns:
8686

87-
- `"...."` (string constant): recognize the specified string
87+
- `"...."` (string constant): recognize the specified string.
8888
- `'....'` (character constant) : recognize the specified character
8989
- `i` (integer constant) : recognize the specified codepoint
9090
- `'...' .. '...'`: character range
@@ -103,6 +103,9 @@ Regular expressions are syntactically OCaml patterns:
103103
and recognize the set of items in `R1` but not in `R2` ("subtract")
104104
- `Intersect (R1,R2)` : assume that `R` is a single-character length regexp (see
105105
below) and recognize the set of items which are in both `R1` and `R2`
106+
- `Utf8 R` : string literals inside R are assumed to be utf-8 encoded.
107+
- `Latin1 R` : string literals inside R are assumed to be latin1 encoded.
108+
- `Ascii R` : string literals inside R are assumed to be ascii encoded.
106109
- `lid` (lowercase identifier) : reference a named regexp (see below)
107110

108111
A single-character length regexp is a regexp which does not contain (after
@@ -112,8 +115,9 @@ with a length different from one.
112115

113116

114117
Note:
115-
- The OCaml source is assumed to be encoded in Latin1 (for string
116-
and character literals).
118+
- The OCaml source is assumed to be encoded in UTF-8.
119+
- Strings and chars litterals will be interpreted in ASCII unless otherwise
120+
specified by the `Latin1`,`Ascii` and `Utf8` constructors in patterns.
117121

118122

119123
It is possible to define named regular expressions with the following

src/lib/sedlexing.mli

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,13 @@ module Utf8 : sig
274274

275275
(** As [Sedlexing.sub_lexeme] with a result encoded in UTF-8. *)
276276
val sub_lexeme : lexbuf -> int -> int -> string
277+
278+
module Helper : sig
279+
val width : char -> int
280+
val check_two : int -> int -> int
281+
val check_three : int -> int -> int -> int
282+
val check_four : int -> int -> int -> int -> int
283+
end
277284
end
278285

279286
module Utf16 : sig

src/syntax/ppx_sedlex.ml

Lines changed: 90 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ module Cset = Sedlex_cset
1313
(* Decision tree for partitions *)
1414

1515
let default_loc = Location.none
16-
let lident_loc ~loc s = { loc; txt = lident s }
1716

1817
type decision_tree =
1918
| Lte of int * decision_tree * decision_tree
@@ -286,48 +285,86 @@ let codepoint i =
286285
failwith (Printf.sprintf "Invalid Unicode code point: %i" i);
287286
i
288287

289-
let regexp_for_char c = Sedlex.chars (Cset.singleton (Char.code c))
290-
291-
let regexp_for_string s =
292-
let rec aux n =
293-
if n = String.length s then Sedlex.eps
294-
else Sedlex.seq (regexp_for_char s.[n]) (aux (succ n))
295-
in
296-
aux 0
297-
298-
let err loc s =
299-
raise (Location.Error (Location.Error.createf ~loc "Sedlex: %s" s))
288+
let char c = Cset.singleton (Char.code c)
289+
let uchar c = Cset.singleton (Uchar.to_int c)
290+
291+
let err loc fmt =
292+
Printf.ksprintf
293+
(fun s ->
294+
raise (Location.Error (Location.Error.createf ~loc "Sedlex: %s" s)))
295+
fmt
296+
297+
type encoding = Utf8 | Latin1 | Ascii
298+
299+
let string_of_encoding = function
300+
| Utf8 -> "UTF-8"
301+
| Latin1 -> "Latin-1"
302+
| Ascii -> "ASCII"
303+
304+
let rev_csets_of_string ~loc ~encoding s =
305+
match encoding with
306+
| Utf8 ->
307+
Utf8.fold
308+
~f:(fun acc _ x ->
309+
match x with
310+
| `Malformed _ ->
311+
err loc "Malformed %s string" (string_of_encoding encoding)
312+
| `Uchar c -> uchar c :: acc)
313+
[] s
314+
| Latin1 ->
315+
let l = ref [] in
316+
for i = 0 to String.length s - 1 do
317+
l := char s.[i] :: !l
318+
done;
319+
!l
320+
| Ascii ->
321+
let l = ref [] in
322+
for i = 0 to String.length s - 1 do
323+
match s.[i] with
324+
| '\x00' .. '\x7F' as c -> l := char c :: !l
325+
| _ -> err loc "Malformed %s string" (string_of_encoding encoding)
326+
done;
327+
!l
300328

301329
let rec repeat r = function
302330
| 0, 0 -> Sedlex.eps
303331
| 0, m -> Sedlex.alt Sedlex.eps (Sedlex.seq r (repeat r (0, m - 1)))
304332
| n, m -> Sedlex.seq r (repeat r (n - 1, m - 1))
305333

306334
let regexp_of_pattern env =
307-
let rec char_pair_op func name p tuple =
335+
let rec char_pair_op func name ~encoding p tuple =
308336
(* Construct something like Sub(a,b) *)
309337
match tuple with
310338
| Some { ppat_desc = Ppat_tuple [p0; p1] } -> begin
311-
match func (aux p0) (aux p1) with
339+
match func (aux ~encoding p0) (aux ~encoding p1) with
312340
| Some r -> r
313341
| None ->
314-
err p.ppat_loc @@ "the " ^ name
315-
^ " operator can only applied to single-character length \
316-
regexps"
342+
err p.ppat_loc
343+
"the %s operator can only applied to single-character \
344+
length regexps"
345+
name
317346
end
318347
| _ ->
319-
err p.ppat_loc @@ "the " ^ name
320-
^ " operator requires two arguments, like " ^ name ^ "(a,b)"
321-
and aux p =
348+
err p.ppat_loc "the %s operator requires two arguments, like %s(a,b)"
349+
name name
350+
and aux ~encoding p =
322351
(* interpret one pattern node *)
323352
match p.ppat_desc with
324-
| Ppat_or (p1, p2) -> Sedlex.alt (aux p1) (aux p2)
353+
| Ppat_or (p1, p2) -> Sedlex.alt (aux ~encoding p1) (aux ~encoding p2)
325354
| Ppat_tuple (p :: pl) ->
326-
List.fold_left (fun r p -> Sedlex.seq r (aux p)) (aux p) pl
355+
List.fold_left
356+
(fun r p -> Sedlex.seq r (aux ~encoding p))
357+
(aux ~encoding p) pl
327358
| Ppat_construct ({ txt = Lident "Star" }, Some (_, p)) ->
328-
Sedlex.rep (aux p)
359+
Sedlex.rep (aux ~encoding p)
329360
| Ppat_construct ({ txt = Lident "Plus" }, Some (_, p)) ->
330-
Sedlex.plus (aux p)
361+
Sedlex.plus (aux ~encoding p)
362+
| Ppat_construct ({ txt = Lident "Utf8" }, Some (_, p)) ->
363+
aux ~encoding:Utf8 p
364+
| Ppat_construct ({ txt = Lident "Latin1" }, Some (_, p)) ->
365+
aux ~encoding:Latin1 p
366+
| Ppat_construct ({ txt = Lident "Ascii" }, Some (_, p)) ->
367+
aux ~encoding:Ascii p
331368
| Ppat_construct
332369
( { txt = Lident "Rep" },
333370
Some
@@ -347,19 +384,19 @@ let regexp_of_pattern env =
347384
| Pconst_integer (i1, _), Pconst_integer (i2, _) ->
348385
let i1 = int_of_string i1 in
349386
let i2 = int_of_string i2 in
350-
if 0 <= i1 && i1 <= i2 then repeat (aux p0) (i1, i2)
387+
if 0 <= i1 && i1 <= i2 then repeat (aux ~encoding p0) (i1, i2)
351388
else err p.ppat_loc "Invalid range for Rep operator"
352389
| _ ->
353390
err p.ppat_loc "Rep must take an integer constant or interval"
354391
end
355392
| Ppat_construct ({ txt = Lident "Rep" }, _) ->
356393
err p.ppat_loc "the Rep operator takes 2 arguments"
357394
| Ppat_construct ({ txt = Lident "Opt" }, Some (_, p)) ->
358-
Sedlex.alt Sedlex.eps (aux p)
395+
Sedlex.alt Sedlex.eps (aux ~encoding p)
359396
| Ppat_construct ({ txt = Lident "Compl" }, arg) -> begin
360397
match arg with
361398
| Some (_, p0) -> begin
362-
match Sedlex.compl (aux p0) with
399+
match Sedlex.compl (aux ~encoding p0) with
363400
| Some r -> r
364401
| None ->
365402
err p.ppat_loc
@@ -369,10 +406,10 @@ let regexp_of_pattern env =
369406
| _ -> err p.ppat_loc "the Compl operator requires an argument"
370407
end
371408
| Ppat_construct ({ txt = Lident "Sub" }, arg) ->
372-
char_pair_op Sedlex.subtract "Sub" p
409+
char_pair_op ~encoding Sedlex.subtract "Sub" p
373410
(Option.map (fun (_, arg) -> arg) arg)
374411
| Ppat_construct ({ txt = Lident "Intersect" }, arg) ->
375-
char_pair_op Sedlex.intersection "Intersect" p
412+
char_pair_op ~encoding Sedlex.intersection "Intersect" p
376413
(Option.map (fun (_, arg) -> arg) arg)
377414
| Ppat_construct ({ txt = Lident "Chars" }, arg) -> (
378415
let const =
@@ -382,16 +419,26 @@ let regexp_of_pattern env =
382419
in
383420
match const with
384421
| Some (Pconst_string (s, _, _)) ->
385-
let c = ref Cset.empty in
386-
for i = 0 to String.length s - 1 do
387-
c := Cset.union !c (Cset.singleton (Char.code s.[i]))
388-
done;
389-
Sedlex.chars !c
422+
let l = rev_csets_of_string ~loc:p.ppat_loc ~encoding s in
423+
let chars = List.fold_left Cset.union Cset.empty l in
424+
Sedlex.chars chars
390425
| _ ->
391426
err p.ppat_loc "the Chars operator requires a string argument")
392427
| Ppat_interval (i_start, i_end) -> begin
393428
match (i_start, i_end) with
394429
| Pconst_char c1, Pconst_char c2 ->
430+
let valid =
431+
match encoding with
432+
(* utf8 char interval can only match ascii because
433+
of the OCaml lexer. *)
434+
| Ascii | Utf8 -> (
435+
function '\x00' .. '\x7f' -> true | _ -> false)
436+
| Latin1 -> ( function _ -> true)
437+
in
438+
if not (valid c1 && valid c2) then
439+
err p.ppat_loc
440+
"this pattern is not a valid %s interval regexp"
441+
(string_of_encoding encoding);
395442
Sedlex.chars (Cset.interval (Char.code c1) (Char.code c2))
396443
| Pconst_integer (i1, _), Pconst_integer (i2, _) ->
397444
Sedlex.chars
@@ -402,20 +449,23 @@ let regexp_of_pattern env =
402449
end
403450
| Ppat_constant const -> begin
404451
match const with
405-
| Pconst_string (s, _, _) -> regexp_for_string s
406-
| Pconst_char c -> regexp_for_char c
452+
| Pconst_string (s, _, _) ->
453+
let rev_l = rev_csets_of_string s ~loc:p.ppat_loc ~encoding in
454+
List.fold_left
455+
(fun acc cset -> Sedlex.seq (Sedlex.chars cset) acc)
456+
Sedlex.eps rev_l
457+
| Pconst_char c -> Sedlex.chars (char c)
407458
| Pconst_integer (i, _) ->
408459
Sedlex.chars (Cset.singleton (codepoint (int_of_string i)))
409460
| _ -> err p.ppat_loc "this pattern is not a valid regexp"
410461
end
411462
| Ppat_var { txt = x } -> begin
412463
try StringMap.find x env
413-
with Not_found ->
414-
err p.ppat_loc (Printf.sprintf "unbound regexp %s" x)
464+
with Not_found -> err p.ppat_loc "unbound regexp %s" x
415465
end
416466
| _ -> err p.ppat_loc "this pattern is not a valid regexp"
417467
in
418-
aux
468+
aux ~encoding:Ascii
419469

420470
let previous = ref []
421471
let regexps = ref []
@@ -468,7 +518,7 @@ let mapper =
468518
(this#define_regexp name p)#expression body
469519
| [%expr [%sedlex [%e? _]]] ->
470520
err e.pexp_loc
471-
"the %sedlex extension is only recognized on match expressions"
521+
"the %%sedlex extension is only recognized on match expressions"
472522
| _ -> super#expression e
473523

474524
val toplevel = true

src/syntax/utf8.ml

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
open Sedlexing
2+
3+
let unsafe_byte s j = Char.code (String.unsafe_get s j)
4+
let malformed s j l = `Malformed (String.sub s j l)
5+
6+
let r_utf_8 s j l =
7+
(* assert (0 <= j && 0 <= l && j + l <= String.length s); *)
8+
let uchar c = `Uchar (Uchar.unsafe_of_int c) in
9+
match l with
10+
| 1 -> uchar (unsafe_byte s j)
11+
| 2 -> (
12+
let b0 = unsafe_byte s j in
13+
let b1 = unsafe_byte s (j + 1) in
14+
match Utf8.Helper.check_two b0 b1 with
15+
| i -> uchar i
16+
| exception MalFormed -> malformed s j l)
17+
| 3 -> (
18+
let b0 = unsafe_byte s j in
19+
let b1 = unsafe_byte s (j + 1) in
20+
let b2 = unsafe_byte s (j + 2) in
21+
match Utf8.Helper.check_three b0 b1 b2 with
22+
| i -> uchar i
23+
| exception MalFormed -> malformed s j l)
24+
| 4 -> (
25+
let b0 = unsafe_byte s j in
26+
let b1 = unsafe_byte s (j + 1) in
27+
let b2 = unsafe_byte s (j + 2) in
28+
let b3 = unsafe_byte s (j + 3) in
29+
match Utf8.Helper.check_four b0 b1 b2 b3 with
30+
| i -> uchar i
31+
| exception MalFormed -> malformed s j l)
32+
| _ -> assert false
33+
34+
let fold ~f acc s =
35+
let rec loop acc f s i last =
36+
if i > last then acc
37+
else (
38+
match Utf8.Helper.width (String.unsafe_get s i) with
39+
| exception MalFormed ->
40+
loop (f acc i (malformed s i 1)) f s (i + 1) last
41+
| need ->
42+
let rem = last - i + 1 in
43+
if rem < need then f acc i (malformed s i rem)
44+
else loop (f acc i (r_utf_8 s i need)) f s (i + need) last)
45+
in
46+
let pos = 0 in
47+
let len = String.length s in
48+
let last = pos + len - 1 in
49+
loop acc f s pos last

src/syntax/utf8.mli

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
val fold :
2+
f:('a -> int -> [> `Malformed of string | `Uchar of Uchar.t ] -> 'a) ->
3+
'a ->
4+
string ->
5+
'a

test/utf8.ml

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
open Printf
2+
3+
let next_tok buf =
4+
let open Sedlexing.Utf8 in
5+
match%sedlex buf with
6+
| "a", Utf8 (Chars "+-×÷") -> sprintf "with Chars: %s" (lexeme buf)
7+
| "b", Utf8 ("+" | "-" | "×" | "÷") ->
8+
sprintf "with or_pattern: %s" (lexeme buf)
9+
| Latin1 "\xc0", Utf8 "À", Ascii (Utf8 (Latin1 (Utf8 (Chars "À")))) ->
10+
sprintf "mixed encoding: %s" (lexeme buf)
11+
| Ascii (Star '\x00' .. '\x7f') -> sprintf "only ascii: %s" (lexeme buf)
12+
| Utf8 (Star '\x00' .. '\x7f') ->
13+
assert false
14+
(* utf8 char interval can only match ascii because of the OCaml lexer. The regexp above should match instead *)
15+
| Latin1 (Star '\x00' .. '\xff') -> sprintf "only latin1: %s" (lexeme buf)
16+
| _ -> failwith (sprintf "Unexpected character: %s" (lexeme buf))
17+
18+
let%expect_test _ =
19+
Sedlexing.Utf8.from_string "a+" |> next_tok |> print_string;
20+
[%expect {| with Chars: a+ |}];
21+
Sedlexing.Utf8.from_string "" |> next_tok |> print_string;
22+
[%expect {| with Chars: a÷ |}];
23+
Sedlexing.Utf8.from_string "b+" |> next_tok |> print_string;
24+
[%expect {| with or_pattern: b+ |}];
25+
Sedlexing.Utf8.from_string "" |> next_tok |> print_string;
26+
[%expect {| with or_pattern: b÷ |}];
27+
Sedlexing.Utf8.from_string "ÀÀÀ" |> next_tok |> print_string;
28+
[%expect {| mixed encoding: ÀÀÀ |}];
29+
Sedlexing.Utf8.from_string "az\x7f"
30+
|> next_tok |> String.escaped |> print_string;
31+
[%expect {| only ascii: az\127 |}];
32+
Sedlexing.Utf8.from_string "az\u{c0}" |> next_tok |> print_string;
33+
[%expect {| only latin1: azÀ |}]

0 commit comments

Comments
 (0)