Skip to content

Commit 092bf06

Browse files
committed
First pass at unicode support
1 parent 3aa50a9 commit 092bf06

File tree

7 files changed

+114
-10
lines changed

7 files changed

+114
-10
lines changed

src/frontend/Errors.ml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ open Core
44

55
(** Our type of syntax error information *)
66
type syntax_error =
7-
| Lexing of Middle.Location.t
7+
| Lexing of string * Middle.Location.t
88
| UnexpectedEOF of Middle.Location.t
99
| Include of string * Middle.Location.t
1010
| Parsing of string * Middle.Location_span.t
@@ -57,12 +57,12 @@ let pp_syntax_error ?printed_filename ?code ppf = function
5757
(Middle.Location_span.to_string ?printed_filename loc_span)
5858
(pp_context_with_message ?code)
5959
(message, loc_span.begin_loc)
60-
| Lexing loc ->
60+
| Lexing (message, loc) ->
6161
Fmt.pf ppf "Syntax error in %s, lexing error:@,%a@."
6262
(Middle.Location.to_string ?printed_filename
6363
{loc with col_num= loc.col_num - 1})
6464
(pp_context_with_message ?code)
65-
("Invalid character found.", loc)
65+
(message, loc)
6666
| UnexpectedEOF loc ->
6767
Fmt.pf ppf "Syntax error in %s, lexing error:@,%a@."
6868
(Middle.Location.to_string ?printed_filename

src/frontend/Errors.mli

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
(** Our type of syntax error information *)
44
type syntax_error =
5-
| Lexing of Middle.Location.t
5+
| Lexing of string * Middle.Location.t
66
| UnexpectedEOF of Middle.Location.t
77
| Include of string * Middle.Location.t
88
| Parsing of string * Middle.Location_span.t

src/frontend/Unicode.ml

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
let error ~loc msg =
2+
raise
3+
(Errors.SyntaxError
4+
(Errors.Lexing (msg, Preprocessor.location_of_position loc)))
5+
6+
let pp_uchar ppf u =
7+
let u_int = Uchar.to_int u in
8+
if u_int < 128 then Fmt.string ppf (Char.chr u_int |> Char.escaped)
9+
else Fmt.pf ppf "U+%04X" u_int
10+
11+
(* Validation based on the
12+
Unicode Standard Annex #31: Unicode Identifiers and Syntax
13+
https://www.unicode.org/reports/tr31 *)
14+
15+
let validate_identifier loc id =
16+
(* sanity check *)
17+
if not (String.is_valid_utf_8 id) then
18+
error "Identifier is not valid UTF-8 string" ~loc;
19+
(* normalize to NFKC as recommended *)
20+
let id = Uunf_string.normalize_utf_8 `NFKC id in
21+
let out = Buffer.create 24 in
22+
let len = String.length id in
23+
let pos = ref 0 in
24+
(* move through code point by code point *)
25+
while !pos != len do
26+
let decode = String.get_utf_8_uchar id !pos in
27+
let char_length = Uchar.utf_decode_length decode in
28+
let uchar = Uchar.utf_decode_uchar decode in
29+
Buffer.add_utf_8_uchar out uchar;
30+
match !pos with
31+
| 0 when not (Uucp.Id.is_xid_start uchar) ->
32+
error ~loc (Fmt.str "Invalid character: '%a'" pp_uchar uchar)
33+
| _ when not (Uucp.Id.is_xid_continue uchar) ->
34+
error ~loc
35+
(Fmt.str "Invalid character in identifier at offset %d: '%a'" !pos
36+
pp_uchar uchar)
37+
| _ -> pos := !pos + char_length
38+
done;
39+
(* another sanity check *)
40+
let res_id = Buffer.contents out in
41+
(if not (String.equal res_id id) then
42+
Core.(
43+
Common.FatalError.fatal_error_msg
44+
[%message "Failed to properly encode id during lexing!" (id : string)]));
45+
id

src/frontend/dune

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,20 @@
11
(library
22
(name frontend)
33
(public_name stanc.frontend)
4-
(libraries core menhirLib yojson fmt middle stan_math_signatures)
4+
(libraries core menhirLib uunf uucp yojson fmt middle stan_math_signatures)
55
(instrumentation
66
(backend bisect_ppx))
77
(inline_tests)
88
(preprocess
99
(pps ppx_jane ppx_deriving.fold ppx_deriving.map)))
1010

11-
(ocamllex lexer)
11+
(rule
12+
(target lexer.ml)
13+
(deps lexer.mll)
14+
(action
15+
(chdir
16+
%{workspace_root}
17+
(run %{bin:ocamllex} -ml -o %{target} %{deps}))))
1218

1319
(rule
1420
(targets parsing_errors.ml)

src/frontend/lexer.mll

Lines changed: 38 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,40 @@
3737
, location_span_of_positions (lexbuf.lex_start_p, lexbuf.lex_curr_p) )
3838
}
3939

40+
(*
41+
OCamllex does not know about unicode, it just operates over bytes.
42+
So, we can define all 'valid' byte sequences for UTF-8 like so
43+
*)
44+
(* 110xxxxx *)
45+
let utf8_head_byte2 = ['\192'-'\223']
46+
(* 1110xxxx *)
47+
let utf8_head_byte3 = ['\224'-'\239']
48+
(* 11110xxx *)
49+
let utf8_head_byte4 = ['\240'-'\247']
50+
(* 10xxxxxx *)
51+
let utf8_tail_byte = ['\128'-'\191']
52+
53+
(* utf8_1 is ascii *)
54+
let ascii_allowed = ['a'-'z' 'A'-'Z' '0'-'9' '_']
55+
(* 11 bits of payload *)
56+
let utf8_2 = utf8_head_byte2 utf8_tail_byte
57+
(* 16 bits of payload *)
58+
let utf8_3 = utf8_head_byte3 utf8_tail_byte utf8_tail_byte
59+
(* 21 bits of payload *)
60+
let utf8_4 = utf8_head_byte4 utf8_tail_byte utf8_tail_byte utf8_tail_byte
61+
62+
(* Any UTF-8-encoded code point, outside the ASCII range.
63+
This set includes more than it should for simplicity.
64+
*)
65+
let utf8_nonascii = utf8_2 | utf8_3 | utf8_4
66+
67+
(* identifiers here are overly permissive, and are checked
68+
in the semantic action of the rule that matches here.
69+
*)
70+
let identifier = (ascii_allowed | utf8_nonascii)+
71+
4072
(* Some auxiliary definition for variables and constants *)
4173
let string_literal = '"' [^ '"' '\r' '\n']* '"'
42-
let identifier = ['a'-'z' 'A'-'Z'] ['a'-'z' 'A'-'Z' '0'-'9' '_']* (* TODO: We should probably expand the alphabet *)
4374

4475
let integer_constant = ['0'-'9']+ ('_' ['0'-'9']+)*
4576

@@ -198,8 +229,10 @@ rule token = parse
198229
| string_literal as s { lexer_logger ("string_literal " ^ s) ;
199230
Parser.STRINGLITERAL (lexeme lexbuf) }
200231
| identifier as id { lexer_logger ("identifier " ^ id) ;
201-
lexer_pos_logger (lexeme_start_p lexbuf);
202-
Parser.IDENTIFIER (lexeme lexbuf) }
232+
let loc = (lexeme_start_p lexbuf) in
233+
lexer_pos_logger loc;
234+
let canonical_id = Unicode.validate_identifier loc id in
235+
Parser.IDENTIFIER (canonical_id) }
203236
(* End of file *)
204237
| eof { lexer_logger "eof" ;
205238
if Preprocessor.size () = 1
@@ -210,7 +243,8 @@ rule token = parse
210243

211244
| _ { raise (Errors.SyntaxError
212245
(Errors.Lexing
213-
(location_of_position
246+
("Invalid character found.",
247+
location_of_position
214248
(lexeme_start_p
215249
(current_buffer ()))))) }
216250

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
data {
2+
real ñabc;
3+
// this is a different encoding than above, should be prevented still!
4+
real ñabc;
5+
}
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
data {
2+
int<lower=0> J; // number of schools
3+
array[J] real y; // estimated treatment effect (school j)
4+
array[J] real<lower=0> σ; // std err of effect estimate (school j)
5+
}
6+
parameters {
7+
real μ;
8+
array[J] real θ;
9+
real<lower=0> τ;
10+
}
11+
model {
12+
θ ~ normal(μ, τ);
13+
y ~ normal(θ, σ);
14+
}

0 commit comments

Comments
 (0)