Skip to content

Commit 2d466b1

Browse files
committed
Fast path for ASCII
1 parent 092bf06 commit 2d466b1

File tree

6 files changed

+69
-11
lines changed

6 files changed

+69
-11
lines changed

src/frontend/Unicode.ml

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,31 @@ let pp_uchar ppf u =
88
if u_int < 128 then Fmt.string ppf (Char.chr u_int |> Char.escaped)
99
else Fmt.pf ppf "U+%04X" u_int
1010

11+
(** [is_ascii s] is [true] iff every byte of [s] is strictly below 128.
    The empty string is considered ASCII.

    The string is scanned in place: no intermediate [Bytes] copy is made,
    so this check does not allocate. *)
let is_ascii s = String.for_all (fun c -> Char.code c < 128) s
18+
19+
(** [validate_ascii_id ~loc id] logs [id] through [Debugging.lexer_logger]
    and returns it unchanged when its first character is an ASCII letter
    ([A]-[Z] or [a]-[z]). Otherwise it reports ["Invalid character found."]
    at [loc] through [error].

    Only the first character is inspected here. An empty [id] is sent to
    the same [error] path instead of failing with [Invalid_argument] on the
    out-of-bounds read of index 0. *)
let validate_ascii_id ~loc id =
  Debugging.lexer_logger ("ascii id: " ^ id);
  let starts_with_letter =
    (* Length guard first: [id.[0]] would raise on the empty string. *)
    String.length id > 0
    && (match id.[0] with 'A' .. 'Z' | 'a' .. 'z' -> true | _ -> false) in
  if starts_with_letter then id else error ~loc "Invalid character found."
27+
1128
(* Validation based on the
1229
Unicode Standard Annex #31: Unicode Identifiers and Syntax
1330
https://www.unicode.org/reports/tr31 *)
1431

15-
let validate_identifier loc id =
16-
(* sanity check *)
32+
let validate_utf8_id ~loc id =
1733
if not (String.is_valid_utf_8 id) then
18-
error "Identifier is not valid UTF-8 string" ~loc;
34+
error ~loc "Identifier is not valid UTF-8 string";
35+
Debugging.lexer_logger ("unicode id: " ^ id);
1936
(* normalize to NFKC as recommended *)
2037
let id = Uunf_string.normalize_utf_8 `NFKC id in
2138
let out = Buffer.create 24 in
@@ -40,6 +57,9 @@ let validate_identifier loc id =
4057
let res_id = Buffer.contents out in
4158
(if not (String.equal res_id id) then
4259
Core.(
43-
Common.FatalError.fatal_error_msg
60+
Common.ICE.internal_compiler_error
4461
[%message "Failed to properly encode id during lexing!" (id : string)]));
4562
id
63+
64+
(** [validate_identifier loc id] validates the identifier [id] found at
    [loc]. Identifiers made only of ASCII bytes go through
    [validate_ascii_id]; all others go through [validate_utf8_id]. *)
let validate_identifier loc id =
  match is_ascii id with
  | true -> validate_ascii_id ~loc id
  | false -> validate_utf8_id ~loc id

test/integration/bad/lang/stanc.expected

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -533,6 +533,18 @@ Semantic error in 'tparam_tuple_int.stan', line 2, column 2 to column 21:
533533
-------------------------------------------------
534534

535535
(Transformed) Parameters cannot be integers.
536+
[exit 1]
537+
$ ../../../../../install/default/bin/stanc unicode_normalization.stan
538+
Semantic error in 'unicode_normalization.stan', line 4, column 7 to column 13:
539+
-------------------------------------------------
540+
2: real ñabc;
541+
3: // this is a different encoding than above, should be prevented still!
542+
4: real ñabc;
543+
^
544+
5: }
545+
-------------------------------------------------
546+
547+
Identifier 'ñabc' is already in use.
536548
[exit 1]
537549
$ ../../../../../install/default/bin/stanc unterminated_comment.stan
538550
Syntax error in 'unterminated_comment.stan', line 4, column -1, lexing error:

test/integration/bad/new/stanc.expected

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1450,7 +1450,7 @@ Syntax error in 'lexing_error.stan', line 1, column 6, lexing error:
14501450
^
14511451
-------------------------------------------------
14521452

1453-
Invalid character found.
1453+
Invalid character: '1'
14541454
[exit 1]
14551455
$ ../../../../../install/default/bin/stanc location-scale-bad1.stan
14561456
Syntax error in 'location-scale-bad1.stan', line 1, column 27 to column 32, parsing error:

test/integration/bad/numeric-literal/stanc.expected

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,20 @@
11
$ ../../../../../install/default/bin/stanc int-bad1.stan
2-
Syntax error in 'int-bad1.stan', line 2, column 17, lexing error:
2+
Syntax error in 'int-bad1.stan', line 2, column 11, lexing error:
33
-------------------------------------------------
44
1: transformed data {
55
2: int n = 10_000_;
6-
^
6+
^
77
3: }
88
-------------------------------------------------
99

1010
Invalid character found.
1111
[exit 1]
1212
$ ../../../../../install/default/bin/stanc int-bad2.stan
13-
Syntax error in 'int-bad2.stan', line 2, column 13, lexing error:
13+
Syntax error in 'int-bad2.stan', line 2, column 11, lexing error:
1414
-------------------------------------------------
1515
1: transformed data {
1616
2: int n = 10__000;
17-
^
17+
^
1818
3: }
1919
-------------------------------------------------
2020

@@ -65,11 +65,11 @@ Syntax error in 'real-bad1.stan', line 2, column 12, lexing error:
6565
Invalid character found.
6666
[exit 1]
6767
$ ../../../../../install/default/bin/stanc real-bad2.stan
68-
Syntax error in 'real-bad2.stan', line 2, column 14, lexing error:
68+
Syntax error in 'real-bad2.stan', line 2, column 12, lexing error:
6969
-------------------------------------------------
7070
1: transformed data {
7171
2: real x = 12_.345;
72-
^
72+
^
7373
3: }
7474
-------------------------------------------------
7575

test/integration/cli-args/debug-flags.t/run.t

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ Flags not used elsewhere in the tests
1717
Lexer: space
1818
Lexer: identifier N
1919
{fname=basic.stan; line=2}
20+
Lexer: ascii id: N
2021
Lexer: ;
2122
Lexer: newline
2223
{fname=basic.stan; line=2}
@@ -26,6 +27,7 @@ Flags not used elsewhere in the tests
2627
Lexer: [
2728
Lexer: identifier N
2829
{fname=basic.stan; line=3}
30+
Lexer: ascii id: N
2931
Lexer: ]
3032
Lexer: space
3133
Lexer: int
@@ -42,6 +44,7 @@ Flags not used elsewhere in the tests
4244
Lexer: space
4345
Lexer: identifier y
4446
{fname=basic.stan; line=3}
47+
Lexer: ascii id: y
4548
Lexer: ;
4649
Lexer: newline
4750
{fname=basic.stan; line=3}
@@ -69,6 +72,7 @@ Flags not used elsewhere in the tests
6972
Lexer: space
7073
Lexer: identifier theta
7174
{fname=basic.stan; line=6}
75+
Lexer: ascii id: theta
7276
Lexer: ;
7377
Lexer: newline
7478
{fname=basic.stan; line=6}
@@ -84,11 +88,13 @@ Flags not used elsewhere in the tests
8488
Lexer: space
8589
Lexer: identifier theta
8690
{fname=basic.stan; line=9}
91+
Lexer: ascii id: theta
8792
Lexer: space
8893
Lexer: ~
8994
Lexer: space
9095
Lexer: identifier beta
9196
{fname=basic.stan; line=9}
97+
Lexer: ascii id: beta
9298
Lexer: (
9399
Lexer: int_constant 1
94100
Lexer: ,
@@ -103,14 +109,17 @@ Flags not used elsewhere in the tests
103109
Lexer: space
104110
Lexer: identifier y
105111
{fname=basic.stan; line=10}
112+
Lexer: ascii id: y
106113
Lexer: space
107114
Lexer: ~
108115
Lexer: space
109116
Lexer: identifier bernoulli
110117
{fname=basic.stan; line=10}
118+
Lexer: ascii id: bernoulli
111119
Lexer: (
112120
Lexer: identifier theta
113121
{fname=basic.stan; line=10}
122+
Lexer: ascii id: theta
114123
Lexer: )
115124
Lexer: ;
116125
Lexer: newline

test/integration/good/lang/pretty.expected

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,20 @@
1+
$ ../../../../../install/default/bin/stanc --auto-format basic_unicode.stan
2+
data {
3+
int<lower=0> J; // number of schools
4+
array[J] real y; // estimated treatment effect (school j)
5+
array[J] real<lower=0> σ; // std err of effect estimate (school j)
6+
}
7+
parameters {
8+
real μ;
9+
array[J] real θ;
10+
real<lower=0> τ;
11+
}
12+
model {
13+
θ ~ normal(μ, τ);
14+
y ~ normal(θ, σ);
15+
}
16+
17+
[exit 0]
118
$ ../../../../../install/default/bin/stanc --auto-format exit_model.stan
219
data {
320

0 commit comments

Comments
 (0)