Skip to content

Commit a46ba91

Browse files
committed
mikmatch syntax support v0
- almost all features supported, missing backreferences, lookaheads, `#` - surrounding the mikmatch match expressions by `{||}` should do the trick - lexing/parsing error reports - type conversions using `:`
1 parent 6e798ad commit a46ba91

File tree

10 files changed

+600
-78
lines changed

10 files changed

+600
-78
lines changed

common/mik_lexer.mll

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
{
2+
open Mik_parser
3+
exception Error of string
4+
5+
let new_line lexbuf =
6+
Lexing.new_line lexbuf
7+
8+
let predefined_classes = [
9+
("lower", {|[a-z]|});
10+
("upper", {|[A-Z]|});
11+
("alpha", {|[a-zA-Z]|});
12+
("digit", {|[0-9]|});
13+
("alnum", {|[a-zA-Z0-9]|});
14+
("punct", {|[!-/:-@\\[-`{-~]|});
15+
(* ("graph", {|[!-~]|}); *)
16+
(* ("print", {|[!-~]|}); *)
17+
("blank", {|[ \t]|});
18+
("cntrl", {|[[:cntrl:]]|});
19+
("xdigit", {|[0-9A-Fa-f]|});
20+
("space", {|[[:space:]]|});
21+
("word", {|[a-zA-Z0-9_]|});
22+
("eos", {|$|});
23+
("eol", {|$|[\n]|});
24+
("bos", {|^|});
25+
("any", {|.|});
26+
]
27+
28+
let escape_char = function
29+
| 'n' -> '\n'
30+
| 't' -> '\t'
31+
| 'r' -> '\r'
32+
| 'b' -> '\b'
33+
| '\\' -> '\\'
34+
| '\'' -> '\''
35+
| '"' -> '"'
36+
| c -> c
37+
38+
let escape_special = function
39+
| '(' -> {|\(|}
40+
| ')' -> {|\)|}
41+
| '[' -> {|\[|}
42+
| ']' -> {|\]|}
43+
| '{' -> {|\{|}
44+
| '}' -> {|\}|}
45+
| '.' -> {|\.|}
46+
| '*' -> {|\*|}
47+
| '+' -> {|\+|}
48+
| '?' -> {|\?|}
49+
| '^' -> {|\^|}
50+
| '$' -> {|\$|}
51+
| '|' -> {|\||}
52+
| c -> String.make 1 c
53+
}
54+
55+
let whitespace = [' ' '\t' '\r']
56+
let lowercase = ['a'-'z']
57+
let uppercase = ['A'-'Z']
58+
let alpha = lowercase | uppercase
59+
let digit = ['0'-'9']
60+
let ident = (alpha | '_') (alpha | digit | '_' | '\'')*
61+
62+
rule token = parse
63+
| [' ' '\t' '\r']+ { token lexbuf }
64+
| '\n' { new_line lexbuf; token lexbuf }
65+
| '/' { SLASH }
66+
| '(' { LPAREN }
67+
| ')' { RPAREN }
68+
| '[' { LBRACKET }
69+
| ']' { RBRACKET }
70+
| '^' { CARET }
71+
| '{' { LBRACE }
72+
| '}' { RBRACE }
73+
| '-' { DASH }
74+
| '|' { BAR }
75+
| '*' { STAR }
76+
| '+' { PLUS }
77+
| '?' { QUESTION }
78+
| '_' { UNDERSCORE }
79+
| ':' { COLON }
80+
| "as" { AS }
81+
| "int" { INT_CONVERTER }
82+
| "float" { FLOAT_CONVERTER }
83+
| digit+ as n { INT (int_of_string n) }
84+
| ident as id {
85+
match List.assoc_opt id predefined_classes with
86+
| Some pcre_class -> PREDEFINED_CLASS pcre_class
87+
| None -> IDENT id
88+
}
89+
| '\'' { char_literal (Buffer.create 16) lexbuf }
90+
| '"' { string_literal (Buffer.create 16) lexbuf }
91+
| eof { EOF }
92+
| _ as c { raise (Error ("Unexpected character: " ^ String.make 1 c)) }
93+
94+
and char_literal buf = parse
95+
| '\\' (_ as c) {
96+
Buffer.add_char buf (escape_char c);
97+
char_literal buf lexbuf
98+
}
99+
| '\'' { CHAR_LITERAL (Buffer.contents buf) }
100+
| ('(' as c) | (')' as c) {
101+
Buffer.add_string buf (escape_special c);
102+
char_literal buf lexbuf
103+
}
104+
| ('{' as c) | ('}' as c) {
105+
Buffer.add_string buf (escape_special c);
106+
char_literal buf lexbuf
107+
}
108+
| ('[' as c) | (']' as c) {
109+
Buffer.add_string buf (escape_special c);
110+
char_literal buf lexbuf
111+
}
112+
| ('.' as c) | ('*' as c) | ('+' as c) | ('?' as c) | ('^' as c) | ('$' as c) | ('|' as c) {
113+
Buffer.add_string buf (escape_special c);
114+
char_literal buf lexbuf
115+
}
116+
| _ as c {
117+
Buffer.add_char buf c;
118+
char_literal buf lexbuf
119+
}
120+
| eof { raise (Error "Unterminated character literal") }
121+
122+
and string_literal buf = parse
123+
| '\\' (_ as c) {
124+
Buffer.add_char buf (escape_char c);
125+
string_literal buf lexbuf
126+
}
127+
| '"' { STRING_LITERAL (Buffer.contents buf) }
128+
| _ as c {
129+
Buffer.add_char buf c;
130+
string_literal buf lexbuf
131+
}
132+
| eof { raise (Error "Unterminated string literal") }

common/mik_parser.mly

Lines changed: 234 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,234 @@
1+
%{
2+
open Regexp_types
3+
4+
let nonepsilon = function { Location.txt = Seq []; _ } -> false | _ -> true
5+
6+
let simplify_seq ~loc es =
7+
match List.filter nonepsilon es with
8+
| [ e ] -> e
9+
| es -> mkloc (Seq es) loc
10+
11+
let simplify_alt es =
12+
match es with
13+
| [ e ] -> e.Location.txt
14+
| es -> Alt es
15+
16+
let make_loc startpos endpos =
17+
Location.{
18+
loc_start = startpos;
19+
loc_end = endpos;
20+
loc_ghost = false;
21+
}
22+
23+
let wrap_loc startpos endpos x =
24+
Location.{
25+
txt = x;
26+
loc = make_loc startpos endpos;
27+
}
28+
29+
let to_pcre_regex str startpos endpos =
30+
wrap_loc startpos endpos (Code str)
31+
32+
let syntax_error msg startpos endpos =
33+
let loc = make_loc startpos endpos in
34+
raise @@ PError (loc, msg)
35+
36+
let missing_error what startpos endpos =
37+
syntax_error (Printf.sprintf "Missing %s" what) startpos endpos
38+
39+
let unclosed_error what startpos endpos =
40+
syntax_error (Printf.sprintf "Unclosed %s" what) startpos endpos
41+
%}
42+
43+
%token <string> CHAR_LITERAL STRING_LITERAL IDENT PREDEFINED_CLASS
44+
%token <int> INT
45+
%token SLASH LPAREN RPAREN LBRACKET RBRACKET CARET LBRACE RBRACE
46+
%token DASH BAR STAR PLUS QUESTION UNDERSCORE COLON AS
47+
%token INT_CONVERTER FLOAT_CONVERTER EOF
48+
49+
%start <string t> main
50+
%start <string t> pattern
51+
52+
/* operator precedence from lowest to highest */
53+
%right AS /* priority 3: capture */
54+
%left BAR /* priority 2: alternation */
55+
%left seq /* priority 1: sequence (implicit) */
56+
%nonassoc STAR PLUS QUESTION LBRACE /* priority 0: repetition */
57+
58+
%%
59+
60+
main:
61+
| SLASH p = pattern SLASH EOF { p }
62+
| SLASH pattern EOF { unclosed_error "pattern (missing closing '/')" $startpos($1) $endpos }
63+
| SLASH error { syntax_error "Invalid pattern after opening slash" $startpos($2) $endpos($2) }
64+
| error { syntax_error "Expected pattern to start with '/'" $startpos($1) $endpos($1) }
65+
66+
pattern:
67+
| alt_expr { $1 }
68+
| { missing_error "pattern expression" $startpos $endpos }
69+
70+
alt_expr:
71+
| seq_expr { $1 }
72+
| seq_expr BAR alt_expr {
73+
let loc = make_loc $startpos $endpos in
74+
mkloc (simplify_alt [$1; $3]) loc
75+
}
76+
| seq_expr BAR { missing_error "expression after '|'" $startpos($2) $endpos }
77+
| BAR { missing_error "expression before '|'" $startpos $endpos }
78+
79+
seq_expr:
80+
| atom_expr { $1 }
81+
| atom_expr seq_expr %prec seq {
82+
let loc = make_loc $startpos $endpos in
83+
simplify_seq ~loc [$1; $2]
84+
}
85+
86+
atom_expr:
87+
| basic_atom { $1 }
88+
| basic_atom STAR {
89+
let repeat_loc = wrap_loc $startpos($2) $endpos($2) (0, None) in
90+
wrap_loc $startpos $endpos (Repeat (repeat_loc, $1))
91+
}
92+
| basic_atom PLUS {
93+
let repeat_loc = wrap_loc $startpos($2) $endpos($2) (1, None) in
94+
wrap_loc $startpos $endpos (Repeat (repeat_loc, $1))
95+
}
96+
| basic_atom QUESTION {
97+
wrap_loc $startpos $endpos (Opt $1)
98+
}
99+
| basic_atom LBRACE n = INT RBRACE {
100+
let repeat_loc = wrap_loc $startpos($2) $endpos($4) (n, Some n) in
101+
wrap_loc $startpos $endpos (Repeat (repeat_loc, $1))
102+
}
103+
| basic_atom LBRACE min = INT DASH max = INT RBRACE {
104+
let repeat_loc = wrap_loc $startpos($2) $endpos($6) (min, Some max) in
105+
wrap_loc $startpos $endpos (Repeat (repeat_loc, $1))
106+
}
107+
| basic_atom LBRACE min = INT DASH RBRACE {
108+
let repeat_loc = wrap_loc $startpos($2) $endpos($5) (min, None) in
109+
wrap_loc $startpos $endpos (Repeat (repeat_loc, $1))
110+
}
111+
(* error cases for repetition *)
112+
| basic_atom LBRACE RBRACE { missing_error "repetition count" $startpos($2) $endpos($3) }
113+
| basic_atom LBRACE INT DASH DASH { syntax_error "Invalid repetition range" $startpos($4) $endpos($5) }
114+
| basic_atom LBRACE INT { unclosed_error "repetition (missing '}')" $startpos($2) $endpos }
115+
| basic_atom LBRACE error { syntax_error "Invalid repetition syntax" $startpos($2) $endpos }
116+
(* error cases for missing atoms before operators *)
117+
| STAR { missing_error "expression before '*'" $startpos $endpos }
118+
| PLUS { missing_error "expression before '+'" $startpos $endpos }
119+
| QUESTION { missing_error "expression before '?'" $startpos $endpos }
120+
121+
basic_atom:
122+
| CHAR_LITERAL {
123+
to_pcre_regex $1 $startpos $endpos
124+
}
125+
| STRING_LITERAL {
126+
to_pcre_regex $1 $startpos $endpos
127+
}
128+
| UNDERSCORE {
129+
to_pcre_regex "." $startpos $endpos
130+
}
131+
| PREDEFINED_CLASS {
132+
to_pcre_regex $1 $startpos $endpos
133+
}
134+
| IDENT {
135+
let ident_loc = wrap_loc $startpos $endpos $1 in
136+
let pattern_node = to_pcre_regex $1 $startpos $endpos in
137+
wrap_loc $startpos $endpos (Unnamed_subs (ident_loc, pattern_node))
138+
}
139+
140+
| LBRACKET char_set RBRACKET {
141+
let set_str = "[" ^ $2 ^ "]" in
142+
to_pcre_regex set_str $startpos $endpos
143+
}
144+
| LBRACKET CARET char_set RBRACKET {
145+
let set_str = "[^" ^ $3 ^ "]" in
146+
to_pcre_regex set_str $startpos $endpos
147+
}
148+
| LBRACKET RBRACKET { missing_error "character set content" $startpos $endpos }
149+
| LBRACKET CARET RBRACKET { missing_error "character set content after '^'" $startpos $endpos }
150+
| LBRACKET char_set { unclosed_error "character set (missing ']')" $startpos($1) $endpos }
151+
| LBRACKET error { syntax_error "Invalid character set" $startpos $endpos }
152+
153+
| LPAREN pattern RPAREN {
154+
$2
155+
}
156+
| LPAREN RPAREN { missing_error "pattern inside parentheses" $startpos $endpos }
157+
| LPAREN pattern EOF? { unclosed_error "parentheses (missing ')')" $startpos($1) $endpos($2) }
158+
159+
| LPAREN IDENT RPAREN {
160+
let ident_loc = wrap_loc $startpos($2) $endpos($2) $2 in
161+
let pattern_node = to_pcre_regex $2 $startpos($2) $endpos($2) in
162+
wrap_loc $startpos $endpos (Named_subs (ident_loc, None, None, pattern_node))
163+
}
164+
| LPAREN IDENT AS RPAREN { missing_error "name after 'as'" $startpos($3) $endpos($4) }
165+
| LPAREN IDENT AS name = IDENT RPAREN {
166+
let ident_loc = wrap_loc $startpos($2) $endpos($2) $2 in
167+
let name_loc = wrap_loc $startpos(name) $endpos(name) name in
168+
let pattern_node = to_pcre_regex $2 $startpos($2) $endpos($2) in
169+
wrap_loc $startpos $endpos (Named_subs (ident_loc, Some name_loc, None, pattern_node))
170+
}
171+
| LPAREN IDENT AS IDENT COLON RPAREN {
172+
missing_error "type converter after ':'" $startpos($5) $endpos($6)
173+
}
174+
| LPAREN IDENT AS name = IDENT COLON INT_CONVERTER RPAREN {
175+
let ident_loc = wrap_loc $startpos($2) $endpos($2) $2 in
176+
let name_loc = wrap_loc $startpos(name) $endpos(name) name in
177+
let pattern_node = to_pcre_regex $2 $startpos($2) $endpos($2) in
178+
wrap_loc $startpos $endpos (Named_subs (ident_loc, Some name_loc, Some Int, pattern_node))
179+
}
180+
| LPAREN IDENT AS name = IDENT COLON FLOAT_CONVERTER RPAREN {
181+
let ident_loc = wrap_loc $startpos($2) $endpos($2) $2 in
182+
let name_loc = wrap_loc $startpos(name) $endpos(name) name in
183+
let pattern_node = to_pcre_regex $2 $startpos($2) $endpos($2) in
184+
wrap_loc $startpos $endpos (Named_subs (ident_loc, Some name_loc, Some Float, pattern_node))
185+
}
186+
| LPAREN IDENT AS IDENT EOF? {
187+
unclosed_error "parentheses (missing ')')" $startpos($1) $endpos($4)
188+
}
189+
| LPAREN IDENT AS IDENT COLON INT_CONVERTER EOF? {
190+
unclosed_error "parentheses (missing ')')" $startpos($1) $endpos($6)
191+
}
192+
| LPAREN IDENT AS IDENT COLON FLOAT_CONVERTER EOF? {
193+
unclosed_error "parentheses (missing ')')" $startpos($1) $endpos($6)
194+
}
195+
196+
| LPAREN pattern AS RPAREN { missing_error "capture name after 'as'" $startpos($3) $endpos($4) }
197+
| LPAREN pattern AS name = IDENT RPAREN {
198+
let name_loc = wrap_loc $startpos(name) $endpos(name) name in
199+
wrap_loc $startpos $endpos (Capture_as (name_loc, None, $2))
200+
}
201+
| LPAREN pattern AS IDENT COLON RPAREN {
202+
missing_error "type converter after ':'" $startpos($5) $endpos($6)
203+
}
204+
| LPAREN pattern AS name = IDENT COLON INT_CONVERTER RPAREN {
205+
let name_loc = wrap_loc $startpos(name) $endpos(name) name in
206+
wrap_loc $startpos $endpos (Capture_as (name_loc, Some Int, $2))
207+
}
208+
| LPAREN pattern AS name = IDENT COLON FLOAT_CONVERTER RPAREN {
209+
let name_loc = wrap_loc $startpos(name) $endpos(name) name in
210+
wrap_loc $startpos $endpos (Capture_as (name_loc, Some Float, $2))
211+
}
212+
| LPAREN pattern AS IDENT EOF? {
213+
unclosed_error "parentheses (missing ')')" $startpos($1) $endpos($4)
214+
}
215+
| LPAREN pattern AS IDENT COLON INT_CONVERTER EOF? {
216+
unclosed_error "parentheses (missing ')')" $startpos($1) $endpos($6)
217+
}
218+
| LPAREN pattern AS IDENT COLON FLOAT_CONVERTER EOF? {
219+
unclosed_error "parentheses (missing ')')" $startpos($1) $endpos($6)
220+
}
221+
222+
| LPAREN error { syntax_error "Invalid expression in parentheses" $startpos($2) $endpos }
223+
224+
char_set:
225+
| char_set_item { $1 }
226+
| char_set_item char_set { $1 ^ $2 }
227+
228+
char_set_item:
229+
| CHAR_LITERAL { $1 }
230+
| CHAR_LITERAL DASH CHAR_LITERAL { $1 ^ "-" ^ $3 }
231+
| CHAR_LITERAL DASH { missing_error "character after '-' in range" $startpos($2) $endpos }
232+
| STRING_LITERAL { $1 }
233+
| PREDEFINED_CLASS { $1 }
234+
| IDENT { $1 }

0 commit comments

Comments
 (0)