|
| 1 | +module CSTParser |
| 2 | +global debug = true |
| 3 | + |
| 4 | +using Tokenize |
| 5 | +import Base: length, first, last, getindex, setindex! |
| 6 | +import Tokenize.Tokens |
| 7 | +import Tokenize.Tokens: RawToken, AbstractToken, iskeyword, isliteral, isoperator, untokenize |
| 8 | +import Tokenize.Lexers: Lexer, peekchar, iswhitespace |
| 9 | + |
| 10 | +export ParseState, parse_expression |
| 11 | + |
| 12 | +include("lexer.jl") |
| 13 | +include("spec.jl") |
| 14 | +include("utils.jl") |
| 15 | +include("recovery.jl") |
| 16 | +include("components/internals.jl") |
| 17 | +include("components/keywords.jl") |
| 18 | +include("components/lists.jl") |
| 19 | +include("components/operators.jl") |
| 20 | +include("components/strings.jl") |
| 21 | +include("location.jl") |
| 22 | +include("conversion.jl") |
| 23 | +include("display.jl") |
| 24 | +include("interface.jl") |
| 25 | + |
| 26 | + |
"""
    parse_expression(ps)

Parses an expression until `closer(ps) == true`. Expects to enter with the
`ParseState` on the token before the beginning of the expression and ends
on the last token of the expression.

Acceptable starting tokens are:
+ A keyword (other than `do`).
+ An opening parenthesis, square bracket or brace.
+ An operator.
+ An instance (e.g. identifier, number, etc.)
+ An `@`.

If the upcoming token is a comma, or a member of `term_c` other than `end`,
an `Error` is pushed onto `ps.errors`, the token is consumed and wrapped in an
`ErrorToken`, and no compound parsing is attempted. Any other unexpected
starting token is likewise recorded and wrapped, after which compound parsing
continues as normal.
"""
@addctx :expr function parse_expression(ps::ParseState)
    if ps.nt.kind == Tokens.COMMA
        # The offending token has not been consumed yet, so it is described
        # through `ps.nt` / `ps.nws`.
        push!(
            ps.errors,
            Error((ps.nt.startbyte:ps.nws.endbyte) .+ 1, "Expression began with a comma."),
        )
        ret = ErrorToken(PUNCTUATION(next(ps)))
    elseif ps.nt.kind ∈ term_c && ps.nt.kind != Tokens.END
        push!(
            ps.errors,
            Error(
                (ps.nt.startbyte:ps.nws.endbyte) .+ 1,
                "Expression began with a terminal token: $(ps.nt.kind).",
            ),
        )
        ret = ErrorToken(INSTANCE(next(ps)))
    else
        next(ps)
        if iskeyword(ps.t.kind) && ps.t.kind != Tokens.DO
            ret = parse_kw(ps)
        elseif ps.t.kind == Tokens.LPAREN
            ret = parse_paren(ps)
        elseif ps.t.kind == Tokens.LSQUARE
            ret = @default ps parse_array(ps)
        elseif ps.t.kind == Tokens.LBRACE
            ret = @default ps @closebrace ps parse_braces(ps)
        elseif isinstance(ps.t) || isoperator(ps.t)
            # A `where` token reaching this branch is kept as a plain identifier.
            if ps.t.kind == Tokens.WHERE
                ret = IDENTIFIER(ps)
            else
                ret = INSTANCE(ps)
            end
            # A leading colon that is not followed by a comma is handed to
            # the unary-operator parser.
            if is_colon(ret) && ps.nt.kind != Tokens.COMMA
                ret = parse_unary(ps, ret)
            end
        elseif ps.t.kind == Tokens.AT_SIGN
            ret = parse_macrocall(ps)
        else
            ret = ErrorToken(INSTANCE(ps))
            # `next(ps)` has already run, so the token that began the
            # expression is `ps.t` (with trailing whitespace `ps.ws`), not `ps.nt`.
            push!(
                ps.errors,
                Error(
                    (ps.t.startbyte:ps.ws.endbyte) .+ 1,
                    "Expression began with a : $(ps.t.kind).",
                ),
            )
        end

        # Keep extending the expression until the current context says stop.
        while !closer(ps)
            ret = parse_compound(ps, ret)
        end
    end
    return ret
end
| 81 | + |
"""
    parse_compound(ps, ret)

Extends the already parsed expression `ret` by one step, chosen from the
upcoming token `ps.nt` (and, for some forms, the whitespace `ps.ws` that
follows the current token). Returns the extended expression. Combinations
that match no valid form push an `Error` onto `ps.errors` and return an
`EXPR{ErrorToken}`.
"""
function parse_compound(ps::ParseState, @nospecialize ret)
    if ps.nt.kind == Tokens.FOR
        return parse_generator(ps, ret)
    elseif ps.nt.kind == Tokens.DO
        return @default ps @closer ps block parse_do(ps, ret)
    elseif isajuxtaposition(ps, ret)
        # Juxtaposition is parsed as multiplication with a zero-width `*`.
        return parse_operator(ps, ret, OPERATOR(0, 0, Tokens.STAR, false))
    elseif (ret isa EXPR{x_Str} || ret isa EXPR{x_Cmd}) && ps.nt.kind == Tokens.IDENTIFIER
        # An identifier directly after a prefixed string/command is appended
        # to that expression as a string literal.
        suffix = IDENTIFIER(next(ps))
        push!(ret, LITERAL(suffix.fullspan, suffix.span, val(ps.t, ps), Tokens.STRING))
        return ret
    elseif (ret isa IDENTIFIER || (ret isa BinarySyntaxOpCall && is_dot(ret.op))) &&
           (ps.nt.kind == Tokens.STRING ||
            ps.nt.kind == Tokens.TRIPLE_STRING ||
            ps.nt.kind == Tokens.CMD)
        # Prefixed string or command: `ret` is the prefix.
        next(ps)
        literal = parse_string_or_cmd(ps, ret)
        head = literal.kind == Tokens.CMD ? x_Cmd : x_Str
        return EXPR{head}(Any[ret, literal])
    elseif ps.nt.kind == Tokens.LPAREN
        # Capture the whitespace state and error range before the call is
        # parsed, since parsing advances `ps`.
        has_ws = !isemptyws(ps.ws)
        ws_range = (ps.t.endbyte + 2):ps.nt.startbyte
        ret = @closeparen ps parse_call(ps, ret)
        if has_ws && !(ret isa UnaryOpCall) && !(ret isa UnarySyntaxOpCall)
            push!(ps.errors, Error(ws_range, "White space in function call."))
            return ErrorToken(ret)
        end
        return ret
    elseif ps.nt.kind == Tokens.LBRACE
        has_ws = !isemptyws(ps.ws)
        if has_ws
            push!(
                ps.errors,
                Error((ps.t.endbyte + 2):ps.nt.startbyte, "White space in brace call."),
            )
        end
        curly = @default ps @nocloser ps inwhere @closebrace ps parse_curly(ps, ret)
        return has_ws ? ErrorToken(curly) : curly
    elseif ps.nt.kind == Tokens.LSQUARE && isemptyws(ps.ws) && !(ret isa OPERATOR)
        return @default ps @nocloser ps block parse_ref(ps, ret)
    elseif ps.nt.kind == Tokens.COMMA
        return parse_tuple(ps, ret)
    elseif isunaryop(ret) && ps.nt.kind != Tokens.EQ
        return parse_unary(ps, ret)
    elseif isoperator(ps.nt)
        binop = OPERATOR(next(ps))
        return parse_operator(ps, ret, binop)
    elseif ret isa UnarySyntaxOpCall && is_prime(ret.arg2)
        # prime operator followed by an identifier has an implicit multiplication
        rhs = @precedence ps 11 parse_expression(ps)
        return BinaryOpCall(ret, OPERATOR(0, 0, Tokens.STAR, false), rhs)
    end

    ############################################################################
    # Everything below here is an error
    ############################################################################
    push!(
        ps.errors,
        Error((ps.t.startbyte:ps.nt.endbyte) .+ 1, "Disallowed compound expression."),
    )
    if ps.nt.kind in (Tokens.RPAREN, Tokens.RSQUARE, Tokens.RBRACE)
        # A stray closing bracket is consumed and kept as an error token.
        return EXPR{ErrorToken}([ret, ErrorToken(PUNCTUATION(next(ps)))])
    end
    trailing = parse_expression(ps)
    return EXPR{ErrorToken}([ret, trailing])
end
| 139 | + |
"""
    parse_paren(ps)

Parses an expression starting with a `(`. Expects the current token of `ps`
to be the opening parenthesis.

Returns an `EXPR{InvisBrackets}` when exactly one item was parsed between the
parentheses, that item is not an `EXPR{Parameters}`, and either the whitespace
after the current token is not of kind `SemiColonWS` or the item is an
`EXPR{Block}`. Otherwise returns an `EXPR{TupleH}`.
"""
@addctx :paren function parse_paren(ps::ParseState)
    args = Any[PUNCTUATION(ps)]
    @closeparen ps @default ps @nocloser ps inwhere parse_comma_sep(ps, args, false, true, true)

    # Decide the head before `accept_rparen` runs: the test reads `ps.ws` and
    # the length of `args` as they are right after the contents were parsed.
    is_brackets = length(args) == 2 &&
                  (ps.ws.kind != SemiColonWS || args[2] isa EXPR{Block}) &&
                  !(args[2] isa EXPR{Parameters})

    accept_rparen(ps, args)
    return is_brackets ? EXPR{InvisBrackets}(args) : EXPR{TupleH}(args)
end
| 158 | + |
"""
    parse(str, cont = false)

Parses the string `str` and returns the resulting expression.

A fresh `ParseState` is built from `str` and handed, together with `cont`, to
`parse(ps::ParseState, cont)`; only the expression part of that method's
result is returned. When `cont` is true parsing continues until the end of
the string and the expressions are returned together in a single container
expression.
"""
function parse(str::String, cont = false)
    state = ParseState(str)
    result, _ = parse(state, cont)
    return result
end
| 169 | + |
"""
    parse_doc(ps)

Parses the next expression, attaching a preceding docstring when there is one.

Two docstring forms are recognised:
+ a string or triple-quoted string literal followed by non-empty whitespace;
+ the identifier `doc` immediately followed by a string literal.

In both cases the result is an `EXPR{MacroCall}` holding `GlobalRefDOC`, the
docstring and the documented expression. A string literal that is followed by
the end marker or `end` is returned as is, and one followed by a binary
operator (when the context is not closed) is parsed as an ordinary compound
expression. Anything else is passed straight to `parse_expression`.
"""
function parse_doc(ps::ParseState)
    upcoming = ps.nt.kind

    if (upcoming == Tokens.STRING || upcoming == Tokens.TRIPLE_STRING) && !isemptyws(ps.nws)
        docstring = LITERAL(next(ps))
        if ps.nt.kind == Tokens.ENDMARKER || ps.nt.kind == Tokens.END
            # Nothing follows to be documented: the literal stands alone.
            return docstring
        end
        if isbinaryop(ps.nt) && !closer(ps)
            # The string is the left operand of an expression, not a docstring.
            return parse_compound(ps, docstring)
        end
        documented = parse_expression(ps)
        return EXPR{MacroCall}(Any[GlobalRefDOC, docstring, documented])
    end

    if upcoming == Tokens.IDENTIFIER && val(ps.nt, ps) == "doc" &&
       (ps.nnt.kind == Tokens.STRING || ps.nnt.kind == Tokens.TRIPLE_STRING)
        prefix = IDENTIFIER(next(ps))
        next(ps)
        literal = parse_string_or_cmd(ps, prefix)
        docstring = EXPR{x_Str}(Any[prefix, literal])
        documented = parse_expression(ps)
        return EXPR{MacroCall}(Any[GlobalRefDOC, docstring, documented])
    end

    return parse_expression(ps)
end
| 194 | + |
"""
    parse(ps, cont = false)

Parses from the current position of `ps` and returns `(expr, ps)`.

With `cont == false` a single expression is parsed; if it is followed by
semicolon whitespace, further expressions on the same line are gathered with
it into an `EXPR{TopLevel}`. With `cont == true` parsing continues until
`ps.done` or `ps.errored` is set, and everything is collected in an
`EXPR{FileH}`, with semicolon-separated expressions on one line grouped into
`EXPR{TopLevel}` nodes.

For empty input the expression is `nothing` (or an empty `EXPR{FileH}` when
`cont` is true). Leading whitespace or a leading comment is represented by an
empty `LITERAL` of kind `Tokens.NOTHING`.
"""
function parse(ps::ParseState, cont = false)
    if ps.l.io.size == 0
        return (cont ? EXPR{FileH}(Any[]) : nothing), ps
    end

    if !cont
        # Single-expression mode.
        if ps.nt.kind == Tokens.WHITESPACE || ps.nt.kind == Tokens.COMMENT
            next(ps)
            return LITERAL(ps.nt.startbyte, ps.nt.startbyte, "", Tokens.NOTHING), ps
        end

        head = parse_doc(ps)
        line = ps.nt.startpos[1]
        if ps.ws.kind != SemiColonWS
            return head, ps
        end

        # Collect the remaining `;`-separated expressions on the same line.
        group = EXPR{TopLevel}(Any[head])
        while ps.ws.kind == SemiColonWS &&
              ps.nt.startpos[1] == line &&
              ps.nt.kind != Tokens.ENDMARKER
            push!(group, parse_doc(ps))
            line = ps.nt.startpos[1]
        end
        return group, ps
    end

    # Whole-input mode.
    file = EXPR{FileH}(Any[])
    if ps.nt.kind == Tokens.WHITESPACE || ps.nt.kind == Tokens.COMMENT
        next(ps)
        push!(file, LITERAL(ps.nt.startbyte, ps.nt.startbyte, "", Tokens.NOTHING))
    end

    prev_line = 0
    while !(ps.done || ps.errored)
        this_line = ps.nt.startpos[1]
        item = parse_doc(ps)

        # join semicolon sep items
        if this_line == prev_line && last(file.args) isa EXPR{TopLevel}
            push!(last(file.args), item)
        elseif ps.ws.kind == SemiColonWS
            push!(file, EXPR{TopLevel}(Any[item]))
        else
            push!(file, item)
        end
        prev_line = this_line
    end

    return file, ps
end
| 243 | + |
| 244 | + |
"""
    parse_file(path)

Reads the file at `path`, parses its whole contents (`cont = true`) and
returns a `File` holding `path` and the parsed expression; the remaining
`File` fields are initialised with empty arrays.
"""
function parse_file(path::String)
    source = read(path, String)
    tree = parse(source, true)
    return File([], [], path, tree, [])
end
| 249 | + |
"""
    parse_directory(path, proj = Project(path, []))

Recursively parses every `.jl` file under `path`, pushing the result of
`parse_file` for each one onto `proj.files`, and returns `proj`.

Parsing is best-effort: a file that fails to parse is reported with a warning
(including the exception that was raised) and skipped. An
`InterruptException` is rethrown so the walk can still be cancelled.
"""
function parse_directory(path::String, proj = Project(path, []))
    for entry in readdir(path)
        full = joinpath(path, entry)
        if isfile(full) && endswith(entry, ".jl")
            try
                push!(proj.files, parse_file(full))
            catch err
                # Do not swallow a user interrupt along with parse failures.
                err isa InterruptException && rethrow()
                @warn "$entry failed to parse" exception = (err, catch_backtrace())
            end
        elseif isdir(full)
            parse_directory(full, proj)
        end
    end
    return proj
end
| 264 | +end |
0 commit comments