diff --git a/Cargo.lock b/Cargo.lock index 99978ac..4e45a5f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -140,6 +140,12 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" +[[package]] +name = "beef" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a8241f3ebb85c056b509d4327ad0358fbbba6ffb340bf388f26350aeda225b1" + [[package]] name = "bindgen" version = "0.70.1" @@ -847,6 +853,40 @@ version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" +[[package]] +name = "logos" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab6f536c1af4c7cc81edf73da1f8029896e7e1e16a219ef09b184e76a296f3db" +dependencies = [ + "logos-derive", +] + +[[package]] +name = "logos-codegen" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "189bbfd0b61330abea797e5e9276408f2edbe4f822d7ad08685d67419aafb34e" +dependencies = [ + "beef", + "fnv", + "lazy_static", + "proc-macro2", + "quote", + "regex-syntax 0.8.5", + "rustc_version", + "syn 2.0.79", +] + +[[package]] +name = "logos-derive" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebfe8e1a19049ddbfccbd14ac834b215e11b85b90bab0c2dba7c7b92fb5d5cba" +dependencies = [ + "logos-codegen", +] + [[package]] name = "loom" version = "0.7.2" @@ -951,6 +991,7 @@ name = "new-nu-parser" version = "0.1.0" dependencies = [ "insta", + "logos", "nu-parser", "nu-protocol", "tango-bench", @@ -1545,6 +1586,15 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + [[package]] name = "rustix" version = "0.38.37" @@ -1611,6 +1661,12 @@ version = "4.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b" +[[package]] +name = "semver" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" + [[package]] name = "serde" version = "1.0.210" diff --git a/Cargo.toml b/Cargo.toml index 6811fed..94b86e7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,7 @@ edition = "2021" [dependencies] tracy-client = { version = "0.17.3", default-features = false } # for tracy v0.11.1 +logos = "0.15" [profile.profiling] inherits = "release" diff --git a/benches/benchmarks.rs b/benches/benchmarks.rs index baeac59..43ca269 100644 --- a/benches/benchmarks.rs +++ b/benches/benchmarks.rs @@ -1,5 +1,6 @@ use std::process::exit; +use new_nu_parser::lexer::{lex, Tokens}; use nu_protocol::engine::{EngineState, StateWorkingSet}; use tango_bench::{benchmark_fn, tango_benchmarks, tango_main, Benchmark, IntoBenchmarks}; @@ -16,9 +17,11 @@ const BENCHMARKS: &[&str] = &[ "combined10", "combined100", "combined1000", + "int100", ]; enum Stage { + Lex, Parse, Resolve, Typecheck, @@ -30,6 +33,7 @@ enum Stage { /// Stages of compilation we want to profile const STAGES: &[Stage] = &[ + Stage::Lex, 
Stage::Parse, Stage::Resolve, Stage::Typecheck, @@ -52,8 +56,15 @@ fn setup_compiler( let contents = std::fs::read(fname).map_err(|_| format!("Cannot find file {fname}"))?; compiler.add_file(&fname, &contents); + let (tokens, err) = lex(&contents, span_offset); + if let Err(e) = err { + tokens.eprint(&compiler.source); + eprintln!("Lexing error. Error: {:?}", e); + exit(1); + } + if do_parse { - let parser = Parser::new(compiler, span_offset); + let parser = Parser::new(compiler, tokens); compiler = parser.parse(); if !compiler.errors.is_empty() { @@ -87,8 +98,8 @@ fn setup_compiler( } /// Parse only -pub fn parse(mut compiler: Compiler, span_offset: usize) { - let parser = Parser::new(compiler, span_offset); +pub fn parse(mut compiler: Compiler, tokens: Tokens) { + let parser = Parser::new(compiler, tokens); compiler = parser.parse(); if !compiler.errors.is_empty() { @@ -129,7 +140,14 @@ pub fn typecheck(mut compiler: Compiler, do_merge: bool) { /// Run all compiler stages pub fn compile(mut compiler: Compiler, span_offset: usize) { - let parser = Parser::new(compiler, span_offset); + let (tokens, err) = lex(&compiler.source, span_offset); + if let Err(e) = err { + tokens.eprint(&compiler.source); + eprintln!("Lexing error. Error: {:?}", e); + exit(1); + } + + let parser = Parser::new(compiler, tokens); compiler = parser.parse(); if !compiler.errors.is_empty() { @@ -176,13 +194,36 @@ fn compiler_benchmarks() -> impl IntoBenchmarks { let bench_file = format!("benches/nu/{bench_name}.nu"); let bench = match stage { + Stage::Lex => { + let name = format!("{bench_name}_lex"); + benchmark_fn(name, move |b| { + let contents = std::fs::read(&bench_file) + .expect(&format!("Cannot find file {bench_file}")); + b.iter(move || { + let (tokens, err) = lex(&contents, 0); + if let Err(e) = err { + tokens.eprint(&contents); + eprintln!("Lexing error. Error: {:?}", e); + exit(1); + } + }) + }) + } Stage::Parse => { let name = format!("{bench_name}_parse"); benchmark_fn(name, move |b| { let (compiler_def_init, span_offset) = setup_compiler(&bench_file, false, false, false) .expect("Error setting up compiler"); - b.iter(move || parse(compiler_def_init.clone(), span_offset)) + let contents = std::fs::read(&bench_file) + .expect(&format!("Cannot find file {bench_file}")); + let (tokens, err) = lex(&contents, span_offset); + if let Err(e) = err { + tokens.eprint(&contents); + eprintln!("Lexing error. 
Error: {:?}", e); + exit(1); + } + b.iter(move || parse(compiler_def_init.clone(), tokens.clone())) }) } Stage::Resolve => { diff --git a/benches/nu/int100.nu b/benches/nu/int100.nu new file mode 100644 index 0000000..401083e --- /dev/null +++ b/benches/nu/int100.nu @@ -0,0 +1,404 @@ +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 +1000 +0x2000 +0o3000 +0b1000 diff --git a/src/compiler.rs b/src/compiler.rs index 7a551c1..e44e9bc 100644 --- a/src/compiler.rs +++ b/src/compiler.rs @@ -10,15 +10,21 @@ pub struct RollbackPoint { idx_nodes: usize, idx_errors: usize, idx_blocks: usize, - span_offset: 
usize, + token_pos: usize, } -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone, Copy, PartialEq)] pub struct Span { pub start: usize, pub end: usize, } +impl Span { + pub fn new(start: usize, end: usize) -> Self { + Self { start, end } + } +} + #[derive(Clone)] pub struct Compiler { // Core information, indexed by NodeId: @@ -174,13 +180,13 @@ impl Compiler { NodeId(self.ast_nodes.len() - 1) } - pub fn get_rollback_point(&self, span_offset: usize) -> RollbackPoint { + pub fn get_rollback_point(&self, token_pos: usize) -> RollbackPoint { RollbackPoint { idx_span_start: self.spans.len(), idx_nodes: self.ast_nodes.len(), idx_errors: self.errors.len(), idx_blocks: self.blocks.len(), - span_offset, + token_pos, } } @@ -190,7 +196,7 @@ impl Compiler { self.errors.truncate(rbp.idx_errors); self.spans.truncate(rbp.idx_span_start); - rbp.span_offset + rbp.token_pos } /// Get span of node diff --git a/src/lexer.rs b/src/lexer.rs new file mode 100644 index 0000000..37aadab --- /dev/null +++ b/src/lexer.rs @@ -0,0 +1,325 @@ +use crate::compiler::Span; +use logos::Logos; + +/// Average number of bytes per token used for estimating the tokens buffer size. +/// +/// Estimated with this snippet: +/// let res = ls tests/**/*.nu | get name | each {|name| +/// let nbytes = open --raw $name | into binary | length +/// let ntokens = cargo run -- $name | lines | where $it starts-with 'Token3' | length +/// { +/// file: $name +/// nbytes: $nbytes +/// ntokens: $ntokens +/// bytes_per_token: ($nbytes / $ntokens) +/// } +/// } +/// +/// TODO: Use larger and more representative codebase to estimate this +const AVG_BYTES_PER_TOKEN: usize = 2; + +/// Lexed tokens +/// +/// Tokens and spans are stored in separate vectors indexed by a position index (starting at 0). +#[derive(Clone)] +pub struct Tokens { + pos: usize, + tokens: Vec, + spans: Vec, +} + +impl Tokens { + /// Create a new Tokens with allocated storage for the tokens and spans + pub fn new(source: &[u8]) -> Self { + let estimated_num_tokens = source.len() / AVG_BYTES_PER_TOKEN; + Tokens { + pos: 0, + tokens: Vec::with_capacity(estimated_num_tokens), + spans: Vec::with_capacity(estimated_num_tokens), + } + } + + // Position-related methods + + /// Advance position to point at the next token + /// + /// Note that this can potentially point beyond the tokens if called enough times. The parser + /// should correctly check for EOF and terminate without advancing further. 
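+    ///
+    /// A minimal sketch of the intended calling pattern (illustrative only; uses the
+    /// `peek_token`, `advance`, and `Token::Eof` API defined in this file):
+    ///
+    /// ```ignore
+    /// while tokens.peek_token() != Token::Eof {
+    ///     // ... consume the token at the current position ...
+    ///     tokens.advance();
+    /// }
+    /// ```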
+ pub fn advance(&mut self) { + debug_assert!(self.pos < self.tokens.len()); + self.pos += 1; + } + + /// Return current position + pub fn pos(&self) -> usize { + self.pos + } + + /// Set current position + pub fn set_pos(&mut self, pos: usize) { + self.pos = pos; + } + + // Adding and fetching tokens + + /// Push a spanned token to the internal storage + pub fn push(&mut self, token: Token, span: Span) { + self.tokens.push(token); + self.spans.push(span); + } + + /// Check the token at the current position + pub fn peek(&self) -> (Token, Span) { + (self.peek_token(), self.peek_span()) + } + + /// Same as peek() but return only the token + pub fn peek_token(&self) -> Token { + self.tokens[self.pos] + } + + /// Same as peek() but return only the span + pub fn peek_span(&self) -> Span { + self.spans[self.pos] + } + + // Printing + + /// Format the tokens into a human-readable output for debugging + pub fn display(&self, source: &[u8]) -> String { + let mut result = String::new(); + + result.push_str("==== TOKENS ====\n"); + + for (i, (token, span)) in self.tokens.iter().zip(self.spans.iter()).enumerate() { + result.push_str(&format!( + "Token3 {i:4}: {:25} span: {:4} .. {:4} '{}'\n", + format!("{:?}", token), + span.start, + span.end, + String::from_utf8_lossy( + source + .get(span.start..span.end) + .expect("missing source of token span") + ) + .replace("\r", "\\r") + .replace("\n", "\\n") + .replace("\t", "\\t") + )); + } + + result + } + + /// Print the output of display() to standard output + pub fn print(&self, source: &[u8]) { + let output = self.display(source); + print!("{output}"); + } + + /// Print the output of display() to standard error + pub fn eprint(&self, source: &[u8]) { + let output = self.display(source); + eprint!("{output}"); + } +} + +/// Lex the source contents and return allocated Tokens. +/// +/// In the case of error, you can look up the last stored token to get a clue what went wrong. The +/// last stored token is always End Of File (EOF), so there will always be at least one token. 
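+///
+/// A short usage sketch (mirroring the call sites in `main.rs` and the benchmarks;
+/// the source bytes are illustrative):
+///
+/// ```ignore
+/// let source = b"let x = 42\n";
+/// let (tokens, result) = lex(source, 0);
+/// if result.is_err() {
+///     tokens.eprint(source); // the trailing Eof token marks where lexing stopped
+/// }
+/// ```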
+pub fn lex(contents: &[u8], span_offset: usize) -> (Tokens, Result<(), ()>) {
+    // TODO: We might require the contents to always end with a newline, and return an error when they don't
+    let mut tokens = Tokens::new(contents);
+    let lexer = Token::lexer(contents).spanned();
+
+    for (res, span) in lexer {
+        match res {
+            Ok(token) => tokens.push(
+                token,
+                Span::new(span.start + span_offset, span.end + span_offset),
+            ),
+            Err(_) => {
+                tokens.push(
+                    Token::Eof,
+                    Span::new(span.end + span_offset, span.end + span_offset),
+                );
+                return (tokens, Err(()));
+            }
+        }
+    }
+
+    tokens.push(
+        Token::Eof,
+        Span::new(contents.len() + span_offset, contents.len() + span_offset),
+    );
+
+    (tokens, Ok(()))
+}
+
+#[derive(Logos, Debug, Clone, Copy, PartialEq)]
+#[logos(skip r"[ \t]+")]
+#[logos(source = [u8])]
+pub enum Token {
+    #[regex("(?:0[xob])?[0-9][0-9_]*", priority = 10)]
+    Int,
+    #[regex(r"(?:[0-9][0-9_]*)*\.([0-9][0-9_]*)*(?:[eE][+-]?[0-9_]+)?")]
+    Float,
+    #[regex("\n|\r\n|\x0C")]
+    Newline,
+    #[regex(r#""(?:[^"\\]|\\["\\bnfrt])*""#)]
+    DoubleQuotedString,
+    #[regex(r#"'[^']*'"#)]
+    SingleQuotedString,
+    #[regex(r#"`[^`]*`"#)]
+    BacktickBareword,
+    // #[regex(r#"[ \t]+"#)]
+    // HorizontalWhitespace,
+    #[regex(r#"[0-9]{4}-[0-9]{2}-[0-9]{2}(?:T[0-9]{2}:[0-9]{2}:[0-9]{2}(?:\.[0-9]+)?)?(?:Z|[\+-][0-9]{2}:[0-9]{2})?"#)]
+    Datetime,
+    #[regex(r#"#[^\n]*"#, priority = 20)]
+    Comment,
+    // lower priority to avoid clashing with Int
+    #[regex(r#"(_|[^\s[:punct:]])(#|_|[^\s[:punct:]])*"#, priority = 2)]
+    Bareword,
+    #[token("...")]
+    DotDotDot,
+    #[token("..")]
+    DotDot,
+    #[token(".", priority = 10)] // higher priority to avoid "." being tokenized as Float
+    Dot,
+    #[token("(")]
+    LParen,
+    #[token(")")]
+    RParen,
+    #[token("[")]
+    LSquare,
+    #[token("]")]
+    RSquare,
+    #[token("{")]
+    LCurly,
+    #[token("}")]
+    RCurly,
+    #[token("<=")]
+    LessThanEqual,
+    #[token("<")]
+    LessThan,
+    #[token(">=")]
+    GreaterThanEqual,
+    #[token(">")]
+    GreaterThan,
+    #[token("++")]
+    PlusPlus,
+    #[token("+=")]
+    PlusEquals,
+    #[token("+")]
+    Plus,
+    #[token("->")]
+    ThinArrow,
+    #[token("=>")]
+    ThickArrow,
+    #[token("-=")]
+    DashEquals,
+    #[token("-")]
+    Dash,
+    #[token("**")]
+    AsteriskAsterisk,
+    #[token("*=")]
+    AsteriskEquals,
+    #[token("*")]
+    Asterisk,
+    #[token("//")]
+    ForwardSlashForwardSlash,
+    #[token("/=")]
+    ForwardSlashEquals,
+    #[token("/")]
+    ForwardSlash,
+    #[token("==")]
+    EqualsEquals,
+    #[token("=~")]
+    EqualsTilde,
+    #[token("=")]
+    Equals,
+    #[token("::")]
+    ColonColon,
+    #[token(":")]
+    Colon,
+    #[token("$")]
+    Dollar,
+    #[token(";")]
+    Semicolon,
+    #[token("!=")]
+    ExclamationEquals,
+    #[token("!~")]
+    ExclamationTilde,
+    #[token("!")]
+    Exclamation,
+    #[token("&&")]
+    AmpersandAmpersand,
+    #[token("&")]
+    Ampersand,
+    #[token(",")]
+    Comma,
+    #[token("?")]
+    QuestionMark,
+    #[token("^")]
+    Caret,
+    #[token("@")]
+    At,
+    #[token("||")]
+    PipePipe,
+    #[token("|")]
+    Pipe,
+    #[token("o>")]
+    OutGreaterThan,
+    #[token("o>>")]
+    OutGreaterGreaterThan,
+    #[token("e>")]
+    ErrGreaterThan,
+    #[token("e>>")]
+    ErrGreaterGreaterThan,
+    #[token("o+e>")]
+    OutErrGreaterThan,
+    #[token("o+e>>")]
+    OutErrGreaterGreaterThan,
+    #[token("e>|")]
+    ErrGreaterThanPipe,
+    #[token("o+e>|")]
+    OutErrGreaterThanPipe,
+    /// End of file, doesn't match any syntax, but source code always ends with it
+    Eof,
+}
+
+#[cfg(test)]
+mod test {
+    // Lexer tests useful for smaller sources, errors and corner cases
+    use crate::compiler::Span;
+    use crate::lexer::{lex, Token};
+
+    fn test_lex(src: &[u8], expected_tokens: &[(Token, Span)], expected_result: Result<(), ()>) {
+        let (mut actual_tokens, actual_result) = lex(src, 0);
+
+        assert_eq!(expected_result, actual_result, "Lexing result mismatch");
+
+        for (i, expected) in expected_tokens.iter().enumerate() {
+            let actual = actual_tokens.peek();
+            assert_eq!(expected, &actual, "Mismatch in token {}", i);
+            actual_tokens.advance();
+        }
+    }
+
+    fn span(start: usize, end: usize) -> Span {
+        Span { start, end }
+    }
+
+    #[test]
+    fn lex_last_eof() {
+        test_lex(b"", &[(Token::Eof, span(0, 0))], Ok(()));
+    }
+
+    #[test]
+    fn lex_unmatched_string() {
+        // TODO: Make unmatched delimiters nicer
+        test_lex(b"'unmatched string", &[(Token::Eof, span(17, 17))], Err(()));
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 39ebde3..7a28cdf 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,10 +1,9 @@
 pub mod compiler;
 pub mod errors;
-pub mod naming;
+pub mod lexer;
 pub mod parser;
 pub mod protocol;
 pub mod resolver;
 #[cfg(test)]
 mod test;
-mod token;
 pub mod typechecker;
diff --git a/src/main.rs b/src/main.rs
index 68aaad6..60b3ac8 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,6 +1,7 @@
 use std::process::exit;
 
 use new_nu_parser::compiler::Compiler;
+use new_nu_parser::lexer::lex;
 use new_nu_parser::parser::Parser;
 use new_nu_parser::resolver::Resolver;
 use new_nu_parser::typechecker::Typechecker;
@@ -30,7 +31,19 @@ fn main() {
     let span_offset = compiler.span_offset();
     compiler.add_file(&fname, &contents);
 
-    let parser = Parser::new(compiler, span_offset);
+    let (tokens, err) = lex(&contents, span_offset);
+    if let Err(e) = err {
+        tokens.eprint(&compiler.source);
+        eprintln!("Lexing error. Error: {:?}", e);
+        exit(1);
+    }
+
+    if do_print {
+        tokens.print(&compiler.source);
+    }
+
+    let parser = Parser::new(compiler, tokens);
+
     compiler = parser.parse();
 
     if do_print {
diff --git a/src/naming.rs b/src/naming.rs
deleted file mode 100644
index 685e894..0000000
--- a/src/naming.rs
+++ /dev/null
@@ -1,33 +0,0 @@
-//! Naming is hard
-//!
-//! The intent for this file is to store all naming-related code in one place.
- -pub const STRING_STRICT: BarewordContext = BarewordContext { - as_string: true, - strictness: NameStrictness::Strict, -}; - -pub const NAME_STRICT: BarewordContext = BarewordContext { - as_string: false, - strictness: NameStrictness::Strict, -}; - -/// Defines how barewords should be handled when parsing expressions -#[derive(Debug, Copy, Clone)] -pub struct BarewordContext { - /// Bareword is a string (e.g., in `[ a b c ]`) - pub as_string: bool, - /// Which characters are allowed / forbidden for the bareeword - pub strictness: NameStrictness, -} - -/// Defines which characters are allowed for names and barewords -/// -/// All of thee variants disallow whitespace -#[derive(Debug, Copy, Clone)] -pub enum NameStrictness { - /// Only letters and '_' are allowed (no punctuation) - Strict, - /// All characters except those in the list are allowed - AllCharsExcept(&'static [u8]), -} diff --git a/src/parser.rs b/src/parser.rs index 781f4bd..d990281 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1,16 +1,12 @@ use crate::compiler::{Compiler, RollbackPoint, Span}; use crate::errors::{Severity, SourceError}; -use crate::naming::{BarewordContext, NameStrictness, NAME_STRICT, STRING_STRICT}; -use crate::token::{Token, TokenType}; +use crate::lexer::{Token, Tokens}; use tracy_client::span; pub struct Parser { pub compiler: Compiler, - pub span_offset: usize, - content_length: usize, - next_token: Option, - next_offset: usize, + tokens: Tokens, } #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -48,6 +44,14 @@ pub enum ParamsContext { Pipes, } +#[derive(Debug)] +pub enum BarewordContext { + /// Bareword is a string (e.g., in a list) + String, + /// Bareword is a name (e.g., in a call position) + Call, +} + // TODO: All nodes with Vec<...> should be moved to their own ID (like BlockId) to allow Copy trait #[derive(Debug, PartialEq, Clone)] pub enum AstNode { @@ -212,23 +216,12 @@ impl AstNode { } impl Parser { - pub fn new(compiler: Compiler, span_offset: usize) -> Self { - let content_length = compiler.source.len() - span_offset; - Self { - compiler, - content_length, - span_offset, - next_token: None, - next_offset: span_offset, - } + pub fn new(compiler: Compiler, tokens: Tokens) -> Self { + Self { compiler, tokens } } fn position(&mut self) -> usize { - if let Some(Token { span_start, .. }) = self.peek() { - span_start - } else { - self.content_length - } + self.tokens.peek_span().start } fn get_span_end(&self, node_id: NodeId) -> usize { @@ -271,13 +264,9 @@ impl Parser { // } // Otherwise assume a math expression - let mut leftmost = self.simple_expression(NAME_STRICT); + let mut leftmost = self.simple_expression(BarewordContext::Call); - if let Some(Token { - token_type: TokenType::Equals, - .. 
- }) = self.peek() - { + if self.is_equals() { if !allow_assignment { self.error("assignment found in expression"); } @@ -318,7 +307,7 @@ impl Parser { } let rhs = if self.is_simple_expression() { - self.simple_expression(NAME_STRICT) + self.simple_expression(BarewordContext::Call) } else { self.error("incomplete math expression") }; @@ -370,37 +359,44 @@ impl Parser { pub fn simple_expression(&mut self, bareword_context: BarewordContext) -> NodeId { let _span = span!(); + + // skip comments and newlines + while self.is_comment() || self.is_newline() { + self.tokens.advance(); + } + let span_start = self.position(); - let mut expr = if self.is_lcurly() { - self.record_or_closure() - } else if self.is_lparen() { - self.lparen(); - let output = self.expression(); - self.rparen(); - output - } else if self.is_lsquare() { - self.list_or_table() - } else if self.is_keyword(b"true") || self.is_keyword(b"false") { - self.boolean() - } else if self.is_keyword(b"null") { - self.null() - } else if self.is_string() { - self.string() - } else if self.is_number() { - self.number() - } else if self.is_dollar() { - self.variable() - } else if self.is_bareword(bareword_context.strictness) { - if bareword_context.as_string { - let node_id = self.bareword(bareword_context.strictness); - self.compiler.ast_nodes[node_id.0] = AstNode::String; - node_id - } else { - self.call() + let (token, span) = self.tokens.peek(); + + let mut expr = match token { + Token::LCurly => self.record_or_closure(), + Token::LParen => { + self.tokens.advance(); + let output = self.expression(); + self.rparen(); + output } - } else { - self.error("incomplete expression") + Token::LSquare => self.list_or_table(), + Token::Int => self.advance_node(AstNode::Int, span), + Token::Float => self.advance_node(AstNode::Float, span), + Token::DoubleQuotedString => self.advance_node(AstNode::String, span), + Token::SingleQuotedString => self.advance_node(AstNode::String, span), + Token::Dollar => self.variable(), + Token::Bareword => match self.compiler.get_span_contents_manual(span.start, span.end) { + b"true" => self.advance_node(AstNode::True, span), + b"false" => self.advance_node(AstNode::False, span), + b"null" => self.advance_node(AstNode::Null, span), + _ => match bareword_context { + BarewordContext::String => { + let node_id = self.name(); + self.compiler.ast_nodes[node_id.0] = AstNode::String; + node_id + } + BarewordContext::Call => self.call(), + }, + }, + _ => self.error("incomplete expression"), }; loop { @@ -408,7 +404,7 @@ impl Parser { return expr; } else if self.is_dotdot() { // Range - self.next(); + self.tokens.advance(); if self.is_horizontal_space() { // TODO: implement range from @@ -417,7 +413,7 @@ impl Parser { self.error("incomplete range"); return expr; } else { - let rhs = self.simple_expression(STRING_STRICT); + let rhs = self.simple_expression(BarewordContext::String); let span_end = self.get_span_end(rhs); expr = @@ -425,19 +421,16 @@ impl Parser { } } else if self.is_dot() { // Member access - self.next(); + self.tokens.advance(); if self.is_horizontal_space() { self.error("missing path name"); return expr; } - let prev_offset = self.span_offset; - let name = self.name(); let field_or_call = if self.is_lparen() { - self.span_offset = prev_offset; self.variable() } else { name @@ -465,106 +458,48 @@ impl Parser { } } - pub fn number(&mut self) -> NodeId { - match self.peek() { - Some(Token { - token_type: TokenType::Number, - span_start, - span_end, - }) => { - self.next(); - let contents = 
&self.compiler.source[span_start..span_end]; - - if contents.contains(&b'.') { - self.create_node(AstNode::Float, span_start, span_end) - } else { - self.create_node(AstNode::Int, span_start, span_end) - } - } - _ => self.error("expected: number"), - } - } - - pub fn boolean(&mut self) -> NodeId { - match self.peek() { - Some(Token { - token_type: TokenType::Name, - span_start, - span_end, - }) if &self.compiler.source[span_start..span_end] == b"true" => { - self.next(); - self.create_node(AstNode::True, span_start, span_end) - } - Some(Token { - token_type: TokenType::Name, - span_start, - span_end, - }) if &self.compiler.source[span_start..span_end] == b"false" => { - self.next(); - self.create_node(AstNode::False, span_start, span_end) - } - _ => self.error("expected: boolean"), - } - } - - pub fn null(&mut self) -> NodeId { - match self.peek() { - Some(Token { - token_type: TokenType::Name, - span_start, - span_end, - }) if &self.compiler.source[span_start..span_end] == b"null" => { - self.next(); - - self.create_node(AstNode::Null, span_start, span_end) - } - _ => self.error("expected: null"), - } + pub fn advance_node(&mut self, node: AstNode, span: Span) -> NodeId { + self.tokens.advance(); + self.create_node(node, span.start, span.end) } pub fn variable(&mut self) -> NodeId { if self.is_dollar() { let span_start = self.position(); + self.tokens.advance(); - self.next(); - let name = self - .next() - .expect("internal error: missing token that was expected to be there"); - let name_end = name.span_end; - self.create_node(AstNode::Variable, span_start, name_end) + if let (Token::Bareword, name_span) = self.tokens.peek() { + self.tokens.advance(); + self.create_node(AstNode::Variable, span_start, name_span.end) + } else { + self.error("variable name must be a bareword") + } } else { - self.error("expected variable") + self.error("expected variable starting with '$'") } } pub fn variable_decl(&mut self) -> NodeId { let _span = span!(); + + let span_start = self.position(); + if self.is_dollar() { - let span_start = self.position(); + self.tokens.advance(); + } - self.next(); - let name = self - .next() - .expect("internal error: missing token that was expected to be there"); - let name_end = name.span_end; - self.create_node(AstNode::Variable, span_start, name_end) - } else if self.is_name() { - let name = self - .next() - .expect("internal error: missing token that was expected to be there"); - let name_start = name.span_start; - let name_end = name.span_end; - self.create_node(AstNode::Variable, name_start, name_end) + if let (Token::Bareword, name_span) = self.tokens.peek() { + self.tokens.advance(); + self.create_node(AstNode::Variable, span_start, name_span.end) } else { - self.error("expected variable") + self.error("variable assignment name must be a bareword") } } pub fn call(&mut self) -> NodeId { let _span = span!(); - let mut parts = vec![self.bareword(NameStrictness::AllCharsExcept(&[]))]; + let mut parts = vec![self.call_name()]; let mut is_head = true; - // let mut args = vec![]; let span_start = self.position(); while self.has_tokens() { @@ -580,7 +515,7 @@ impl Parser { // TODO: Add flags is_head = false; - let arg_id = self.simple_expression(STRING_STRICT); + let arg_id = self.simple_expression(BarewordContext::String); parts.push(arg_id); } @@ -601,24 +536,24 @@ impl Parser { loop { if self.is_rsquare() { span_end = self.position(); - self.next(); + self.tokens.advance(); break; } else if self.is_comma() || self.is_newline() { // TODO: should we disallow `[,,,]`? 
- self.next(); + self.tokens.advance(); } else if self.is_semicolon() { if items.len() != 1 { self.error("semicolon to create table should immediately follow headers"); } else if !matches!(self.compiler.get_node(items[0]), AstNode::List(_)) { self.error_on_node("tables require a list for their headers", items[0]) } - self.next(); + self.tokens.advance(); is_table = true; } else if self.is_simple_expression() { - items.push(self.simple_expression(STRING_STRICT)); + items.push(self.simple_expression(BarewordContext::String)); } else { items.push(self.error("expected list item")); - if self.peek().is_none() { + if self.is_eof() { // prevent forever looping if there is no token to put the error on break; } @@ -651,7 +586,7 @@ impl Parser { let mut items = vec![]; self.lcurly(); - self.skip_space_and_newlines(); + self.skip_newlines(); // Explicit closure case if self.is_pipe() { @@ -672,28 +607,28 @@ impl Parser { let rollback_point = self.get_rollback_point(); loop { - self.skip_space_and_newlines(); + self.skip_newlines(); if self.is_rcurly() { self.rcurly(); span_end = self.position(); break; } - let key = self.simple_expression(STRING_STRICT); - self.skip_space_and_newlines(); + let key = self.simple_expression(BarewordContext::String); + self.skip_newlines(); if first_pass && !self.is_colon() { is_closure = true; break; } self.colon(); - self.skip_space_and_newlines(); - let val = self.simple_expression(STRING_STRICT); + self.skip_newlines(); + let val = self.simple_expression(BarewordContext::String); items.push((key, val)); first_pass = false; if self.is_comma() { self.comma() } - if self.peek().is_none() { + if self.is_eof() { // abort when appropriate break; } @@ -720,107 +655,33 @@ impl Parser { } pub fn operator(&mut self) -> NodeId { - match self.peek() { - Some(Token { - token_type, - span_start, - span_end, - .. 
- }) => match token_type { - TokenType::Plus => { - self.next(); - self.create_node(AstNode::Plus, span_start, span_end) - } - TokenType::PlusPlus => { - self.next(); - self.create_node(AstNode::Append, span_start, span_end) - } - TokenType::Dash => { - self.next(); - self.create_node(AstNode::Minus, span_start, span_end) - } - TokenType::Asterisk => { - self.next(); - self.create_node(AstNode::Multiply, span_start, span_end) - } - TokenType::ForwardSlash => { - self.next(); - self.create_node(AstNode::Divide, span_start, span_end) - } - TokenType::LessThan => { - self.next(); - self.create_node(AstNode::LessThan, span_start, span_end) - } - TokenType::LessThanEqual => { - self.next(); - self.create_node(AstNode::LessThanOrEqual, span_start, span_end) - } - TokenType::GreaterThan => { - self.next(); - self.create_node(AstNode::GreaterThan, span_start, span_end) - } - TokenType::GreaterThanEqual => { - self.next(); - self.create_node(AstNode::GreaterThanOrEqual, span_start, span_end) - } - TokenType::EqualsEquals => { - self.next(); - self.create_node(AstNode::Equal, span_start, span_end) - } - TokenType::ExclamationEquals => { - self.next(); - self.create_node(AstNode::NotEqual, span_start, span_end) - } - TokenType::AsteriskAsterisk => { - self.next(); - self.create_node(AstNode::Pow, span_start, span_end) - } - TokenType::AmpersandAmpersand => { - self.next(); - self.create_node(AstNode::And, span_start, span_end) - } - TokenType::PipePipe => { - self.next(); - self.create_node(AstNode::Or, span_start, span_end) - } - TokenType::Equals => { - self.next(); - self.create_node(AstNode::Assignment, span_start, span_end) - } - TokenType::PlusEquals => { - self.next(); - self.create_node(AstNode::AddAssignment, span_start, span_end) - } - TokenType::DashEquals => { - self.next(); - self.create_node(AstNode::SubtractAssignment, span_start, span_end) - } - TokenType::AsteriskEquals => { - self.next(); - self.create_node(AstNode::MultiplyAssignment, span_start, span_end) - } - TokenType::ForwardSlashEquals => { - self.next(); - self.create_node(AstNode::DivideAssignment, span_start, span_end) - } - TokenType::Name => { - let op = &self.compiler.source[span_start..span_end]; - match op { - b"and" => { - self.next(); - self.create_node(AstNode::And, span_start, span_end) - } - b"or" => { - self.next(); - self.create_node(AstNode::Or, span_start, span_end) - } - _ => self.error(format!( - "Unknown operator: '{}'", - String::from_utf8_lossy(op) - )), - } - } - _ => self.error("expected: operator"), + let (token, span) = self.tokens.peek(); + + match token { + Token::Plus => self.advance_node(AstNode::Plus, span), + Token::PlusPlus => self.advance_node(AstNode::Append, span), + Token::Dash => self.advance_node(AstNode::Minus, span), + Token::Asterisk => self.advance_node(AstNode::Multiply, span), + Token::ForwardSlash => self.advance_node(AstNode::Divide, span), + Token::LessThan => self.advance_node(AstNode::LessThan, span), + Token::LessThanEqual => self.advance_node(AstNode::LessThanOrEqual, span), + Token::GreaterThan => self.advance_node(AstNode::GreaterThan, span), + Token::GreaterThanEqual => self.advance_node(AstNode::GreaterThanOrEqual, span), + Token::EqualsEquals => self.advance_node(AstNode::Equal, span), + Token::ExclamationEquals => self.advance_node(AstNode::NotEqual, span), + Token::AsteriskAsterisk => self.advance_node(AstNode::Pow, span), + Token::Equals => self.advance_node(AstNode::Assignment, span), + Token::PlusEquals => self.advance_node(AstNode::AddAssignment, span), + 
Token::DashEquals => self.advance_node(AstNode::SubtractAssignment, span), + Token::AsteriskEquals => self.advance_node(AstNode::MultiplyAssignment, span), + Token::ForwardSlashEquals => self.advance_node(AstNode::DivideAssignment, span), + Token::Bareword => match self.compiler.get_span_contents_manual(span.start, span.end) { + b"and" => self.advance_node(AstNode::And, span), + b"or" => self.advance_node(AstNode::Or, span), + op => self.error(format!( + "Unknown operator: '{}'", + String::from_utf8_lossy(op) + )), }, _ => self.error("expected: operator"), } @@ -838,52 +699,45 @@ impl Parser { } pub fn string(&mut self) -> NodeId { - match self.peek() { - Some(Token { - token_type: TokenType::String, - span_start, - span_end, - .. - }) => { - self.next(); - self.create_node(AstNode::String, span_start, span_end) - } + match self.tokens.peek() { + (Token::DoubleQuotedString, span) => self.advance_node(AstNode::String, span), + (Token::SingleQuotedString, span) => self.advance_node(AstNode::String, span), _ => self.error("expected: string"), } } pub fn name(&mut self) -> NodeId { - match self.peek() { - Some(Token { - token_type: TokenType::Name, - span_start, - span_end, - .. - }) => { - self.next(); - self.create_node(AstNode::Name, span_start, span_end) - } - _ => self.error("expect name"), + match self.tokens.peek() { + (Token::Bareword, span) => self.advance_node(AstNode::Name, span), + _ => self.error("expected: name"), } } - pub fn bareword(&mut self, name_strictness: NameStrictness) -> NodeId { - match self.peek_bareword(name_strictness) { - Some(Token { - token_type: TokenType::Name, - span_start, - span_end, - .. - }) => { - self.next_bareword(name_strictness); - self.create_node(AstNode::Name, span_start, span_end) + pub fn call_name(&mut self) -> NodeId { + let (mut token, mut span) = self.tokens.peek(); + + loop { + if [Token::Eof, Token::Newline].contains(&token) { + break; } - _ => self.error("expect bareword"), + + self.tokens.advance(); + let (next_token, next_span) = self.tokens.peek(); + + if next_span.start > span.end { + // horizontal whitespace + break; + } + + token = next_token; + span.end = next_span.end; } + + self.create_node(AstNode::Name, span.start, span.end) } pub fn has_tokens(&mut self) -> bool { - self.peek().is_some() + self.tokens.peek_token() != Token::Eof } pub fn match_expression(&mut self) -> NodeId { @@ -892,7 +746,7 @@ impl Parser { let span_end; self.keyword(b"match"); - let target = self.simple_expression(STRING_STRICT); + let target = self.simple_expression(BarewordContext::String); let mut match_arms = vec![]; @@ -908,18 +762,18 @@ impl Parser { self.rcurly(); break; } else if self.is_simple_expression() { - let pattern = self.simple_expression(STRING_STRICT); + let pattern = self.simple_expression(BarewordContext::String); if !self.is_thick_arrow() { return self.error("expected thick arrow (=>) between match cases"); } - self.next(); + self.tokens.advance(); - let pattern_result = self.simple_expression(NAME_STRICT); + let pattern_result = self.simple_expression(BarewordContext::String); match_arms.push((pattern, pattern_result)); } else if self.is_newline() { - self.next(); + self.tokens.advance(); } else { return self.error("expected match arm in match"); } @@ -936,20 +790,15 @@ impl Parser { self.keyword(b"if"); let condition = self.expression(); - while self.is_newline() { - self.next(); - } - let then_block = self.block(BlockContext::Curlies); + self.skip_newlines(); - while self.is_newline() { - self.next(); - } + let then_block = 
self.block(BlockContext::Curlies); + self.skip_newlines(); let else_block = if self.is_keyword(b"else") { - self.next(); - while self.is_newline() { - self.next(); - } + self.tokens.advance(); + self.skip_newlines(); + let block = if self.is_keyword(b"if") { self.if_expression() } else if self.is_keyword(b"match") { @@ -1004,7 +853,7 @@ impl Parser { } if self.is_comma() { - self.next(); + self.tokens.advance(); continue; } @@ -1061,7 +910,7 @@ impl Parser { } if self.is_comma() { - self.next(); + self.tokens.advance(); continue; } @@ -1079,39 +928,33 @@ impl Parser { pub fn typename(&mut self) -> NodeId { let _span = span!(); - match self.peek() { - Some(Token { - token_type: TokenType::Name, - span_start, - span_end, - .. - }) => { - let name = self.name(); - let mut params = None; - if self.is_less_than() { - // We have generics - params = Some(self.type_params()); - } - - let optional = if self.is_question_mark() { - // We have an optional type - self.next(); - true - } else { - false - }; + if let (Token::Bareword, span) = self.tokens.peek() { + let name = self.name(); + let mut params = None; + if self.is_less_than() { + // We have generics + params = Some(self.type_params()); + } + + let optional = if self.is_question_mark() { + // We have an optional type + self.tokens.advance(); + true + } else { + false + }; - self.create_node( - AstNode::Type { - name, - params, - optional, - }, - span_start, - span_end, - ) - } - _ => self.error("expect name"), + self.create_node( + AstNode::Type { + name, + params, + optional, + }, + span.start, + span.end, + ) + } else { + self.error("expect name") } } @@ -1121,17 +964,11 @@ impl Parser { self.keyword(b"def"); - let name = match self.next() { - Some(Token { - token_type: TokenType::Name, - span_start, - span_end, - }) => self.create_node(AstNode::Name, span_start, span_end), - Some(Token { - token_type: TokenType::String, - span_start, - span_end, - }) => self.create_node(AstNode::String, span_start, span_end), + let name = match self.tokens.peek() { + (Token::Bareword, span) => self.advance_node(AstNode::Name, span), + (Token::DoubleQuotedString | Token::SingleQuotedString, span) => { + self.advance_node(AstNode::String, span) + } _ => return self.error("expected def name"), }; @@ -1152,6 +989,7 @@ impl Parser { ) } + // TODO: Deduplicate code between let/mut/const assignments pub fn let_statement(&mut self) -> NodeId { let _span = span!(); let is_mutable = false; @@ -1188,6 +1026,7 @@ impl Parser { ) } + // TODO: Deduplicate code between let/mut/const assignments pub fn mut_statement(&mut self) -> NodeId { let _span = span!(); let is_mutable = true; @@ -1226,20 +1065,13 @@ impl Parser { pub fn keyword(&mut self, keyword: &[u8]) { let _span = span!(); - match self.peek() { - Some(Token { - token_type: TokenType::Name, - span_start, - span_end, - }) if &self.compiler.source[span_start..span_end] == keyword => { - self.next(); - } - _ => { - self.error(format!( - "expected keyword: {}", - String::from_utf8_lossy(keyword) - )); - } + if self.is_keyword(keyword) { + self.tokens.advance(); + } else { + self.error(format!( + "expected keyword: {}", + String::from_utf8_lossy(keyword) + )); } } @@ -1259,8 +1091,8 @@ impl Parser { } else if self.is_rcurly() && context == BlockContext::Closure { // not responsible for parsing it, yield back to the closure pass break; - } else if self.is_semicolon() || self.is_newline() { - self.next(); + } else if self.is_semicolon() || self.is_newline() || self.is_comment() { + self.tokens.advance(); continue; } else 
if self.is_keyword(b"def") { code_body.push(self.def_statement()); @@ -1287,7 +1119,7 @@ impl Parser { if self.is_semicolon() { // This is a statement, not an expression - self.next(); + self.tokens.advance(); code_body.push(self.create_node( AstNode::Statement(expression), exp_span_start, @@ -1329,7 +1161,7 @@ impl Parser { let variable = self.variable_decl(); self.keyword(b"in"); - let range = self.simple_expression(NAME_STRICT); + let range = self.simple_expression(BarewordContext::String); let block = self.block(BlockContext::Curlies); let span_end = self.get_span_end(block); @@ -1392,320 +1224,156 @@ impl Parser { } pub fn is_operator(&mut self) -> bool { - match self.peek() { - Some(Token { - token_type: TokenType::Name, - span_start, - span_end, - }) => { - &self.compiler.source[span_start..span_end] == b"and" - || &self.compiler.source[span_start..span_end] == b"or" + let (token, span) = self.tokens.peek(); + + match token { + Token::Plus + | Token::PlusPlus + | Token::Dash + | Token::Asterisk + | Token::ForwardSlash + | Token::LessThan + | Token::LessThanEqual + | Token::GreaterThan + | Token::GreaterThanEqual + | Token::EqualsEquals + | Token::ExclamationEquals + | Token::AsteriskAsterisk + | Token::Equals + | Token::PlusEquals + | Token::DashEquals + | Token::AsteriskEquals + | Token::ForwardSlashEquals => true, + Token::Bareword => { + let op = self.compiler.get_span_contents_manual(span.start, span.end); + op == b"and" || op == b"or" } - Some(Token { token_type, .. }) => matches!( - token_type, - TokenType::Asterisk - | TokenType::AsteriskAsterisk - | TokenType::Dash - | TokenType::EqualsEquals - | TokenType::ExclamationEquals - | TokenType::ForwardSlash - | TokenType::LessThan - | TokenType::LessThanEqual - | TokenType::Plus - | TokenType::PlusPlus - | TokenType::GreaterThan - | TokenType::GreaterThanEqual - | TokenType::AmpersandAmpersand - | TokenType::PipePipe - | TokenType::Equals - | TokenType::PlusEquals - | TokenType::DashEquals - | TokenType::AsteriskEquals - | TokenType::ForwardSlashEquals - ), _ => false, } } + pub fn is_equals(&mut self) -> bool { + self.tokens.peek_token() == Token::Equals + } + pub fn is_comma(&mut self) -> bool { - matches!( - self.peek(), - Some(Token { - token_type: TokenType::Comma, - .. - }) - ) + self.tokens.peek_token() == Token::Comma } pub fn is_lcurly(&mut self) -> bool { - matches!( - self.peek(), - Some(Token { - token_type: TokenType::LCurly, - .. - }) - ) + self.tokens.peek_token() == Token::LCurly } pub fn is_rcurly(&mut self) -> bool { - matches!( - self.peek(), - Some(Token { - token_type: TokenType::RCurly, - .. - }) - ) + self.tokens.peek_token() == Token::RCurly } pub fn is_lparen(&mut self) -> bool { - matches!( - self.peek(), - Some(Token { - token_type: TokenType::LParen, - .. - }) - ) + self.tokens.peek_token() == Token::LParen } pub fn is_rparen(&mut self) -> bool { - matches!( - self.peek(), - Some(Token { - token_type: TokenType::RParen, - .. - }) - ) + self.tokens.peek_token() == Token::RParen } pub fn is_lsquare(&mut self) -> bool { - matches!( - self.peek(), - Some(Token { - token_type: TokenType::LSquare, - .. - }) - ) + self.tokens.peek_token() == Token::LSquare } pub fn is_rsquare(&mut self) -> bool { - matches!( - self.peek(), - Some(Token { - token_type: TokenType::RSquare, - .. - }) - ) + self.tokens.peek_token() == Token::RSquare } pub fn is_less_than(&mut self) -> bool { - matches!( - self.peek(), - Some(Token { - token_type: TokenType::LessThan, - .. 
- }) - ) + self.tokens.peek_token() == Token::LessThan } pub fn is_greater_than(&mut self) -> bool { - matches!( - self.peek(), - Some(Token { - token_type: TokenType::GreaterThan, - .. - }) - ) + self.tokens.peek_token() == Token::GreaterThan } pub fn is_pipe(&mut self) -> bool { - matches!( - self.peek(), - Some(Token { - token_type: TokenType::Pipe, - .. - }) - ) + self.tokens.peek_token() == Token::Pipe } pub fn is_dollar(&mut self) -> bool { - matches!( - self.peek(), - Some(Token { - token_type: TokenType::Dollar, - .. - }) - ) + self.tokens.peek_token() == Token::Dollar + } + + pub fn is_comment(&mut self) -> bool { + self.tokens.peek_token() == Token::Comment } pub fn is_question_mark(&mut self) -> bool { - matches!( - self.peek(), - Some(Token { - token_type: TokenType::QuestionMark, - .. - }) - ) + self.tokens.peek_token() == Token::QuestionMark } pub fn is_thin_arrow(&mut self) -> bool { - matches!( - self.peek(), - Some(Token { - token_type: TokenType::ThinArrow, - .. - }) - ) + self.tokens.peek_token() == Token::ThinArrow } pub fn is_thick_arrow(&mut self) -> bool { - matches!( - self.peek(), - Some(Token { - token_type: TokenType::ThickArrow, - .. - }) - ) + self.tokens.peek_token() == Token::ThickArrow } - // pub fn is_double_pipe(&mut self) -> bool { - // matches!( - // self.peek(), - // Some(Token { - // token_type: TokenType::PipePipe, - // .. - // }) - // ) - // } - - // pub fn is_double_ampersand(&mut self) -> bool { - // matches!( - // self.peek(), - // Some(Token { - // token_type: TokenType::AmpersandAmpersand, - // .. - // }) - // ) - // } - - // pub fn is_dash(&mut self) -> bool { - // matches!( - // self.peek(), - // Some(Token { - // token_type: TokenType::Dash, - // .. - // }) - // ) - // } - pub fn is_colon(&mut self) -> bool { - matches!( - self.peek(), - Some(Token { - token_type: TokenType::Colon, - .. - }) - ) + self.tokens.peek_token() == Token::Colon } pub fn is_newline(&mut self) -> bool { - matches!( - self.peek(), - Some(Token { - token_type: TokenType::Newline, - .. - }) - ) + self.tokens.peek_token() == Token::Newline } pub fn is_semicolon(&mut self) -> bool { - matches!( - self.peek(), - Some(Token { - token_type: TokenType::Semicolon, - .. - }) - ) + self.tokens.peek_token() == Token::Semicolon } pub fn is_dot(&mut self) -> bool { - matches!( - self.peek(), - Some(Token { - token_type: TokenType::Dot, - .. - }) - ) + self.tokens.peek_token() == Token::Dot } pub fn is_dotdot(&mut self) -> bool { - matches!( - self.peek(), - Some(Token { - token_type: TokenType::DotDot, - .. - }) - ) + self.tokens.peek_token() == Token::DotDot } pub fn is_coloncolon(&mut self) -> bool { - matches!( - self.peek(), - Some(Token { - token_type: TokenType::ColonColon, - .. - }) - ) + self.tokens.peek_token() == Token::ColonColon } - pub fn is_number(&mut self) -> bool { - matches!( - self.peek(), - Some(Token { - token_type: TokenType::Number, - .. - }) - ) + pub fn is_int(&mut self) -> bool { + self.tokens.peek_token() == Token::Int + } + + pub fn is_float(&mut self) -> bool { + self.tokens.peek_token() == Token::Float } pub fn is_string(&mut self) -> bool { - matches!( - self.peek(), - Some(Token { - token_type: TokenType::String, - .. 
- }) - ) + self.tokens.peek_token() == Token::DoubleQuotedString + || self.tokens.peek_token() == Token::SingleQuotedString } pub fn is_keyword(&mut self, keyword: &[u8]) -> bool { - matches!( - self.peek(), - Some(Token { - token_type: TokenType::Name, - span_start, - span_end, - }) if &self.compiler.source[span_start..span_end] == keyword - ) + if let (Token::Bareword, span) = self.tokens.peek() { + self.compiler.get_span_contents_manual(span.start, span.end) == keyword + } else { + false + } } pub fn is_name(&mut self) -> bool { - matches!( - self.peek(), - Some(Token { - token_type: TokenType::Name, - .. - }) - ) + self.tokens.peek_token() == Token::Bareword } - pub fn is_bareword(&mut self, name_strictness: NameStrictness) -> bool { - matches!( - self.peek_bareword(name_strictness), - Some(Token { - token_type: TokenType::Name, - .. - }) - ) + pub fn is_eof(&mut self) -> bool { + self.tokens.peek_token() == Token::Eof + } + + pub fn is_horizontal_space(&self) -> bool { + let span_position = self.tokens.peek_span().start; + let whitespace: &[u8] = b" \t"; + + span_position > 0 && whitespace.contains(&self.compiler.source[span_position - 1]) } pub fn is_expression(&mut self) -> bool { @@ -1716,56 +1384,18 @@ impl Parser { } pub fn is_simple_expression(&mut self) -> bool { - match self.peek() { - Some(Token { - token_type: TokenType::Number, - .. - }) - | Some(Token { - token_type: TokenType::String, - .. - }) - | Some(Token { - token_type: TokenType::LCurly, - .. - }) - | Some(Token { - token_type: TokenType::LSquare, - .. - }) - | Some(Token { - token_type: TokenType::LParen, - .. - }) => true, - Some(Token { - token_type: TokenType::Dot, - .. - }) => true, - Some(Token { - token_type: TokenType::Dollar, - .. - }) => true, - Some(Token { - token_type: TokenType::Name, - span_start, - span_end, - }) if &self.compiler.source[span_start..span_end] == b"true" => true, - Some(Token { - token_type: TokenType::Name, - span_start, - span_end, - }) if &self.compiler.source[span_start..span_end] == b"false" => true, - Some(Token { - token_type: TokenType::Name, - span_start, - span_end, - }) if &self.compiler.source[span_start..span_end] == b"null" => true, - Some(Token { - token_type: TokenType::Name, - .. - }) => true, - _ => false, - } + self.is_string() + || self.is_int() + || self.is_float() + || self.is_lcurly() + || self.is_lsquare() + || self.is_lparen() + || self.is_dot() + || self.is_dollar() + || self.is_keyword(b"true") + || self.is_keyword(b"false") + || self.is_keyword(b"null") + || self.is_name() } pub fn error_on_node(&mut self, message: impl Into, node_id: NodeId) { @@ -1777,31 +1407,20 @@ impl Parser { } pub fn error(&mut self, message: impl Into) -> NodeId { - if let Some(Token { - span_start, - span_end, - .. 
- }) = self.next() - { - let node_id = self.create_node(AstNode::Garbage, span_start, span_end); - self.compiler.errors.push(SourceError { - message: message.into(), - node_id, - severity: Severity::Error, - }); - - node_id - } else { - let node_id = - self.create_node(AstNode::Garbage, self.content_length, self.content_length); - self.compiler.errors.push(SourceError { - message: message.into(), - node_id, - severity: Severity::Error, - }); - - node_id + let (token, span) = self.tokens.peek(); + + if token != Token::Eof { + self.tokens.advance(); } + + let node_id = self.create_node(AstNode::Garbage, span.start, span.end); + self.compiler.errors.push(SourceError { + message: message.into(), + node_id, + severity: Severity::Error, + }); + + node_id } pub fn create_node(&mut self, ast_node: AstNode, span_start: usize, span_end: usize) -> NodeId { @@ -1813,844 +1432,113 @@ impl Parser { } pub fn lparen(&mut self) { - match self.peek() { - Some(Token { - token_type: TokenType::LParen, - .. - }) => { - self.next(); - } - _ => { - self.error("expected: left paren '('"); - } + if self.is_lparen() { + self.tokens.advance(); + } else { + self.error("expected: left paren '('"); } } pub fn rparen(&mut self) { - match self.peek() { - Some(Token { - token_type: TokenType::RParen, - .. - }) => { - self.next(); - } - _ => { - self.error("expected: right paren ')'"); - } + if self.is_rparen() { + self.tokens.advance(); + } else { + self.error("expected: right paren ')'"); } } pub fn lsquare(&mut self) { - match self.peek() { - Some(Token { - token_type: TokenType::LSquare, - .. - }) => { - self.next(); - } - _ => { - self.error("expected: left bracket '['"); - } + if self.is_lsquare() { + self.tokens.advance(); + } else { + self.error("expected: left bracket '['"); } } pub fn rsquare(&mut self) { - match self.peek() { - Some(Token { - token_type: TokenType::RSquare, - .. - }) => { - self.next(); - } - _ => { - self.error("expected: right bracket ']'"); - } + if self.is_rsquare() { + self.tokens.advance(); + } else { + self.error("expected: right bracket ']'"); } } pub fn lcurly(&mut self) { - match self.peek() { - Some(Token { - token_type: TokenType::LCurly, - .. - }) => { - self.next(); - } - _ => { - self.error("expected: left bracket '{'"); - } + if self.is_lcurly() { + self.tokens.advance(); + } else { + self.error("expected: left bracket '{'"); } } pub fn rcurly(&mut self) { - match self.peek() { - Some(Token { - token_type: TokenType::RCurly, - .. - }) => { - self.next(); - } - _ => { - self.error("expected: right bracket '}'"); - } + if self.is_rcurly() { + self.tokens.advance(); + } else { + self.error("expected: right bracket '}'"); } } pub fn pipe(&mut self) { - match self.peek() { - Some(Token { - token_type: TokenType::Pipe, - .. - }) => { - self.next(); - } - _ => { - self.error("expected: pipe symbol '|'"); - } + if self.is_pipe() { + self.tokens.advance(); + } else { + self.error("expected: pipe symbol '|'"); } } pub fn less_than(&mut self) { - match self.peek() { - Some(Token { - token_type: TokenType::LessThan, - .. - }) => { - self.next(); - } - _ => { - self.error("expected: less than/left angle bracket '<'"); - } + if self.is_less_than() { + self.tokens.advance(); + } else { + self.error("expected: less than/left angle bracket '<'"); } } pub fn greater_than(&mut self) { - match self.peek() { - Some(Token { - token_type: TokenType::GreaterThan, - .. 
- }) => { - self.next(); - } - _ => { - self.error("expected: greater than/right angle bracket '>'"); - } + if self.is_greater_than() { + self.tokens.advance(); + } else { + self.error("expected: greater than/right angle bracket '>'"); } } pub fn equals(&mut self) { - match self.peek() { - Some(Token { - token_type: TokenType::Equals, - .. - }) => { - self.next(); - } - _ => { - self.error("expected: equals '='"); - } + if self.is_equals() { + self.tokens.advance(); + } else { + self.error("expected: equals '='"); } } pub fn colon(&mut self) { - match self.peek() { - Some(Token { - token_type: TokenType::Colon, - .. - }) => { - self.next(); - } - _ => { - self.error("expected: colon ':'"); - } - } - } - pub fn comma(&mut self) { - match self.peek() { - Some(Token { - token_type: TokenType::Comma, - .. - }) => { - self.next(); - } - _ => { - self.error("expected: comma ','"); - } - } - } - - pub fn lex_quoted_string(&mut self) -> Option { - let span_start = self.span_offset; - let mut span_position = span_start + 1; - let mut is_escaped = false; - while span_position < self.compiler.source.len() { - if is_escaped { - is_escaped = false; - } else if self.compiler.source[span_position] == b'\\' { - is_escaped = true; - } else if self.compiler.source[span_position] == b'"' - || self.compiler.source[span_position] == b'\'' - { - span_position += 1; - break; - } - span_position += 1; - } - - self.span_offset = span_position; - - Some(Token { - token_type: TokenType::String, - span_start, - span_end: self.span_offset, - }) - } - - pub fn lex_number(&mut self) -> Option { - let span_start = self.span_offset; - let mut span_position = span_start; - while span_position < self.compiler.source.len() { - if !self.compiler.source[span_position].is_ascii_digit() { - break; - } - span_position += 1; - } - - // Check to see if we have a hex/octal/binary number - if span_position < self.compiler.source.len() && self.compiler.source[span_position] == b'x' - { - span_position += 1; - while span_position < self.compiler.source.len() { - if !self.compiler.source[span_position].is_ascii_hexdigit() { - break; - } - span_position += 1; - } - } else if span_position < self.compiler.source.len() - && self.compiler.source[span_position] == b'o' - { - span_position += 1; - while span_position < self.compiler.source.len() { - if !(self.compiler.source[span_position] >= b'0' - && self.compiler.source[span_position] <= b'7') - { - break; - } - span_position += 1; - } - } else if span_position < self.compiler.source.len() - && self.compiler.source[span_position] == b'b' - { - span_position += 1; - while span_position < self.compiler.source.len() { - if !(self.compiler.source[span_position] >= b'0' - && self.compiler.source[span_position] <= b'1') - { - break; - } - span_position += 1; - } - } else if span_position < self.compiler.source.len() - && self.compiler.source[span_position] == b'.' 
- && (span_position + 1 < self.compiler.source.len()) - && self.compiler.source[span_position + 1].is_ascii_digit() - { - // Looks like a float - span_position += 1; - while span_position < self.compiler.source.len() { - if !self.compiler.source[span_position].is_ascii_digit() { - break; - } - span_position += 1; - } - - if span_position < self.compiler.source.len() - && (self.compiler.source[span_position] == b'e' - || self.compiler.source[span_position] == b'E') - { - span_position += 1; - - if span_position < self.compiler.source.len() - && self.compiler.source[span_position] == b'-' - { - span_position += 1; - } - - while span_position < self.compiler.source.len() { - if !self.compiler.source[span_position].is_ascii_digit() { - break; - } - span_position += 1; - } - } - } - - self.span_offset = span_position; - - Some(Token { - token_type: TokenType::Number, - span_start, - span_end: self.span_offset, - }) - } - - pub fn is_horizontal_space(&self) -> bool { - let span_position = self.span_offset; - let whitespace: &[u8] = b" \t"; - - span_position < self.compiler.source.len() - && whitespace.contains(&self.compiler.source[span_position]) - } - - pub fn skip_space(&mut self) { - let mut span_position = self.span_offset; - let whitespace: &[u8] = b" \t"; - while span_position < self.compiler.source.len() { - if !whitespace.contains(&self.compiler.source[span_position]) { - break; - } - span_position += 1; - } - self.span_offset = span_position; - } - - pub fn skip_space_and_newlines(&mut self) { - self.skip_space(); - while self.newline().is_some() { - self.skip_space(); - } - } - - pub fn newline(&mut self) -> Option { - if matches!( - self.next_token, - Some(Token { - token_type: TokenType::Newline, - .. - }) - ) { - let token = self.next_token; - self.next_token = None; - self.span_offset = self.next_offset; - token + if self.is_colon() { + self.tokens.advance(); } else { - let mut span_position = self.span_offset; - let whitespace: &[u8] = b"\r\n"; - while span_position < self.compiler.source.len() { - if !whitespace.contains(&self.compiler.source[span_position]) { - break; - } - span_position += 1; - } - - if self.span_offset == span_position { - None - } else { - let output = Some(Token { - token_type: TokenType::Newline, - span_start: self.span_offset, - span_end: span_position, - }); - self.span_offset = span_position; - output - } - } - } - - pub fn skip_comment(&mut self) { - let mut span_position = self.span_offset; - while span_position < self.compiler.source.len() - && self.compiler.source[span_position] != b'\n' - { - span_position += 1; - } - self.span_offset = span_position; - } - - /// More relaxed name lexing - pub fn lex_bareword(&mut self, name_strictness: NameStrictness) -> Option { - let span_start = self.span_offset; - let mut span_position = span_start; - - match name_strictness { - NameStrictness::Strict => { - while span_position < self.compiler.source.len() - && !self.compiler.source[span_position].is_ascii_whitespace() - && (!self.compiler.source[span_position].is_ascii_punctuation() - || self.compiler.source[span_position] == b'_') - { - span_position += 1; - } - } - NameStrictness::AllCharsExcept(chars) => { - while span_position < self.compiler.source.len() - && !self.compiler.source[span_position].is_ascii_whitespace() - && !chars.contains(&self.compiler.source[span_position]) - { - span_position += 1; - } - } + self.error("expected: colon ':'"); } - assert_ne!( - self.span_offset, span_position, - "lex_bareword did not consume any bytes" - ); - 
self.span_offset = span_position; - - Some(Token { - token_type: TokenType::Name, - span_start, - span_end: self.span_offset, - }) - } - - fn lex_redirect_symbol(&mut self) -> Option { - let span_start = self.span_offset; - let content = &self.compiler.source[span_start..]; - let redirect_tokens: [(&[u8], TokenType); 8] = [ - (b"o>", TokenType::OutGreaterThan), - (b"o>>", TokenType::OutGreaterGreaterThan), - (b"e>", TokenType::ErrGreaterThan), - (b"e>>", TokenType::ErrGreaterGreaterThan), - (b"o+e>", TokenType::OutErrGreaterThan), - (b"o+e>>", TokenType::OutErrGreaterGreaterThan), - (b"e>|", TokenType::ErrGreaterThanPipe), - (b"o+e>|", TokenType::OutErrGreaterThanPipe), - ]; - for (bytes, token_type) in redirect_tokens { - if content.starts_with(bytes) { - let result = Token { - token_type, - span_start, - span_end: span_start + bytes.len(), - }; - self.span_offset = result.span_end; - return Some(result); - } - } - None - } - - pub fn lex_symbol(&mut self) -> Option { - // try span redirection symbol first. - let result = self.lex_redirect_symbol(); - if result.is_some() { - return result; - } - - let span_start = self.span_offset; - let result = match self.compiler.source[span_start] { - b'(' => Token { - token_type: TokenType::LParen, - span_start, - span_end: span_start + 1, - }, - b'[' => Token { - token_type: TokenType::LSquare, - span_start, - span_end: span_start + 1, - }, - b'{' => Token { - token_type: TokenType::LCurly, - span_start, - span_end: span_start + 1, - }, - b'<' => { - if self.span_offset < (self.compiler.source.len() - 1) - && self.compiler.source[self.span_offset + 1] == b'=' - { - Token { - token_type: TokenType::LessThanEqual, - span_start, - span_end: span_start + 2, - } - } else { - Token { - token_type: TokenType::LessThan, - span_start, - span_end: span_start + 1, - } - } - } - b')' => Token { - token_type: TokenType::RParen, - span_start, - span_end: span_start + 1, - }, - b']' => Token { - token_type: TokenType::RSquare, - span_start, - span_end: span_start + 1, - }, - b'}' => Token { - token_type: TokenType::RCurly, - span_start, - span_end: span_start + 1, - }, - b'>' => { - if self.span_offset < (self.compiler.source.len() - 1) - && self.compiler.source[self.span_offset + 1] == b'=' - { - Token { - token_type: TokenType::GreaterThanEqual, - span_start, - span_end: span_start + 2, - } - } else { - Token { - token_type: TokenType::GreaterThan, - span_start, - span_end: span_start + 1, - } - } - } - b'+' => { - if self.span_offset < (self.compiler.source.len() - 1) - && self.compiler.source[self.span_offset + 1] == b'+' - { - Token { - token_type: TokenType::PlusPlus, - span_start, - span_end: span_start + 2, - } - } else if self.span_offset < (self.compiler.source.len() - 1) - && self.compiler.source[self.span_offset + 1] == b'=' - { - Token { - token_type: TokenType::PlusEquals, - span_start, - span_end: span_start + 2, - } - } else { - Token { - token_type: TokenType::Plus, - span_start, - span_end: span_start + 1, - } - } - } - b'-' => { - if self.span_offset < (self.compiler.source.len() - 1) - && self.compiler.source[self.span_offset + 1] == b'>' - { - Token { - token_type: TokenType::ThinArrow, - span_start, - span_end: span_start + 2, - } - } else if self.span_offset < (self.compiler.source.len() - 1) - && self.compiler.source[self.span_offset + 1] == b'=' - { - Token { - token_type: TokenType::DashEquals, - span_start, - span_end: span_start + 2, - } - } else { - Token { - token_type: TokenType::Dash, - span_start, - span_end: span_start + 1, - } - 
} - } - b'*' => { - if self.span_offset < (self.compiler.source.len() - 1) - && self.compiler.source[self.span_offset + 1] == b'*' - { - Token { - token_type: TokenType::AsteriskAsterisk, - span_start, - span_end: span_start + 2, - } - } else if self.span_offset < (self.compiler.source.len() - 1) - && self.compiler.source[self.span_offset + 1] == b'=' - { - Token { - token_type: TokenType::AsteriskEquals, - span_start, - span_end: span_start + 2, - } - } else { - Token { - token_type: TokenType::Asterisk, - span_start, - span_end: span_start + 1, - } - } - } - b'/' => { - if self.span_offset < (self.compiler.source.len() - 1) - && self.compiler.source[self.span_offset + 1] == b'/' - { - Token { - token_type: TokenType::ForwardSlashForwardSlash, - span_start, - span_end: span_start + 2, - } - } else if self.span_offset < (self.compiler.source.len() - 1) - && self.compiler.source[self.span_offset + 1] == b'=' - { - Token { - token_type: TokenType::ForwardSlashEquals, - span_start, - span_end: span_start + 2, - } - } else { - Token { - token_type: TokenType::ForwardSlash, - span_start, - span_end: span_start + 1, - } - } - } - b'=' => { - if self.span_offset < (self.compiler.source.len() - 1) - && self.compiler.source[self.span_offset + 1] == b'=' - { - Token { - token_type: TokenType::EqualsEquals, - span_start, - span_end: span_start + 2, - } - } else if self.span_offset < (self.compiler.source.len() - 1) - && self.compiler.source[self.span_offset + 1] == b'~' - { - Token { - token_type: TokenType::EqualsTilde, - span_start, - span_end: span_start + 2, - } - } else if self.span_offset < (self.compiler.source.len() - 1) - && self.compiler.source[self.span_offset + 1] == b'>' - { - Token { - token_type: TokenType::ThickArrow, - span_start, - span_end: span_start + 2, - } - } else { - Token { - token_type: TokenType::Equals, - span_start, - span_end: span_start + 1, - } - } - } - b':' => { - if self.span_offset < (self.compiler.source.len() - 1) - && self.compiler.source[self.span_offset + 1] == b':' - { - Token { - token_type: TokenType::ColonColon, - span_start, - span_end: span_start + 2, - } - } else { - Token { - token_type: TokenType::Colon, - span_start, - span_end: span_start + 1, - } - } - } - b'$' => Token { - token_type: TokenType::Dollar, - span_start, - span_end: span_start + 1, - }, - b';' => Token { - token_type: TokenType::Semicolon, - span_start, - span_end: span_start + 1, - }, - b'.' => { - if self.span_offset < (self.compiler.source.len() - 1) - && self.compiler.source[self.span_offset + 1] == b'.' - { - Token { - token_type: TokenType::DotDot, - span_start, - span_end: span_start + 2, - } - } else { - Token { - token_type: TokenType::Dot, - span_start, - span_end: span_start + 1, - } - } - } - b'!' 
=> { - if self.span_offset < (self.compiler.source.len() - 1) - && self.compiler.source[self.span_offset + 1] == b'=' - { - Token { - token_type: TokenType::ExclamationEquals, - span_start, - span_end: span_start + 2, - } - } else if self.span_offset < (self.compiler.source.len() - 1) - && self.compiler.source[self.span_offset + 1] == b'~' - { - Token { - token_type: TokenType::ExclamationTilde, - span_start, - span_end: span_start + 2, - } - } else { - Token { - token_type: TokenType::Exclamation, - span_start, - span_end: span_start + 1, - } - } - } - b'|' => { - if self.span_offset < (self.compiler.source.len() - 1) - && self.compiler.source[self.span_offset + 1] == b'|' - { - Token { - token_type: TokenType::PipePipe, - span_start, - span_end: span_start + 2, - } - } else { - Token { - token_type: TokenType::Pipe, - span_start, - span_end: span_start + 1, - } - } - } - b'&' => { - if self.span_offset < (self.compiler.source.len() - 1) - && self.compiler.source[self.span_offset + 1] == b'&' - { - Token { - token_type: TokenType::AmpersandAmpersand, - span_start, - span_end: span_start + 2, - } - } else { - Token { - token_type: TokenType::Ampersand, - span_start, - span_end: span_start + 1, - } - } - } - b',' => Token { - token_type: TokenType::Comma, - span_start, - span_end: span_start + 1, - }, - b'?' => Token { - token_type: TokenType::QuestionMark, - span_start, - span_end: span_start + 1, - }, - x => { - panic!( - "Internal compiler error: symbol character mismatched in lexer: {}", - x as char - ) - } - }; - - self.span_offset = result.span_end; - - Some(result) - } - - pub fn peek(&mut self) -> Option { - self.peek_bareword(NameStrictness::Strict) } - pub fn peek_bareword(&mut self, name_strictness: NameStrictness) -> Option { - let _span = span!(); - if self.next_token.is_none() { - let prev_offset = self.span_offset; - self.next_token = self.next_bareword(name_strictness); - self.next_offset = self.span_offset; - self.span_offset = prev_offset; + pub fn comma(&mut self) { + if self.is_comma() { + self.tokens.advance(); + } else { + self.error("expected: comma ','"); } - - self.next_token } - #[allow(clippy::should_implement_trait)] - pub fn next(&mut self) -> Option { - self.next_bareword(NameStrictness::Strict) - } - - // Returns next token, or None if end of source has been reached. - // If token is returned, the span_offset is increased. 
- pub fn next_bareword(&mut self, name_strictness: NameStrictness) -> Option { - let _span = span!(); - - if let Some(token) = self.next_token { - self.next_token = None; - self.span_offset = self.next_offset; - Some(token) - } else { - loop { - if self.span_offset >= self.compiler.source.len() { - return None; - } - - let char = self.compiler.source[self.span_offset]; - - if char.is_ascii_digit() { - return self.lex_number(); - } else if char == b'"' || char == b'\'' { - return self.lex_quoted_string(); - } else if char == b'#' { - // Comment - self.skip_comment(); - } else if is_symbol(&self.compiler.source[self.span_offset..]) { - return self.lex_symbol(); - } else if char == b' ' || char == b'\t' { - self.skip_space() - } else if char == b'\r' || char == b'\n' { - return self.newline(); - } else { - return self.lex_bareword(name_strictness); - } - } + pub fn skip_newlines(&mut self) { + while self.is_newline() { + self.tokens.advance(); } } fn get_rollback_point(&self) -> RollbackPoint { - self.compiler.get_rollback_point(self.span_offset) + self.compiler.get_rollback_point(self.tokens.pos()) } fn apply_rollback(&mut self, rbp: RollbackPoint) { - self.span_offset = self.compiler.apply_compiler_rollback(rbp); - self.next_token = None; - self.next_offset = self.span_offset; - } -} - -fn is_symbol(source: &[u8]) -> bool { - let first_byte = source[0]; - if [ - b'+', b'-', b'*', b'/', b'.', b',', b'(', b'[', b'{', b'<', b')', b']', b'}', b'>', b':', - b';', b'=', b'$', b'|', b'!', b'~', b'&', b'\'', b'"', b'?', - ] - .contains(&first_byte) - { - return true; - } - - let redirect_symbols: [&[u8]; 8] = [ - b"o>", b"e>", b"o>>", b"e>>", b"o+e>", b"o+e>>", b"e>|", b"o+e>|", - ]; - for s in redirect_symbols { - if source.starts_with(s) { - return true; - } + let token_pos = self.compiler.apply_compiler_rollback(rbp); + self.tokens.set_pos(token_pos); } - - false } diff --git a/src/snapshots/new_nu_parser__test__lexer.snap b/src/snapshots/new_nu_parser__test__lexer.snap new file mode 100644 index 0000000..87e09ba --- /dev/null +++ b/src/snapshots/new_nu_parser__test__lexer.snap @@ -0,0 +1,21 @@ +--- +source: src/test.rs +expression: evaluate_lexer(path) +input_file: tests/lex/int.nu +snapshot_kind: text +--- +Token 0: Number span: 0 .. 1 '0' +Token 1: Newline span: 1 .. 2 '\n' +Token 2: Number span: 2 .. 4 '00' +Token 3: Newline span: 4 .. 5 '\n' +Token 4: Number span: 5 .. 10 '0x123' +Token 5: Newline span: 10 .. 11 '\n' +Token 6: Number span: 11 .. 16 '0b101' +Token 7: Newline span: 16 .. 17 '\n' +Token 8: Name span: 17 .. 19 '_0' +Token 9: Newline span: 19 .. 20 '\n' +Token 10: Number span: 20 .. 21 '0' +Token 11: Name span: 21 .. 22 '_' +Token 12: Newline span: 22 .. 23 '\n' +Token 13: Name span: 23 .. 24 '_' +Token 14: Newline span: 24 .. 25 '\n' diff --git a/src/snapshots/new_nu_parser__test__lexer@bareword.nu.snap b/src/snapshots/new_nu_parser__test__lexer@bareword.nu.snap new file mode 100644 index 0000000..22baeac --- /dev/null +++ b/src/snapshots/new_nu_parser__test__lexer@bareword.nu.snap @@ -0,0 +1,19 @@ +--- +source: src/test.rs +expression: evaluate_lexer(path) +input_file: tests/lex/bareword.nu +snapshot_kind: text +--- +==== TOKENS ==== +Token3 0: Bareword span: 0 .. 1 '_' +Token3 1: Newline span: 1 .. 2 '\n' +Token3 2: Bareword span: 2 .. 4 '_0' +Token3 3: Newline span: 4 .. 5 '\n' +Token3 4: Bareword span: 5 .. 8 'foo' +Token3 5: Bareword span: 9 .. 12 'bar' +Token3 6: Newline span: 12 .. 13 '\n' +Token3 7: Bareword span: 13 .. 16 'foo' +Token3 8: Dot span: 16 .. 17 '.' 
+Token3 9: Bareword span: 17 .. 20 'bar' +Token3 10: Newline span: 20 .. 21 '\n' +Token3 11: Eof span: 21 .. 21 '' diff --git a/src/snapshots/new_nu_parser__test__lexer@comment.nu.snap b/src/snapshots/new_nu_parser__test__lexer@comment.nu.snap new file mode 100644 index 0000000..a0bb841 --- /dev/null +++ b/src/snapshots/new_nu_parser__test__lexer@comment.nu.snap @@ -0,0 +1,30 @@ +--- +source: src/test.rs +expression: evaluate_lexer(path) +input_file: tests/lex/comment.nu +snapshot_kind: text +--- +==== TOKENS ==== +Token3 0: Comment span: 0 .. 17 '#!/usr/bin/env nu' +Token3 1: Newline span: 17 .. 18 '\n' +Token3 2: Comment span: 18 .. 27 '# comment' +Token3 3: Newline span: 27 .. 28 '\n' +Token3 4: Comment span: 28 .. 36 '#comment' +Token3 5: Newline span: 36 .. 37 '\n' +Token3 6: Comment span: 37 .. 38 '#' +Token3 7: Newline span: 38 .. 39 '\n' +Token3 8: Bareword span: 39 .. 46 'command' +Token3 9: Comment span: 47 .. 56 '# comment' +Token3 10: Newline span: 56 .. 57 '\n' +Token3 11: Bareword span: 57 .. 64 'command' +Token3 12: Comment span: 65 .. 73 '#comment' +Token3 13: Newline span: 73 .. 74 '\n' +Token3 14: Bareword span: 74 .. 86 'command#call' +Token3 15: Newline span: 86 .. 87 '\n' +Token3 16: Bareword span: 87 .. 95 'command#' +Token3 17: Bareword span: 96 .. 100 'call' +Token3 18: Newline span: 100 .. 101 '\n' +Token3 19: Bareword span: 101 .. 108 'command' +Token3 20: Comment span: 109 .. 117 '#comment' +Token3 21: Newline span: 117 .. 118 '\n' +Token3 22: Eof span: 118 .. 118 '' diff --git a/src/snapshots/new_nu_parser__test__lexer@datetime.nu.snap b/src/snapshots/new_nu_parser__test__lexer@datetime.nu.snap new file mode 100644 index 0000000..769c706 --- /dev/null +++ b/src/snapshots/new_nu_parser__test__lexer@datetime.nu.snap @@ -0,0 +1,22 @@ +--- +source: src/test.rs +expression: evaluate_lexer(path) +input_file: tests/lex/datetime.nu +snapshot_kind: text +--- +==== TOKENS ==== +Token3 0: Datetime span: 0 .. 10 '2020-12-20' +Token3 1: Newline span: 10 .. 11 '\n' +Token3 2: Datetime span: 11 .. 30 '2020-12-20T12:23:34' +Token3 3: Newline span: 30 .. 31 '\n' +Token3 4: Datetime span: 31 .. 54 '2020-12-20T12:23:34.456' +Token3 5: Newline span: 54 .. 55 '\n' +Token3 6: Datetime span: 55 .. 79 '2020-12-20T12:23:34.456Z' +Token3 7: Newline span: 79 .. 80 '\n' +Token3 8: Datetime span: 80 .. 109 '2020-12-20T12:23:34.456+02:00' +Token3 9: Newline span: 109 .. 110 '\n' +Token3 10: Datetime span: 110 .. 139 '2020-12-20T12:23:34.456-02:00' +Token3 11: Newline span: 139 .. 140 '\n' +Token3 12: Datetime span: 140 .. 165 '2020-12-20T12:23:34+02:00' +Token3 13: Newline span: 165 .. 166 '\n' +Token3 14: Eof span: 166 .. 166 '' diff --git a/src/snapshots/new_nu_parser__test__lexer@float.nu.snap b/src/snapshots/new_nu_parser__test__lexer@float.nu.snap new file mode 100644 index 0000000..a74d04a --- /dev/null +++ b/src/snapshots/new_nu_parser__test__lexer@float.nu.snap @@ -0,0 +1,45 @@ +--- +source: src/test.rs +expression: evaluate_lexer(path) +input_file: tests/lex/float.nu +snapshot_kind: text +--- +==== TOKENS ==== +Token3 0: Float span: 0 .. 3 '1.0' +Token3 1: Newline span: 3 .. 4 '\n' +Token3 2: Float span: 4 .. 9 '01.10' +Token3 3: Newline span: 9 .. 10 '\n' +Token3 4: Float span: 10 .. 15 '1.0e1' +Token3 5: Newline span: 15 .. 16 '\n' +Token3 6: Float span: 16 .. 22 '1.0e01' +Token3 7: Newline span: 22 .. 23 '\n' +Token3 8: Float span: 23 .. 25 '.2' +Token3 9: Newline span: 25 .. 26 '\n' +Token3 10: Float span: 26 .. 28 '2.' +Token3 11: Newline span: 28 .. 
29 '\n' +Token3 12: Float span: 29 .. 33 '.3e3' +Token3 13: Newline span: 33 .. 34 '\n' +Token3 14: Float span: 34 .. 38 '3.e3' +Token3 15: Newline span: 38 .. 39 '\n' +Token3 16: Float span: 39 .. 45 '.1e-10' +Token3 17: Newline span: 45 .. 46 '\n' +Token3 18: Float span: 46 .. 52 '.2e+20' +Token3 19: Newline span: 52 .. 53 '\n' +Token3 20: Float span: 53 .. 62 '45_67.8_9' +Token3 21: Newline span: 62 .. 63 '\n' +Token3 22: Float span: 63 .. 69 '45_.8_' +Token3 23: Newline span: 69 .. 70 '\n' +Token3 24: Bareword span: 70 .. 71 '_' +Token3 25: Float span: 71 .. 73 '.3' +Token3 26: Newline span: 73 .. 74 '\n' +Token3 27: Bareword span: 74 .. 77 '_44' +Token3 28: Float span: 77 .. 80 '.44' +Token3 29: Newline span: 80 .. 81 '\n' +Token3 30: Float span: 81 .. 83 '5.' +Token3 31: Bareword span: 83 .. 84 '_' +Token3 32: Newline span: 84 .. 85 '\n' +Token3 33: Bareword span: 85 .. 86 '_' +Token3 34: Dot span: 86 .. 87 '.' +Token3 35: Bareword span: 87 .. 88 '_' +Token3 36: Newline span: 88 .. 89 '\n' +Token3 37: Eof span: 89 .. 89 '' diff --git a/src/snapshots/new_nu_parser__test__lexer@int.nu.snap b/src/snapshots/new_nu_parser__test__lexer@int.nu.snap new file mode 100644 index 0000000..c32f497 --- /dev/null +++ b/src/snapshots/new_nu_parser__test__lexer@int.nu.snap @@ -0,0 +1,24 @@ +--- +source: src/test.rs +expression: evaluate_lexer(path) +input_file: tests/lex/int.nu +snapshot_kind: text +--- +==== TOKENS ==== +Token3 0: Int span: 0 .. 1 '0' +Token3 1: Newline span: 1 .. 2 '\n' +Token3 2: Int span: 2 .. 4 '00' +Token3 3: Newline span: 4 .. 5 '\n' +Token3 4: Int span: 5 .. 10 '0x123' +Token3 5: Newline span: 10 .. 11 '\n' +Token3 6: Int span: 11 .. 16 '0b101' +Token3 7: Newline span: 16 .. 17 '\n' +Token3 8: Int span: 17 .. 19 '0_' +Token3 9: Newline span: 19 .. 20 '\n' +Token3 10: Bareword span: 20 .. 23 '0bo' +Token3 11: Newline span: 23 .. 24 '\n' +Token3 12: Bareword span: 24 .. 26 '0x' +Token3 13: Newline span: 26 .. 27 '\n' +Token3 14: Bareword span: 27 .. 32 '01x10' +Token3 15: Newline span: 32 .. 33 '\n' +Token3 16: Eof span: 33 .. 
33 '' diff --git a/src/snapshots/new_nu_parser__test__node_output@calls.nu.snap b/src/snapshots/new_nu_parser__test__node_output@calls.nu.snap index 04a601d..4f4a03e 100644 --- a/src/snapshots/new_nu_parser__test__node_output@calls.nu.snap +++ b/src/snapshots/new_nu_parser__test__node_output@calls.nu.snap @@ -41,9 +41,12 @@ snapshot_kind: text 33: BinaryOp { lhs: NodeId(30), op: NodeId(31), rhs: NodeId(32) } (100 to 110) 34: Int (112 to 113) "3" 35: Call { parts: [NodeId(28), NodeId(29), NodeId(33), NodeId(34)] } (95 to 113) -36: Block(BlockId(1)) (0 to 114) +36: Name (115 to 128) "foo/bar/spam +" +37: Call { parts: [NodeId(36)] } (127 to 127) +38: Block(BlockId(1)) (0 to 128) ==== SCOPE ==== -0: Frame Scope, node_id: NodeId(36) +0: Frame Scope, node_id: NodeId(38) decls: [ existing: NodeId(8) ] 1: Frame Scope, node_id: NodeId(26) variables: [ a: NodeId(9), b: NodeId(13), c: NodeId(17) ] @@ -84,4 +87,6 @@ snapshot_kind: text 33: string 34: int 35: any -36: any +36: unknown +37: stream +38: stream diff --git a/src/snapshots/new_nu_parser__test__node_output@variable_names.nu.snap b/src/snapshots/new_nu_parser__test__node_output@variable_names.nu.snap new file mode 100644 index 0000000..ecd843c --- /dev/null +++ b/src/snapshots/new_nu_parser__test__node_output@variable_names.nu.snap @@ -0,0 +1,14 @@ +--- +source: src/test.rs +expression: evaluate_example(path) +input_file: tests/variable_names.nu +snapshot_kind: text +--- +==== COMPILER ==== +0: Variable (0 to 4) "$abc" +1: Variable (5 to 7) "$_" +2: Variable (8 to 12) "$a_c" +3: Garbage (14 to 17) +4: Block(BlockId(0)) (0 to 18) +==== COMPILER ERRORS ==== +Error (NodeId 3): variable name must be a bareword diff --git a/src/test.rs b/src/test.rs index fe5e0b4..18df3cf 100644 --- a/src/test.rs +++ b/src/test.rs @@ -1,6 +1,8 @@ +use crate::lexer::lex; use crate::resolver::Resolver; use crate::typechecker::Typechecker; use crate::{compiler::Compiler, parser::Parser}; + use std::path::Path; fn evaluate_example(fname: &Path) -> String { @@ -10,7 +12,14 @@ fn evaluate_example(fname: &Path) -> String { let span_offset = compiler.span_offset(); compiler.add_file(&fname.to_string_lossy(), &contents); - let parser = Parser::new(compiler, span_offset); + let (tokens, err) = lex(&contents, span_offset); + if let Err(e) = err { + tokens.eprint(&contents); + eprintln!("Lexing error. Error: {:?}", e); + std::process::exit(1); + } + + let parser = Parser::new(compiler, tokens); compiler = parser.parse(); let mut result = compiler.display_state(); @@ -38,9 +47,33 @@ fn evaluate_example(fname: &Path) -> String { result } +fn evaluate_lexer(fname: &Path) -> String { + let contents = std::fs::read(fname); + + let Ok(contents) = contents else { + panic!("Lexer: can't find file {}", fname.to_string_lossy()); + }; + + let (tokens, err) = lex(&contents, 0); + let mut res = tokens.display(&contents); + + if let Err(e) = err { + res.push_str(&format!("Lexing error. 
Error: {:?}", e)); + } + + res +} + #[test] fn test_node_output() { insta::glob!("../tests", "*.nu", |path| { insta::assert_snapshot!(evaluate_example(path)); }); } + +#[test] +fn test_lexer() { + insta::glob!("../tests/lex", "*.nu", |path| { + insta::assert_snapshot!(evaluate_lexer(path)); + }); +} diff --git a/src/token.rs b/src/token.rs deleted file mode 100644 index 35ecab7..0000000 --- a/src/token.rs +++ /dev/null @@ -1,64 +0,0 @@ -#[derive(Clone, Copy, Debug)] -pub enum TokenType { - Number, - Comma, - Caret, - String, - Dollar, - Dot, - DotDot, - Name, - Pipe, - PipePipe, - Colon, - ColonColon, - Semicolon, - Plus, - PlusPlus, - PlusEquals, - Dash, - DashEquals, - Exclamation, - Asterisk, - AsteriskAsterisk, - AsteriskEquals, - ForwardSlash, - ForwardSlashForwardSlash, - ForwardSlashEquals, - Equals, - EqualsEquals, - EqualsTilde, - ExclamationTilde, - ExclamationEquals, - LParen, - LSquare, - LCurly, - LessThan, - LessThanEqual, - RParen, - RSquare, - RCurly, - GreaterThan, - GreaterThanEqual, - Ampersand, - AmpersandAmpersand, - QuestionMark, - ThinArrow, - ThickArrow, - Newline, - ErrGreaterThanPipe, - OutErrGreaterThanPipe, - OutGreaterThan, - OutGreaterGreaterThan, - ErrGreaterThan, - ErrGreaterGreaterThan, - OutErrGreaterThan, - OutErrGreaterGreaterThan, -} - -#[derive(Clone, Copy, Debug)] -pub struct Token { - pub token_type: TokenType, - pub span_start: usize, - pub span_end: usize, -} diff --git a/tests/calls.nu b/tests/calls.nu index 5655c67..568b735 100644 --- a/tests/calls.nu +++ b/tests/calls.nu @@ -2,3 +2,5 @@ spam foo "bar" (1 + 2) def existing [a: string, b: string, c: int] { [ $a, $b, $c] } existing foo ("ba" + "r") 3 + +foo/bar/spam diff --git a/tests/lex/bareword.nu b/tests/lex/bareword.nu new file mode 100644 index 0000000..595fbdc --- /dev/null +++ b/tests/lex/bareword.nu @@ -0,0 +1,4 @@ +_ +_0 +foo bar +foo.bar diff --git a/tests/lex/comment.nu b/tests/lex/comment.nu new file mode 100644 index 0000000..7b5fe1b --- /dev/null +++ b/tests/lex/comment.nu @@ -0,0 +1,9 @@ +#!/usr/bin/env nu +# comment +#comment +# +command # comment +command #comment +command#call +command# call +command #comment diff --git a/tests/lex/datetime.nu b/tests/lex/datetime.nu new file mode 100644 index 0000000..0d817d0 --- /dev/null +++ b/tests/lex/datetime.nu @@ -0,0 +1,7 @@ +2020-12-20 +2020-12-20T12:23:34 +2020-12-20T12:23:34.456 +2020-12-20T12:23:34.456Z +2020-12-20T12:23:34.456+02:00 +2020-12-20T12:23:34.456-02:00 +2020-12-20T12:23:34+02:00 diff --git a/tests/lex/float.nu b/tests/lex/float.nu new file mode 100644 index 0000000..0c78018 --- /dev/null +++ b/tests/lex/float.nu @@ -0,0 +1,16 @@ +1.0 +01.10 +1.0e1 +1.0e01 +.2 +2. +.3e3 +3.e3 +.1e-10 +.2e+20 +45_67.8_9 +45_.8_ +_.3 +_44.44 +5._ +_._ diff --git a/tests/lex/int.nu b/tests/lex/int.nu new file mode 100644 index 0000000..b323bc7 --- /dev/null +++ b/tests/lex/int.nu @@ -0,0 +1,8 @@ +0 +00 +0x123 +0b101 +0_ +0bo +0x +01x10 diff --git a/tests/variable_names.nu b/tests/variable_names.nu new file mode 100644 index 0000000..bd3c717 --- /dev/null +++ b/tests/variable_names.nu @@ -0,0 +1,4 @@ +$abc +$_ +$a_c +$234
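
---
Appendix (illustrative sketch, not part of the patch above): the hunks replace the parser's pull-based `peek()`/`next()` lexing with a pre-lexed token stream. The sketch below drives that stream directly, using only API that is visible in this diff — `lex(source, span_offset)` returning `(tokens, err)`, `Tokens::peek`/`advance`, `Token::Eof`, and `span.start`/`span.end`. It assumes `Token` is exported alongside `Tokens`, that `Span` has public `start`/`end` fields (as `error()` uses them), and that `Token` is `Copy + PartialEq` (as `token != Token::Eof` implies); the function name `dump_tokens` is a hypothetical placeholder, and the real crate may differ in these details.

use new_nu_parser::lexer::{lex, Token};

fn dump_tokens(source: &[u8]) {
    // lex() returns the complete token stream plus a Result carrying any
    // lexing error, the same shape evaluate_lexer() and the benchmarks consume.
    let (mut tokens, err) = lex(source, 0);
    if let Err(e) = err {
        eprintln!("Lexing error: {:?}", e);
    }
    loop {
        // peek() yields the current (token, span) pair without consuming it;
        // the parser's rewritten error() method uses this same pattern.
        let (token, span) = tokens.peek();
        println!("{:?} at {}..{}", token, span.start, span.end);
        if token == Token::Eof {
            // Eof is an explicit trailing token, as the lexer snapshots show.
            break;
        }
        tokens.advance();
    }
}

For the listing format the snapshot files record, the crate's own `tokens.display(&contents)` (used by `evaluate_lexer` in src/test.rs) renders the same output, and `tokens.eprint(&contents)` dumps it to stderr on lexing failure.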