131 changes: 71 additions & 60 deletions harper-core/src/lexing/mod.rs
@@ -9,7 +9,7 @@ use url::lex_url;
use self::email_address::lex_email_address;
use crate::char_ext::CharExt;
use crate::punctuation::{Punctuation, Quote};
use crate::{Number, TokenKind};
use crate::{Number, Span, Token, TokenKind};

#[derive(Debug, Eq, PartialEq)]
pub struct FoundToken {
@@ -19,8 +19,31 @@ pub struct FoundToken {
pub token: TokenKind,
}

pub fn lex_weir_token(source: &[char]) -> Option<FoundToken> {
let lexers = [
/// Lex `source` with the provided `lex_fn`.
///
/// `lex_fn` should be a function that takes a subslice of the source and returns the first token
/// found there.
pub fn lex_with(source: &[char], lex_fn: fn(&[char]) -> FoundToken) -> Vec<Token> {
let mut cursor = 0;
let mut tokens = Vec::new();

loop {
if cursor >= source.len() {
return tokens;
}

let FoundToken { token, next_index } = lex_fn(&source[cursor..]);

tokens.push(Token {
span: Span::new(cursor, cursor + next_index),
kind: token,
});
cursor += next_index;
}
}
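
A minimal sketch (not part of this diff) of driving `lex_with` directly; the sample text and the assertion are illustrative assumptions, not taken from the crate's tests:

#[cfg(test)]
mod lex_with_sketch {
    use super::{lex_english_token, lex_with};

    #[test]
    fn spans_tile_the_source() {
        // Illustrative input; any plain text works here.
        let source: Vec<char> = "Hello, world!".chars().collect();
        let tokens = lex_with(&source, lex_english_token);

        // Each token starts where the previous one ended, and the final
        // token ends at the end of the input.
        let mut cursor = 0;
        for token in &tokens {
            assert_eq!(token.span.start, cursor);
            cursor = token.span.end;
        }
        assert_eq!(cursor, source.len());
    }
}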

pub fn lex_weir_token(source: &[char]) -> FoundToken {
[
lex_punctuation,
lex_tabs,
lex_spaces,
@@ -33,20 +56,14 @@ pub fn lex_weir_token(source: &[char]) -> Option<FoundToken> {
lex_email_address,
lex_hostname_token,
lex_word,
lex_catch,
];

for lexer in lexers {
if let Some(f) = lexer(source) {
return Some(f);
}
}

None
]
.into_iter()
.find_map(|lexer| lexer(source))
.unwrap_or_else(lex_catch)
}

pub fn lex_english_token(source: &[char]) -> Option<FoundToken> {
let lexers = [
pub fn lex_english_token(source: &[char]) -> FoundToken {
[
lex_regexish,
lex_punctuation,
lex_tabs,
@@ -60,16 +77,10 @@ pub fn lex_english_token(source: &[char]) -> Option<FoundToken> {
lex_email_address,
lex_hostname_token,
lex_word,
lex_catch,
];

for lexer in lexers {
if let Some(f) = lexer(source) {
return Some(f);
}
}

None
]
.into_iter()
.find_map(|lexer| lexer(source))
.unwrap_or_else(lex_catch)
}

fn lex_word(source: &[char]) -> Option<FoundToken> {
@@ -333,11 +344,11 @@ fn lex_quote(source: &[char]) -> Option<FoundToken> {
}

/// Covers cases not covered by the other lexers.
fn lex_catch(_source: &[char]) -> Option<FoundToken> {
Some(FoundToken {
fn lex_catch() -> FoundToken {
FoundToken {
next_index: 1,
token: TokenKind::Unlintable,
})
}
}
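
The catch-all above makes the top-level lexers total: any character no other lexer claims becomes a one-character `Unlintable` token, so lexing always makes progress. A hedged sketch of that behaviour (not part of this diff; the choice of U+0007 as an "unhandled" character is an assumption):

#[cfg(test)]
mod catch_all_sketch {
    use super::lex_english_token;
    use crate::TokenKind;

    #[test]
    fn unknown_character_becomes_unlintable() {
        // Assumes no other lexer handles the BEL control character.
        let source: Vec<char> = "\u{7}".chars().collect();
        let found = lex_english_token(&source);

        assert_eq!(found.token, TokenKind::Unlintable);
        assert_eq!(found.next_index, 1);
    }
}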

#[cfg(test)]
@@ -461,10 +472,10 @@ mod tests {
let source: Vec<_> = "youtube.com".chars().collect();
assert_eq!(
lex_english_token(&source),
Some(FoundToken {
FoundToken {
token: TokenKind::Hostname,
next_index: source.len()
})
}
);
}

@@ -473,10 +484,10 @@ mod tests {
let source: Vec<_> = "[]".chars().collect();
assert!(!matches!(
lex_english_token(&source),
Some(FoundToken {
FoundToken {
token: TokenKind::Regexish,
next_index: 2
})
}
))
}

@@ -485,10 +496,10 @@ mod tests {
let source: Vec<_> = "[a]".chars().collect();
assert_eq!(
lex_english_token(&source),
Some(FoundToken {
FoundToken {
token: TokenKind::Regexish,
next_index: 3
})
}
);
}

@@ -497,10 +508,10 @@ mod tests {
let source: Vec<_> = "[az]".chars().collect();
assert_eq!(
lex_english_token(&source),
Some(FoundToken {
FoundToken {
token: TokenKind::Regexish,
next_index: 4
})
}
);
}

@@ -509,10 +520,10 @@ mod tests {
let source: Vec<_> = "[123]".chars().collect();
assert_eq!(
lex_english_token(&source),
Some(FoundToken {
FoundToken {
token: TokenKind::Regexish,
next_index: 5
})
}
);
}

@@ -521,10 +532,10 @@ mod tests {
let source: Vec<_> = "[a0b1c2]".chars().collect();
assert_eq!(
lex_english_token(&source),
Some(FoundToken {
FoundToken {
token: TokenKind::Regexish,
next_index: 8
})
}
);
}

@@ -533,10 +544,10 @@ mod tests {
let source: Vec<_> = "[a-z]".chars().collect();
assert_eq!(
lex_english_token(&source),
Some(FoundToken {
FoundToken {
token: TokenKind::Regexish,
next_index: 5
})
}
);
}

@@ -545,10 +556,10 @@ mod tests {
let source: Vec<_> = "[ax-z]".chars().collect();
assert_eq!(
lex_english_token(&source),
Some(FoundToken {
FoundToken {
token: TokenKind::Regexish,
next_index: 6
})
}
);
}

@@ -557,10 +568,10 @@ mod tests {
let source: Vec<_> = "[a-cz]".chars().collect();
assert_eq!(
lex_english_token(&source),
Some(FoundToken {
FoundToken {
token: TokenKind::Regexish,
next_index: 6
})
}
);
}

@@ -569,10 +580,10 @@ mod tests {
let source: Vec<_> = "[a-cx-z]".chars().collect();
assert_eq!(
lex_english_token(&source),
Some(FoundToken {
FoundToken {
token: TokenKind::Regexish,
next_index: 8
})
}
);
}

@@ -582,10 +593,10 @@ mod tests {
let source: Vec<_> = "[a-x-z]".chars().collect();
assert_eq!(
lex_english_token(&source),
Some(FoundToken {
FoundToken {
token: TokenKind::Punctuation(Punctuation::OpenSquare),
next_index: 1
})
}
);
}

@@ -594,10 +605,10 @@ mod tests {
let source: Vec<_> = "[a-]".chars().collect();
assert!(!matches!(
lex_english_token(&source),
Some(FoundToken {
FoundToken {
token: TokenKind::Regexish,
..
})
}
));
}

@@ -606,10 +617,10 @@ mod tests {
let source: Vec<_> = "[-z]".chars().collect();
assert!(!matches!(
lex_english_token(&source),
Some(FoundToken {
FoundToken {
token: TokenKind::Regexish,
..
})
}
));
}

@@ -814,10 +825,10 @@ mod tests {
let source: Vec<_> = "late 1980s".chars().collect();
assert!(matches!(
lex_english_token(&source),
Some(FoundToken {
FoundToken {
token: TokenKind::Word(_),
..
})
}
));
}

@@ -826,10 +837,10 @@ mod tests {
let source: Vec<_> = "1980s and".chars().collect();
assert!(matches!(
lex_english_token(&source),
Some(FoundToken {
FoundToken {
token: TokenKind::Decade,
..
})
}
));
}

@@ -901,7 +912,7 @@ mod tests {
break; // Exit if we've processed the entire source
}

let token = lex_english_token(&sentence[next_index..]).expect("Failed to lex token");
let token = lex_english_token(&sentence[next_index..]);
assert_eq!(token.token, *expected_token);
next_index += token.next_index;
}
@@ -927,7 +938,7 @@ mod tests {
break; // Exit if we've processed the entire source
}

let token = lex_english_token(&sentence[next_index..]).expect("Failed to lex token");
let token = lex_english_token(&sentence[next_index..]);

if i < 6 {
assert_eq!(token.token, *expected_token);
25 changes: 3 additions & 22 deletions harper-core/src/parsers/plain_english.rs
@@ -1,6 +1,6 @@
use super::Parser;
use crate::lexing::{FoundToken, lex_english_token};
use crate::{Span, Token};
use crate::Token;
use crate::lexing::{lex_english_token, lex_with};

/// A parser that will attempt to lex as many tokens as possible,
/// without discrimination and until the end of input.
@@ -9,25 +9,6 @@ pub struct PlainEnglish;

impl Parser for PlainEnglish {
fn parse(&self, source: &[char]) -> Vec<Token> {
let mut cursor = 0;

// Lex tokens
let mut tokens = Vec::new();

loop {
if cursor >= source.len() {
return tokens;
}

if let Some(FoundToken { token, next_index }) = lex_english_token(&source[cursor..]) {
tokens.push(Token {
span: Span::new(cursor, cursor + next_index),
kind: token,
});
cursor += next_index;
} else {
panic!()
}
}
lex_with(source, lex_english_token)
}
}
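
A brief usage sketch for context (not part of this diff; the module path and sample text are assumptions based on the imports above):

#[cfg(test)]
mod usage_sketch {
    use super::PlainEnglish;
    use crate::parsers::Parser;

    #[test]
    fn parses_plain_text_into_tokens() {
        // Any plain-text input works; this sentence is only illustrative.
        let source: Vec<char> = "The quick brown fox.".chars().collect();
        let tokens = PlainEnglish.parse(&source);

        // The parser lexes until the end of input, so the final span ends
        // exactly at the source length.
        assert_eq!(tokens.last().map(|t| t.span.end), Some(source.len()));
    }
}
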
24 changes: 3 additions & 21 deletions harper-core/src/weir/parsing/mod.rs
@@ -7,8 +7,8 @@ use ast::{Ast, AstExprNode, AstStmtNode};
pub use expr::parse_expr_str;
pub use stmt::parse_str;

use crate::lexing::{FoundToken, lex_weir_token};
use crate::{Span, Token, TokenKind};
use crate::lexing::{lex_weir_token, lex_with};
use crate::{Token, TokenKind};

use super::{
ast,
@@ -17,25 +17,7 @@ use super::{

/// Lex the entirety of a Weir document.
fn lex(source: &[char]) -> Vec<Token> {
let mut cursor = 0;

let mut tokens = Vec::new();

loop {
if cursor >= source.len() {
return tokens;
}

if let Some(FoundToken { token, next_index }) = lex_weir_token(&source[cursor..]) {
tokens.push(Token {
span: Span::new(cursor, cursor + next_index),
kind: token,
});
cursor += next_index;
} else {
panic!()
}
}
lex_with(source, lex_weir_token)
}

#[derive(Debug)]
Expand Down