From 1f0168cc78aa5e6a033b18062baad01f2532d8a2 Mon Sep 17 00:00:00 2001
From: 86xsk <200443667+86xsk@users.noreply.github.com>
Date: Mon, 26 Jan 2026 22:24:20 -0600
Subject: [PATCH 1/4] refactor(core): use more appropriate return type

`lex_catch()` is infallible. By extension, functions that use it as a
fallback are also infallible. As such, returning an option is
misleading and results in callers having to implement unnecessary
error handling.

BREAKING CHANGE: Changes the return type of `lex_weir_token`,
`lex_english_token`, and `lex_catch`. They now return `FoundToken`
instead of `Option<FoundToken>`. Also removes the unused `_source`
argument from `lex_catch`.
---
 harper-core/src/lexing/mod.rs            | 84 ++++++++++++------------
 harper-core/src/parsers/plain_english.rs | 19 +++---
 harper-core/src/weir/parsing/mod.rs      | 17 ++---
 3 files changed, 56 insertions(+), 64 deletions(-)

diff --git a/harper-core/src/lexing/mod.rs b/harper-core/src/lexing/mod.rs
index 2189f067a..ad1556bb0 100644
--- a/harper-core/src/lexing/mod.rs
+++ b/harper-core/src/lexing/mod.rs
@@ -19,7 +19,7 @@ pub struct FoundToken {
     pub token: TokenKind,
 }
 
-pub fn lex_weir_token(source: &[char]) -> Option<FoundToken> {
+pub fn lex_weir_token(source: &[char]) -> FoundToken {
     let lexers = [
         lex_punctuation,
         lex_tabs,
@@ -33,19 +33,18 @@ pub fn lex_weir_token(source: &[char]) -> Option<FoundToken> {
         lex_email_address,
         lex_hostname_token,
         lex_word,
-        lex_catch,
     ];
 
     for lexer in lexers {
         if let Some(f) = lexer(source) {
-            return Some(f);
+            return f;
         }
     }
 
-    None
+    lex_catch()
 }
 
-pub fn lex_english_token(source: &[char]) -> Option<FoundToken> {
+pub fn lex_english_token(source: &[char]) -> FoundToken {
     let lexers = [
         lex_regexish,
         lex_punctuation,
@@ -60,16 +59,15 @@ pub fn lex_english_token(source: &[char]) -> Option<FoundToken> {
         lex_email_address,
         lex_hostname_token,
         lex_word,
-        lex_catch,
     ];
 
     for lexer in lexers {
         if let Some(f) = lexer(source) {
-            return Some(f);
+            return f;
         }
     }
 
-    None
+    lex_catch()
 }
 
 fn lex_word(source: &[char]) -> Option<FoundToken> {
@@ -333,11 +331,11 @@ fn lex_quote(source: &[char]) -> Option<FoundToken> {
 }
 
 /// Covers cases not covered by the other lints.
-fn lex_catch(_source: &[char]) -> Option<FoundToken> {
-    Some(FoundToken {
+fn lex_catch() -> FoundToken {
+    FoundToken {
         next_index: 1,
         token: TokenKind::Unlintable,
-    })
+    }
 }
 
 #[cfg(test)]
 mod tests {
@@ -461,10 +459,10 @@ mod tests {
         let source: Vec<_> = "youtube.com".chars().collect();
         assert_eq!(
             lex_english_token(&source),
-            Some(FoundToken {
+            FoundToken {
                 token: TokenKind::Hostname,
                 next_index: source.len()
-            })
+            }
         );
     }
@@ -473,10 +471,10 @@ mod tests {
         let source: Vec<_> = "[]".chars().collect();
         assert!(!matches!(
             lex_english_token(&source),
-            Some(FoundToken {
+            FoundToken {
                 token: TokenKind::Regexish,
                 next_index: 2
-            })
+            }
         ))
     }
@@ -485,10 +483,10 @@ mod tests {
         let source: Vec<_> = "[a]".chars().collect();
         assert_eq!(
             lex_english_token(&source),
-            Some(FoundToken {
+            FoundToken {
                 token: TokenKind::Regexish,
                 next_index: 3
-            })
+            }
         );
     }
@@ -497,10 +495,10 @@ mod tests {
         let source: Vec<_> = "[az]".chars().collect();
         assert_eq!(
             lex_english_token(&source),
-            Some(FoundToken {
+            FoundToken {
                 token: TokenKind::Regexish,
                 next_index: 4
-            })
+            }
         );
     }
@@ -509,10 +507,10 @@ mod tests {
         let source: Vec<_> = "[123]".chars().collect();
         assert_eq!(
             lex_english_token(&source),
-            Some(FoundToken {
+            FoundToken {
                 token: TokenKind::Regexish,
                 next_index: 5
-            })
+            }
         );
     }
@@ -521,10 +519,10 @@ mod tests {
         let source: Vec<_> = "[a0b1c2]".chars().collect();
         assert_eq!(
             lex_english_token(&source),
-            Some(FoundToken {
+            FoundToken {
                 token: TokenKind::Regexish,
                 next_index: 8
-            })
+            }
         );
     }
@@ -533,10 +531,10 @@ mod tests {
         let source: Vec<_> = "[a-z]".chars().collect();
         assert_eq!(
             lex_english_token(&source),
-            Some(FoundToken {
+            FoundToken {
                 token: TokenKind::Regexish,
                 next_index: 5
-            })
+            }
         );
     }
@@ -545,10 +543,10 @@ mod tests {
         let source: Vec<_> = "[ax-z]".chars().collect();
         assert_eq!(
             lex_english_token(&source),
-            Some(FoundToken {
+            FoundToken {
                 token: TokenKind::Regexish,
                 next_index: 6
-            })
+            }
         );
     }
@@ -557,10 +555,10 @@ mod tests {
         let source: Vec<_> = "[a-cz]".chars().collect();
         assert_eq!(
             lex_english_token(&source),
-            Some(FoundToken {
+            FoundToken {
                 token: TokenKind::Regexish,
                 next_index: 6
-            })
+            }
         );
     }
@@ -569,10 +567,10 @@ mod tests {
         let source: Vec<_> = "[a-cx-z]".chars().collect();
         assert_eq!(
             lex_english_token(&source),
-            Some(FoundToken {
+            FoundToken {
                 token: TokenKind::Regexish,
                 next_index: 8
-            })
+            }
         );
     }
@@ -582,10 +580,10 @@ mod tests {
         let source: Vec<_> = "[a-x-z]".chars().collect();
         assert_eq!(
             lex_english_token(&source),
-            Some(FoundToken {
+            FoundToken {
                 token: TokenKind::Punctuation(Punctuation::OpenSquare),
                 next_index: 1
-            })
+            }
         );
     }
@@ -594,10 +592,10 @@ mod tests {
         let source: Vec<_> = "[a-]".chars().collect();
         assert!(!matches!(
             lex_english_token(&source),
-            Some(FoundToken {
+            FoundToken {
                 token: TokenKind::Regexish,
                 ..
-            })
+            }
         ));
     }
@@ -606,10 +604,10 @@ mod tests {
         let source: Vec<_> = "[-z]".chars().collect();
         assert!(!matches!(
             lex_english_token(&source),
-            Some(FoundToken {
+            FoundToken {
                 token: TokenKind::Regexish,
                 ..
-            })
+            }
         ));
     }
@@ -814,10 +812,10 @@ mod tests {
         let source: Vec<_> = "late 1980s".chars().collect();
         assert!(matches!(
             lex_english_token(&source),
-            Some(FoundToken {
+            FoundToken {
                 token: TokenKind::Word(_),
                 ..
-            })
+            }
         ));
     }
@@ -826,10 +824,10 @@ mod tests {
         let source: Vec<_> = "1980s and".chars().collect();
         assert!(matches!(
             lex_english_token(&source),
-            Some(FoundToken {
+            FoundToken {
                 token: TokenKind::Decade,
                 ..
-            })
+            }
         ));
     }
@@ -901,7 +899,7 @@ mod tests {
                 break; // Exit if we've processed the entire source
             }
 
-            let token = lex_english_token(&sentence[next_index..]).expect("Failed to lex token");
+            let token = lex_english_token(&sentence[next_index..]);
             assert_eq!(token.token, *expected_token);
             next_index += token.next_index;
         }
@@ -927,7 +925,7 @@ mod tests {
                 break; // Exit if we've processed the entire source
             }
 
-            let token = lex_english_token(&sentence[next_index..]).expect("Failed to lex token");
+            let token = lex_english_token(&sentence[next_index..]);
 
             if i < 6 {
                 assert_eq!(token.token, *expected_token);
diff --git a/harper-core/src/parsers/plain_english.rs b/harper-core/src/parsers/plain_english.rs
index f36f15b59..fe0a7503a 100644
--- a/harper-core/src/parsers/plain_english.rs
+++ b/harper-core/src/parsers/plain_english.rs
@@ -9,9 +9,8 @@ pub struct PlainEnglish;
 
 impl Parser for PlainEnglish {
     fn parse(&self, source: &[char]) -> Vec<Token> {
-        let mut cursor = 0;
-
         // Lex tokens
+        let mut cursor = 0;
         let mut tokens = Vec::new();
 
         loop {
@@ -19,15 +18,13 @@ impl Parser for PlainEnglish {
                 return tokens;
             }
 
-            if let Some(FoundToken { token, next_index }) = lex_english_token(&source[cursor..]) {
-                tokens.push(Token {
-                    span: Span::new(cursor, cursor + next_index),
-                    kind: token,
-                });
-                cursor += next_index;
-            } else {
-                panic!()
-            }
+            let FoundToken { token, next_index } = lex_english_token(&source[cursor..]);
+
+            tokens.push(Token {
+                span: Span::new(cursor, cursor + next_index),
+                kind: token,
+            });
+            cursor += next_index;
         }
     }
 }
diff --git a/harper-core/src/weir/parsing/mod.rs b/harper-core/src/weir/parsing/mod.rs
index fb63b0a06..c72224133 100644
--- a/harper-core/src/weir/parsing/mod.rs
+++ b/harper-core/src/weir/parsing/mod.rs
@@ -18,7 +18,6 @@ use super::{
 /// Lex the entirety of a Weir document.
 fn lex(source: &[char]) -> Vec<Token> {
     let mut cursor = 0;
-
     let mut tokens = Vec::new();
 
     loop {
@@ -26,15 +25,13 @@ fn lex(source: &[char]) -> Vec<Token> {
             return tokens;
         }
 
-        if let Some(FoundToken { token, next_index }) = lex_weir_token(&source[cursor..]) {
-            tokens.push(Token {
-                span: Span::new(cursor, cursor + next_index),
-                kind: token,
-            });
-            cursor += next_index;
-        } else {
-            panic!()
-        }
+        let FoundToken { token, next_index } = lex_weir_token(&source[cursor..]);
+
+        tokens.push(Token {
+            span: Span::new(cursor, cursor + next_index),
+            kind: token,
+        });
+        cursor += next_index;
     }
 }
 

From 9b2cc6934b507fb50a8a39ed32657ec47d330cd5 Mon Sep 17 00:00:00 2001
From: 86xsk <200443667+86xsk@users.noreply.github.com>
Date: Mon, 26 Jan 2026 22:36:08 -0600
Subject: [PATCH 2/4] refactor(core): replace loops with iterators

---
 harper-core/src/lexing/mod.rs | 30 ++++++++++--------------------
 1 file changed, 10 insertions(+), 20 deletions(-)

diff --git a/harper-core/src/lexing/mod.rs b/harper-core/src/lexing/mod.rs
index ad1556bb0..7540d1884 100644
--- a/harper-core/src/lexing/mod.rs
+++ b/harper-core/src/lexing/mod.rs
@@ -20,7 +20,7 @@ pub struct FoundToken {
 }
 
 pub fn lex_weir_token(source: &[char]) -> FoundToken {
-    let lexers = [
+    [
         lex_punctuation,
         lex_tabs,
         lex_spaces,
@@ -33,19 +33,14 @@ pub fn lex_weir_token(source: &[char]) -> FoundToken {
         lex_email_address,
         lex_hostname_token,
         lex_word,
-    ];
-
-    for lexer in lexers {
-        if let Some(f) = lexer(source) {
-            return f;
-        }
-    }
-
-    lex_catch()
+    ]
+    .into_iter()
+    .find_map(|lexer| lexer(source))
+    .unwrap_or_else(lex_catch)
 }
 
 pub fn lex_english_token(source: &[char]) -> FoundToken {
-    let lexers = [
+    [
         lex_regexish,
         lex_punctuation,
         lex_tabs,
@@ -59,15 +54,10 @@ pub fn lex_english_token(source: &[char]) -> FoundToken {
         lex_email_address,
         lex_hostname_token,
         lex_word,
-    ];
-
-    for lexer in lexers {
-        if let Some(f) = lexer(source) {
-            return f;
-        }
-    }
-
-    lex_catch()
+    ]
+    .into_iter()
+    .find_map(|lexer| lexer(source))
+    .unwrap_or_else(lex_catch)
 }
 
 fn lex_word(source: &[char]) -> Option<FoundToken> {

From f1e21d432621114321d0f1a92e4bf5af226ef203 Mon Sep 17 00:00:00 2001
From: 86xsk <200443667+86xsk@users.noreply.github.com>
Date: Tue, 27 Jan 2026 15:38:02 -0600
Subject: [PATCH 3/4] refactor(core): dedupe similar lexing functions

---
 harper-core/src/lexing/mod.rs            | 21 ++++++++++++++++++++-
 harper-core/src/parsers/plain_english.rs | 22 +++-------------------
 harper-core/src/weir/parsing/mod.rs      | 21 +++------------------
 3 files changed, 26 insertions(+), 38 deletions(-)

diff --git a/harper-core/src/lexing/mod.rs b/harper-core/src/lexing/mod.rs
index 7540d1884..f11d98130 100644
--- a/harper-core/src/lexing/mod.rs
+++ b/harper-core/src/lexing/mod.rs
@@ -9,7 +9,7 @@ use url::lex_url;
 use self::email_address::lex_email_address;
 use crate::char_ext::CharExt;
 use crate::punctuation::{Punctuation, Quote};
-use crate::{Number, TokenKind};
+use crate::{Number, Span, Token, TokenKind};
 
 #[derive(Debug, Eq, PartialEq)]
 pub struct FoundToken {
@@ -19,6 +19,25 @@ pub struct FoundToken {
     pub token: TokenKind,
 }
 
+pub fn lex_with(source: &[char], lex_fn: fn(&[char]) -> FoundToken) -> Vec<Token> {
+    let mut cursor = 0;
+    let mut tokens = Vec::new();
+
+    loop {
+        if cursor >= source.len() {
+            return tokens;
+        }
+
+        let FoundToken { token, next_index } = lex_fn(&source[cursor..]);
+
+        tokens.push(Token {
+            span: Span::new(cursor, cursor + next_index),
+            kind: token,
+        });
+        cursor += next_index;
+    }
+}
+
 pub fn lex_weir_token(source: &[char]) -> FoundToken {
     [
         lex_punctuation,
diff --git a/harper-core/src/parsers/plain_english.rs b/harper-core/src/parsers/plain_english.rs
index fe0a7503a..fce1b40d1 100644
--- a/harper-core/src/parsers/plain_english.rs
+++ b/harper-core/src/parsers/plain_english.rs
@@ -1,6 +1,6 @@
 use super::Parser;
-use crate::lexing::{FoundToken, lex_english_token};
-use crate::{Span, Token};
+use crate::Token;
+use crate::lexing::{lex_english_token, lex_with};
 
 /// A parser that will attempt to lex as many tokens as possible,
 /// without discrimination and until the end of input.
@@ -9,22 +9,6 @@ pub struct PlainEnglish;
 
 impl Parser for PlainEnglish {
     fn parse(&self, source: &[char]) -> Vec<Token> {
-        // Lex tokens
-        let mut cursor = 0;
-        let mut tokens = Vec::new();
-
-        loop {
-            if cursor >= source.len() {
-                return tokens;
-            }
-
-            let FoundToken { token, next_index } = lex_english_token(&source[cursor..]);
-
-            tokens.push(Token {
-                span: Span::new(cursor, cursor + next_index),
-                kind: token,
-            });
-            cursor += next_index;
-        }
+        lex_with(source, lex_english_token)
     }
 }
diff --git a/harper-core/src/weir/parsing/mod.rs b/harper-core/src/weir/parsing/mod.rs
index c72224133..2be8b9bd7 100644
--- a/harper-core/src/weir/parsing/mod.rs
+++ b/harper-core/src/weir/parsing/mod.rs
@@ -7,8 +7,8 @@ use ast::{Ast, AstExprNode, AstStmtNode};
 pub use expr::parse_expr_str;
 pub use stmt::parse_str;
 
-use crate::lexing::{FoundToken, lex_weir_token};
-use crate::{Span, Token, TokenKind};
+use crate::lexing::{lex_weir_token, lex_with};
+use crate::{Token, TokenKind};
 
 use super::{
     ast,
@@ -17,22 +17,7 @@ use super::{
 
 /// Lex the entirety of a Weir document.
 fn lex(source: &[char]) -> Vec<Token> {
-    let mut cursor = 0;
-    let mut tokens = Vec::new();
-
-    loop {
-        if cursor >= source.len() {
-            return tokens;
-        }
-
-        let FoundToken { token, next_index } = lex_weir_token(&source[cursor..]);
-
-        tokens.push(Token {
-            span: Span::new(cursor, cursor + next_index),
-            kind: token,
-        });
-        cursor += next_index;
-    }
+    lex_with(source, lex_weir_token)
 }
 
 #[derive(Debug)]

From f226f5c20a0d36a465debd5dd0f94600c2fd602e Mon Sep 17 00:00:00 2001
From: 86xsk <200443667+86xsk@users.noreply.github.com>
Date: Tue, 27 Jan 2026 15:45:59 -0600
Subject: [PATCH 4/4] docs(core): documentation for `lex_with()`

---
 harper-core/src/lexing/mod.rs | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/harper-core/src/lexing/mod.rs b/harper-core/src/lexing/mod.rs
index f11d98130..0bb093149 100644
--- a/harper-core/src/lexing/mod.rs
+++ b/harper-core/src/lexing/mod.rs
@@ -19,6 +19,10 @@ pub struct FoundToken {
     pub token: TokenKind,
 }
 
+/// Lex `source` with the provided `lex_fn`.
+///
+/// `lex_fn` should be a function that takes a subslice of the source, and returns the first found
+/// token.
 pub fn lex_with(source: &[char], lex_fn: fn(&[char]) -> FoundToken) -> Vec<Token> {
     let mut cursor = 0;
     let mut tokens = Vec::new();
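
-- 
A usage sketch, illustrative rather than part of the patches: how the
reworked API reads from a unit test inside harper-core. The test name
and the sample string are assumptions for illustration; the imported
items (`lex_with`, `lex_english_token`, `FoundToken`, `TokenKind`)
follow the `use` lines in the diffs above.

    use crate::TokenKind;
    use crate::lexing::{FoundToken, lex_english_token, lex_with};

    #[test]
    fn lexes_infallibly() {
        let source: Vec<char> = "Hello world".chars().collect();

        // After patch 1 there is no `Option` to unwrap: `lex_catch`
        // guarantees a token (at worst `TokenKind::Unlintable`) for any
        // non-empty input.
        let FoundToken { token, next_index } = lex_english_token(&source);
        assert!(matches!(token, TokenKind::Word(_)));
        assert!(next_index > 0);

        // After patch 3, `lex_with` drives a token function across the
        // whole input, accumulating the lexed `Token`s with their spans.
        let tokens = lex_with(&source, lex_english_token);
        assert!(!tokens.is_empty());
    }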