
Commit 75c9c47

Merge pull request #13 from mkpro118/fix-scanner-bugs
Coalesce comment tokens; tighten spans; add line/col conversion
2 parents a8b47c7 + 43f3f5f

File tree

2 files changed: +334 -7

src/core/scanner/lexer.rs
src/core/scanner/tokens.rs

src/core/scanner/lexer.rs

Lines changed: 222 additions & 6 deletions
@@ -433,17 +433,35 @@ impl Lexer {
         };
         };
 
+        macro_rules! coalesce_comments {
+            ($comment_type:ident, $comment_variants:ident) => {
+                if let Some(ref mut last) = self.$comment_type {
+                    if let (
+                        TokenType::$comment_variants(prev),
+                        TokenType::$comment_variants(curr),
+                    ) = (last.r#type(), token.r#type())
+                    {
+                        // Combine the comment text
+                        let combined = format!("{}\n{}", prev, curr);
+                        *last = Token::new(
+                            TokenType::$comment_variants(combined),
+                            (&last.span().start).into(),
+                            (&token.span().end).into(),
+                        );
+                    }
+                } else {
+                    self.$comment_type = Some(token);
+                }
+            };
+        }
+
         // Handle comment coalescing
         match token.r#type() {
             TokenType::Comment(_) => {
-                // Replace any existing buffered comment (coalescing consecutive comments)
-                self.last_comment = Some(token);
-                // Continue to next token
+                coalesce_comments!(last_comment, Comment);
             }
             TokenType::DocComment(_) => {
-                // Replace any existing buffered doc comment (coalescing consecutive doc comments)
-                self.last_doc_comment = Some(token);
-                // Continue to next token
+                coalesce_comments!(last_doc_comment, DocComment);
            }
            _ => {
                 // Non-comment token found - flush buffered comments and queue this token
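
To make the hunk above easier to read in isolation: `coalesce_comments!` folds a run of same-kind comment tokens into a single token whose text is newline-joined and whose span runs from the first token's start to the last token's end. Below is a minimal, self-contained sketch of that rule using simplified stand-in `Token`/`TokenType` types (not this crate's real ones, which carry richer spans and variants):

```rust
// Stand-in types for illustration only; the real lexer's Token/TokenType
// live in src/core/scanner/tokens.rs and are richer than this.
#[derive(Debug, Clone, PartialEq)]
enum TokenType {
    Comment(String),
    #[allow(dead_code)]
    Identifier(String),
}

#[derive(Debug, Clone, PartialEq)]
struct Token {
    r#type: TokenType,
    start: (u32, u32), // (line, column)
    end: (u32, u32),
}

/// Merge `curr` into the buffered comment, mirroring `coalesce_comments!`:
/// texts join with '\n' and the span stretches to the newest token's end.
fn coalesce(buffer: &mut Option<Token>, curr: Token) {
    match buffer {
        Some(last) => {
            if let (TokenType::Comment(prev), TokenType::Comment(next)) =
                (&last.r#type, &curr.r#type)
            {
                let combined = format!("{prev}\n{next}");
                last.r#type = TokenType::Comment(combined);
                last.end = curr.end;
            }
            // Non-comment pairs are ignored here; the real lexer flushes
            // its buffers when a non-comment token arrives.
        }
        None => *buffer = Some(curr),
    }
}

fn main() {
    let mut buf = None;
    coalesce(&mut buf, Token {
        r#type: TokenType::Comment(" First comment".into()),
        start: (1, 1),
        end: (1, 17),
    });
    coalesce(&mut buf, Token {
        r#type: TokenType::Comment(" Second comment".into()),
        start: (2, 1),
        end: (2, 18),
    });

    let merged = buf.unwrap();
    assert_eq!(
        merged.r#type,
        TokenType::Comment(" First comment\n Second comment".into())
    );
    assert_eq!((merged.start, merged.end), ((1, 1), (2, 18)));
}
```

The tests added in the next hunk exercise exactly this behavior through the real lexer.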
@@ -1669,4 +1687,202 @@ mod tests {
             assert_eq!(*token.r#type(), TokenType::RightBrace);
         }
     }
+
+    #[test]
+    fn comment_coalescing_adjacent_regular_comments() {
+        // Test that adjacent regular comments are coalesced
+        let input = "// First comment\n// Second comment\nidentifier";
+        let mut lexer = Lexer::default_for_input(input);
+
+        let first_token = lexer.next_token().unwrap().unwrap();
+        if let TokenType::Comment(content) = first_token.r#type() {
+            assert!(content.contains(" First comment"));
+            assert!(content.contains(" Second comment"));
+            // Should be joined with newline
+            assert!(content.contains('\n'));
+        } else {
+            panic!(
+                "Expected coalesced comment token, got: {:?}",
+                first_token.r#type()
+            );
+        }
+
+        let second_token = lexer.next_token().unwrap().unwrap();
+        assert_eq!(
+            *second_token.r#type(),
+            TokenType::Identifier("identifier".to_string())
+        );
+    }
+
+    #[test]
+    fn comment_coalescing_adjacent_doc_comments() {
+        // Test that adjacent doc comments are coalesced
+        let input = "/// First doc\n/// Second doc\nfn test() {}";
+        let mut lexer = Lexer::default_for_input(input);
+
+        let first_token = lexer.next_token().unwrap().unwrap();
+        if let TokenType::DocComment(content) = first_token.r#type() {
+            assert!(content.contains(" First doc"));
+            assert!(content.contains(" Second doc"));
+            // Should be joined with newline
+            assert!(content.contains('\n'));
+        } else {
+            panic!(
+                "Expected coalesced doc comment token, got: {:?}",
+                first_token.r#type()
+            );
+        }
+    }
+
+    #[test]
+    fn comment_coalescing_mixed_comment_types_separate() {
+        // Test that regular comments and doc comments don't coalesce together
+        let input = "// Regular comment\n/// Doc comment\nidentifier";
+        let mut lexer = Lexer::default_for_input(input);
+
+        let first_token = lexer.next_token().unwrap().unwrap();
+        if let TokenType::Comment(content) = first_token.r#type() {
+            assert!(content.contains(" Regular comment"));
+            assert!(!content.contains(" Doc comment"));
+        } else {
+            panic!(
+                "Expected regular comment token, got: {:?}",
+                first_token.r#type()
+            );
+        }
+
+        let second_token = lexer.next_token().unwrap().unwrap();
+        if let TokenType::DocComment(content) = second_token.r#type() {
+            assert!(content.contains(" Doc comment"));
+            assert!(!content.contains(" Regular comment"));
+        } else {
+            panic!(
+                "Expected doc comment token, got: {:?}",
+                second_token.r#type()
+            );
+        }
+
+        let third_token = lexer.next_token().unwrap().unwrap();
+        assert_eq!(
+            *third_token.r#type(),
+            TokenType::Identifier("identifier".to_string())
+        );
+    }
+
+    #[test]
+    fn comment_coalescing_with_intervening_code() {
+        // Test that comments separated by code don't coalesce
+        let input = "// First comment\nlet x = 1;\n// Second comment";
+        let mut lexer = Lexer::default_for_input(input);
+
+        let first_token = lexer.next_token().unwrap().unwrap();
+        if let TokenType::Comment(content) = first_token.r#type() {
+            assert!(content.contains(" First comment"));
+            assert!(!content.contains(" Second comment"));
+        } else {
+            panic!(
+                "Expected first comment token, got: {:?}",
+                first_token.r#type()
+            );
+        }
+
+        // Should get the 'let' identifier next
+        let second_token = lexer.next_token().unwrap().unwrap();
+        assert_eq!(
+            *second_token.r#type(),
+            TokenType::Identifier("let".to_string())
+        );
+    }
+
+    #[test]
+    fn comment_coalescing_multiple_adjacent() {
+        // Test coalescing of more than two comments
+        let input = "// First\n// Second\n// Third\ncode";
+        let mut lexer = Lexer::default_for_input(input);
+
+        let comment_token = lexer.next_token().unwrap().unwrap();
+        if let TokenType::Comment(content) = comment_token.r#type() {
+            assert!(content.contains(" First"));
+            assert!(content.contains(" Second"));
+            assert!(content.contains(" Third"));
+            // Should have two newlines joining three comments
+            let newline_count = content.matches('\n').count();
+            assert_eq!(newline_count, 2);
+        } else {
+            panic!(
+                "Expected coalesced comment token, got: {:?}",
+                comment_token.r#type()
+            );
+        }
+    }
+
+    #[test]
+    fn comment_coalescing_spans_updated() {
+        // Test that coalesced comment spans cover the entire range
+        let input = "// Start comment\n// End comment\n";
+        let mut lexer = Lexer::default_for_input(input);
+
+        let comment_token = lexer.next_token().unwrap().unwrap();
+        let span = comment_token.span();
+
+        // Should start at line 1, column 1 and end after the second comment
+        assert_eq!(span.start.line, 1);
+        assert_eq!(span.start.column, 1);
+        assert_eq!(span.end.line, 2);
+        // End column should be after "// End comment"
+        assert!(span.end.column > 10);
+    }
+
+    #[test]
+    fn comment_coalescing_empty_comments() {
+        // Test edge case with empty comments
+        let input = "//\n// Non-empty\ncode";
+        let mut lexer = Lexer::default_for_input(input);
+
+        let comment_token = lexer.next_token().unwrap().unwrap();
+        if let TokenType::Comment(content) = comment_token.r#type() {
+            // Should coalesce empty comment with non-empty one
+            assert!(content.contains("\n Non-empty"));
+        } else {
+            panic!("Expected comment token, got: {:?}", comment_token.r#type());
+        }
+    }
+
+    #[test]
+    fn comment_coalescing_flush_on_eof() {
+        // Test that buffered comments are flushed at end of input
+        let input = "// Only comment";
+        let mut lexer = Lexer::default_for_input(input);
+
+        let comment_token = lexer.next_token().unwrap().unwrap();
+        assert_eq!(
+            *comment_token.r#type(),
+            TokenType::Comment(" Only comment".to_string())
+        );
+
+        let eof_token = lexer.next_token().unwrap().unwrap();
+        assert_eq!(*eof_token.r#type(), TokenType::EOF);
+    }
+
+    #[test]
+    fn comment_coalescing_doc_comments_multiple() {
+        // Test multiple doc comment coalescing
+        let input = "/// Doc 1\n/// Doc 2\n/// Doc 3\nstruct Test;";
+        let mut lexer = Lexer::default_for_input(input);
+
+        let doc_comment_token = lexer.next_token().unwrap().unwrap();
+        if let TokenType::DocComment(content) = doc_comment_token.r#type() {
+            assert!(content.contains(" Doc 1"));
+            assert!(content.contains(" Doc 2"));
+            assert!(content.contains(" Doc 3"));
+            // Should have two newlines for three doc comments
+            let newline_count = content.matches('\n').count();
+            assert_eq!(newline_count, 2);
+        } else {
+            panic!(
+                "Expected coalesced doc comment token, got: {:?}",
+                doc_comment_token.r#type()
+            );
+        }
+    }
 }

src/core/scanner/tokens.rs

Lines changed: 112 additions & 1 deletion
@@ -204,6 +204,12 @@ pub struct SymbolLocation {
     pub column: u32,
 }
 
+impl From<&SymbolLocation> for (u32, u32) {
+    fn from(value: &SymbolLocation) -> Self {
+        (value.line, value.column)
+    }
+}
+
 /// Record a half-open range covered by a token.
 ///
 /// Combine a start and end location. Only ordering is guaranteed.
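
The lexer hunk above builds spans via `(&last.span().start).into()`; this `From<&SymbolLocation>` impl is what lets that call feed `Token::new`'s `(u32, u32)` parameters. A self-contained sketch of the conversion in use, with the `SymbolLocation` shape mirrored from this diff (illustration only, not the crate itself):

```rust
// SymbolLocation mirrored from the diff; illustration only.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct SymbolLocation {
    pub line: u32,
    pub column: u32,
}

impl From<&SymbolLocation> for (u32, u32) {
    fn from(value: &SymbolLocation) -> Self {
        (value.line, value.column)
    }
}

fn main() {
    let loc = SymbolLocation { line: 3, column: 7 };
    // Borrowing instead of consuming: a span endpoint can be converted
    // while the original location stays usable.
    let pair: (u32, u32) = (&loc).into();
    assert_eq!(pair, (3, 7));
    assert_eq!(loc.line, 3); // still accessible after the conversion
}
```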
@@ -263,7 +269,7 @@ impl Token {
     #[must_use]
     pub fn new(r#type: TokenType, start: (u32, u32), end: (u32, u32)) -> Self {
         assert!(
-            start.0 <= end.0 && start.1 <= end.1,
+            start.0 < end.0 || (start.0 == end.0 && start.1 <= end.1),
             "span should be monotonically increasing"
         );
         Self {
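
The one-line assertion change above replaces a component-wise check with a lexicographic one. The old predicate required `start.1 <= end.1` even when the lines differed, so a valid multi-line token ending at a smaller column (say line 1, col 20 through line 2, col 5) tripped the assertion. A standalone sketch of the two predicates (just the boolean logic, not the crate's API):

```rust
// Old check: compares line and column independently.
fn old_check(start: (u32, u32), end: (u32, u32)) -> bool {
    start.0 <= end.0 && start.1 <= end.1
}

// New check: lexicographic; columns only matter on the same line.
fn new_check(start: (u32, u32), end: (u32, u32)) -> bool {
    start.0 < end.0 || (start.0 == end.0 && start.1 <= end.1)
}

fn main() {
    // A token from line 1, column 20 through line 2, column 5:
    let (start, end) = ((1u32, 20u32), (2u32, 5u32));
    assert!(!old_check(start, end)); // old predicate wrongly rejects it
    assert!(new_check(start, end)); // new predicate accepts it

    // Rust tuples already order lexicographically, so the new predicate
    // is equivalent to a plain comparison:
    assert_eq!(new_check(start, end), start <= end);
}
```

The `token_new_lexicographic_span_different_lines` test in the next hunk exercises exactly this case.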
@@ -386,4 +392,109 @@ mod tests {
         // end before start should panic
         let _ = Token::new(TokenType::At, (2, 1), (1, 1));
     }
+
+    #[test]
+    fn symbol_location_from_reference_conversion() {
+        // Test the new From<&SymbolLocation> for (u32, u32) implementation
+        let location = SymbolLocation {
+            line: 5,
+            column: 10,
+        };
+        let tuple: (u32, u32) = (&location).into();
+        assert_eq!(tuple, (5, 10));
+
+        // Test with zero values
+        let location_zero = SymbolLocation { line: 0, column: 0 };
+        let tuple_zero: (u32, u32) = (&location_zero).into();
+        assert_eq!(tuple_zero, (0, 0));
+
+        // Test with maximum values
+        let location_max = SymbolLocation {
+            line: u32::MAX,
+            column: u32::MAX,
+        };
+        let tuple_max: (u32, u32) = (&location_max).into();
+        assert_eq!(tuple_max, (u32::MAX, u32::MAX));
+    }
+
+    #[test]
+    fn token_new_lexicographic_span_ordering() {
+        // Test the updated span monotonicity check with lexicographic ordering
+
+        // Same line: start.column <= end.column should work
+        let start = SymbolLocation { line: 1, column: 5 };
+        let end = SymbolLocation {
+            line: 1,
+            column: 10,
+        };
+        let token = Token::new(
+            TokenType::Identifier("test".to_string()),
+            (start.line, start.column),
+            (end.line, end.column),
+        );
+        assert_eq!(token.span().start, start);
+        assert_eq!(token.span().end, end);
+    }
+
+    #[test]
+    fn token_new_lexicographic_span_same_position() {
+        // Same line, same column should be valid
+        let location = SymbolLocation { line: 1, column: 5 };
+        let token = Token::new(
+            TokenType::Identifier("x".to_string()),
+            (location.line, location.column),
+            (location.line, location.column),
+        );
+        assert_eq!(token.span().start, location);
+        assert_eq!(token.span().end, location);
+    }
+
+    #[test]
+    fn token_new_lexicographic_span_different_lines() {
+        // Different lines: start.line < end.line should work regardless of columns
+        let start = SymbolLocation {
+            line: 1,
+            column: 20,
+        };
+        let end = SymbolLocation { line: 2, column: 5 };
+        let token = Token::new(
+            TokenType::Literal("multiline".to_string()),
+            (start.line, start.column),
+            (end.line, end.column),
+        );
+        assert_eq!(token.span().start, start);
+        assert_eq!(token.span().end, end);
+    }
+
+    #[test]
+    #[should_panic(expected = "span should be monotonically increasing")]
+    fn token_new_lexicographic_span_invalid_same_line() {
+        // Same line but end column before start column should panic
+        let start = SymbolLocation {
+            line: 1,
+            column: 10,
+        };
+        let end = SymbolLocation { line: 1, column: 5 };
+        let _ = Token::new(
+            TokenType::Identifier("invalid".to_string()),
+            (start.line, start.column),
+            (end.line, end.column),
+        );
+    }
+
+    #[test]
+    #[should_panic(expected = "span should be monotonically increasing")]
+    fn token_new_lexicographic_span_invalid_reverse_lines() {
+        // End line before start line should panic
+        let start = SymbolLocation { line: 2, column: 5 };
+        let end = SymbolLocation {
+            line: 1,
+            column: 10,
+        };
+        let _ = Token::new(
+            TokenType::Identifier("invalid".to_string()),
+            (start.line, start.column),
+            (end.line, end.column),
+        );
+    }
 }
