
Commit 75c9c47

Merge pull request #13 from mkpro118/fix-scanner-bugs
Coalesce comment tokens; tighten spans; add line/col conversion
2 parents a8b47c7 + 43f3f5f

File tree

2 files changed: +334 -7

src/core/scanner/lexer.rs
src/core/scanner/tokens.rs

src/core/scanner/lexer.rs

Lines changed: 222 additions & 6 deletions
@@ -433,17 +433,35 @@ impl Lexer {
         };
         };
 
+        macro_rules! coalesce_comments {
+            ($comment_type:ident, $comment_variants:ident) => {
+                if let Some(ref mut last) = self.$comment_type {
+                    if let (
+                        TokenType::$comment_variants(prev),
+                        TokenType::$comment_variants(curr),
+                    ) = (last.r#type(), token.r#type())
+                    {
+                        // Combine the comment text
+                        let combined = format!("{}\n{}", prev, curr);
+                        *last = Token::new(
+                            TokenType::$comment_variants(combined),
+                            (&last.span().start).into(),
+                            (&token.span().end).into(),
+                        );
+                    }
+                } else {
+                    self.$comment_type = Some(token);
+                }
+            };
+        }
+
         // Handle comment coalescing
         match token.r#type() {
             TokenType::Comment(_) => {
-                // Replace any existing buffered comment (coalescing consecutive comments)
-                self.last_comment = Some(token);
-                // Continue to next token
+                coalesce_comments!(last_comment, Comment);
             }
             TokenType::DocComment(_) => {
-                // Replace any existing buffered doc comment (coalescing consecutive doc comments)
-                self.last_doc_comment = Some(token);
-                // Continue to next token
+                coalesce_comments!(last_doc_comment, DocComment);
            }
            _ => {
                 // Non-comment token found - flush buffered comments and queue this token
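
To make the hunk above easier to read in isolation: `coalesce_comments!` folds a run of same-kind comment tokens into a single token whose text is newline-joined and whose span runs from the first token's start to the last token's end. Below is a minimal, self-contained sketch of that rule using simplified stand-in `Token`/`TokenType` types (not this crate's real ones, which carry richer spans and variants):

```rust
// Stand-in types for illustration only; the real lexer's Token/TokenType
// live in src/core/scanner/tokens.rs and are richer than this.
#[derive(Debug, Clone, PartialEq)]
enum TokenType {
    Comment(String),
    #[allow(dead_code)]
    Identifier(String),
}

#[derive(Debug, Clone, PartialEq)]
struct Token {
    r#type: TokenType,
    start: (u32, u32), // (line, column)
    end: (u32, u32),
}

/// Merge `curr` into the buffered comment, mirroring `coalesce_comments!`:
/// texts join with '\n' and the span stretches to the newest token's end.
fn coalesce(buffer: &mut Option<Token>, curr: Token) {
    match buffer {
        Some(last) => {
            if let (TokenType::Comment(prev), TokenType::Comment(next)) =
                (&last.r#type, &curr.r#type)
            {
                let combined = format!("{prev}\n{next}");
                last.r#type = TokenType::Comment(combined);
                last.end = curr.end;
            }
            // Non-comment pairs are ignored here; the real lexer flushes
            // its buffers when a non-comment token arrives.
        }
        None => *buffer = Some(curr),
    }
}

fn main() {
    let mut buf = None;
    coalesce(&mut buf, Token {
        r#type: TokenType::Comment(" First comment".into()),
        start: (1, 1),
        end: (1, 17),
    });
    coalesce(&mut buf, Token {
        r#type: TokenType::Comment(" Second comment".into()),
        start: (2, 1),
        end: (2, 18),
    });

    let merged = buf.unwrap();
    assert_eq!(
        merged.r#type,
        TokenType::Comment(" First comment\n Second comment".into())
    );
    assert_eq!((merged.start, merged.end), ((1, 1), (2, 18)));
}
```

The tests added in the next hunk exercise exactly this behavior through the real lexer.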
@@ -1669,4 +1687,202 @@ mod tests {
             assert_eq!(*token.r#type(), TokenType::RightBrace);
         }
     }
+
+    #[test]
+    fn comment_coalescing_adjacent_regular_comments() {
+        // Test that adjacent regular comments are coalesced
+        let input = "// First comment\n// Second comment\nidentifier";
+        let mut lexer = Lexer::default_for_input(input);
+
+        let first_token = lexer.next_token().unwrap().unwrap();
+        if let TokenType::Comment(content) = first_token.r#type() {
+            assert!(content.contains(" First comment"));
+            assert!(content.contains(" Second comment"));
+            // Should be joined with newline
+            assert!(content.contains('\n'));
+        } else {
+            panic!(
+                "Expected coalesced comment token, got: {:?}",
+                first_token.r#type()
+            );
+        }
+
+        let second_token = lexer.next_token().unwrap().unwrap();
+        assert_eq!(
+            *second_token.r#type(),
+            TokenType::Identifier("identifier".to_string())
+        );
+    }
+
+    #[test]
+    fn comment_coalescing_adjacent_doc_comments() {
+        // Test that adjacent doc comments are coalesced
+        let input = "/// First doc\n/// Second doc\nfn test() {}";
+        let mut lexer = Lexer::default_for_input(input);
+
+        let first_token = lexer.next_token().unwrap().unwrap();
+        if let TokenType::DocComment(content) = first_token.r#type() {
+            assert!(content.contains(" First doc"));
+            assert!(content.contains(" Second doc"));
+            // Should be joined with newline
+            assert!(content.contains('\n'));
+        } else {
+            panic!(
+                "Expected coalesced doc comment token, got: {:?}",
+                first_token.r#type()
+            );
+        }
+    }
+
+    #[test]
+    fn comment_coalescing_mixed_comment_types_separate() {
+        // Test that regular comments and doc comments don't coalesce together
+        let input = "// Regular comment\n/// Doc comment\nidentifier";
+        let mut lexer = Lexer::default_for_input(input);
+
+        let first_token = lexer.next_token().unwrap().unwrap();
+        if let TokenType::Comment(content) = first_token.r#type() {
+            assert!(content.contains(" Regular comment"));
+            assert!(!content.contains(" Doc comment"));
+        } else {
+            panic!(
+                "Expected regular comment token, got: {:?}",
+                first_token.r#type()
+            );
+        }
+
+        let second_token = lexer.next_token().unwrap().unwrap();
+        if let TokenType::DocComment(content) = second_token.r#type() {
+            assert!(content.contains(" Doc comment"));
+            assert!(!content.contains(" Regular comment"));
+        } else {
+            panic!(
+                "Expected doc comment token, got: {:?}",
+                second_token.r#type()
+            );
+        }
+
+        let third_token = lexer.next_token().unwrap().unwrap();
+        assert_eq!(
+            *third_token.r#type(),
+            TokenType::Identifier("identifier".to_string())
+        );
+    }
+
+    #[test]
+    fn comment_coalescing_with_intervening_code() {
+        // Test that comments separated by code don't coalesce
+        let input = "// First comment\nlet x = 1;\n// Second comment";
+        let mut lexer = Lexer::default_for_input(input);
+
+        let first_token = lexer.next_token().unwrap().unwrap();
+        if let TokenType::Comment(content) = first_token.r#type() {
+            assert!(content.contains(" First comment"));
+            assert!(!content.contains(" Second comment"));
+        } else {
+            panic!(
+                "Expected first comment token, got: {:?}",
+                first_token.r#type()
+            );
+        }
+
+        // Should get the 'let' identifier next
+        let second_token = lexer.next_token().unwrap().unwrap();
+        assert_eq!(
+            *second_token.r#type(),
+            TokenType::Identifier("let".to_string())
+        );
+    }
+
+    #[test]
+    fn comment_coalescing_multiple_adjacent() {
+        // Test coalescing of more than two comments
+        let input = "// First\n// Second\n// Third\ncode";
+        let mut lexer = Lexer::default_for_input(input);
+
+        let comment_token = lexer.next_token().unwrap().unwrap();
+        if let TokenType::Comment(content) = comment_token.r#type() {
+            assert!(content.contains(" First"));
+            assert!(content.contains(" Second"));
+            assert!(content.contains(" Third"));
+            // Should have two newlines joining three comments
+            let newline_count = content.matches('\n').count();
+            assert_eq!(newline_count, 2);
+        } else {
+            panic!(
+                "Expected coalesced comment token, got: {:?}",
+                comment_token.r#type()
+            );
+        }
+    }
+
+    #[test]
+    fn comment_coalescing_spans_updated() {
+        // Test that coalesced comment spans cover the entire range
+        let input = "// Start comment\n// End comment\n";
+        let mut lexer = Lexer::default_for_input(input);
+
+        let comment_token = lexer.next_token().unwrap().unwrap();
+        let span = comment_token.span();
+
+        // Should start at line 1, column 1 and end after the second comment
+        assert_eq!(span.start.line, 1);
+        assert_eq!(span.start.column, 1);
+        assert_eq!(span.end.line, 2);
+        // End column should be after "// End comment"
+        assert!(span.end.column > 10);
+    }
+
+    #[test]
+    fn comment_coalescing_empty_comments() {
+        // Test edge case with empty comments
+        let input = "//\n// Non-empty\ncode";
+        let mut lexer = Lexer::default_for_input(input);
+
+        let comment_token = lexer.next_token().unwrap().unwrap();
+        if let TokenType::Comment(content) = comment_token.r#type() {
+            // Should coalesce empty comment with non-empty one
+            assert!(content.contains("\n Non-empty"));
+        } else {
+            panic!("Expected comment token, got: {:?}", comment_token.r#type());
+        }
+    }
+
+    #[test]
+    fn comment_coalescing_flush_on_eof() {
+        // Test that buffered comments are flushed at end of input
+        let input = "// Only comment";
+        let mut lexer = Lexer::default_for_input(input);
+
+        let comment_token = lexer.next_token().unwrap().unwrap();
+        assert_eq!(
+            *comment_token.r#type(),
+            TokenType::Comment(" Only comment".to_string())
+        );
+
+        let eof_token = lexer.next_token().unwrap().unwrap();
+        assert_eq!(*eof_token.r#type(), TokenType::EOF);
+    }
+
+    #[test]
+    fn comment_coalescing_doc_comments_multiple() {
+        // Test multiple doc comment coalescing
+        let input = "/// Doc 1\n/// Doc 2\n/// Doc 3\nstruct Test;";
+        let mut lexer = Lexer::default_for_input(input);
+
+        let doc_comment_token = lexer.next_token().unwrap().unwrap();
+        if let TokenType::DocComment(content) = doc_comment_token.r#type() {
+            assert!(content.contains(" Doc 1"));
+            assert!(content.contains(" Doc 2"));
+            assert!(content.contains(" Doc 3"));
+            // Should have two newlines for three doc comments
+            let newline_count = content.matches('\n').count();
+            assert_eq!(newline_count, 2);
+        } else {
+            panic!(
+                "Expected coalesced doc comment token, got: {:?}",
+                doc_comment_token.r#type()
+            );
+        }
+    }
 }

src/core/scanner/tokens.rs

Lines changed: 112 additions & 1 deletion
@@ -204,6 +204,12 @@ pub struct SymbolLocation {
     pub column: u32,
 }
 
+impl From<&SymbolLocation> for (u32, u32) {
+    fn from(value: &SymbolLocation) -> Self {
+        (value.line, value.column)
+    }
+}
+
 /// Record a half-open range covered by a token.
 ///
 /// Combine a start and end location. Only ordering is guaranteed.
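
The lexer hunk above builds spans via `(&last.span().start).into()`; this `From<&SymbolLocation>` impl is what lets that call feed `Token::new`'s `(u32, u32)` parameters. A self-contained sketch of the conversion in use, with the `SymbolLocation` shape mirrored from this diff (illustration only, not the crate itself):

```rust
// SymbolLocation mirrored from the diff; illustration only.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct SymbolLocation {
    pub line: u32,
    pub column: u32,
}

impl From<&SymbolLocation> for (u32, u32) {
    fn from(value: &SymbolLocation) -> Self {
        (value.line, value.column)
    }
}

fn main() {
    let loc = SymbolLocation { line: 3, column: 7 };
    // Borrowing instead of consuming: a span endpoint can be converted
    // while the original location stays usable.
    let pair: (u32, u32) = (&loc).into();
    assert_eq!(pair, (3, 7));
    assert_eq!(loc.line, 3); // still accessible after the conversion
}
```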
@@ -263,7 +269,7 @@ impl Token {
     #[must_use]
     pub fn new(r#type: TokenType, start: (u32, u32), end: (u32, u32)) -> Self {
         assert!(
-            start.0 <= end.0 && start.1 <= end.1,
+            start.0 < end.0 || (start.0 == end.0 && start.1 <= end.1),
             "span should be monotonically increasing"
         );
         Self {
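
The one-line assertion change above replaces a component-wise check with a lexicographic one. The old predicate required `start.1 <= end.1` even when the lines differed, so a valid multi-line token ending at a smaller column (say line 1, col 20 through line 2, col 5) tripped the assertion. A standalone sketch of the two predicates (just the boolean logic, not the crate's API):

```rust
// Old check: compares line and column independently.
fn old_check(start: (u32, u32), end: (u32, u32)) -> bool {
    start.0 <= end.0 && start.1 <= end.1
}

// New check: lexicographic; columns only matter on the same line.
fn new_check(start: (u32, u32), end: (u32, u32)) -> bool {
    start.0 < end.0 || (start.0 == end.0 && start.1 <= end.1)
}

fn main() {
    // A token from line 1, column 20 through line 2, column 5:
    let (start, end) = ((1u32, 20u32), (2u32, 5u32));
    assert!(!old_check(start, end)); // old predicate wrongly rejects it
    assert!(new_check(start, end)); // new predicate accepts it

    // Rust tuples already order lexicographically, so the new predicate
    // is equivalent to a plain comparison:
    assert_eq!(new_check(start, end), start <= end);
}
```

The `token_new_lexicographic_span_different_lines` test in the next hunk exercises exactly this case.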
@@ -386,4 +392,109 @@ mod tests {
         // end before start should panic
         let _ = Token::new(TokenType::At, (2, 1), (1, 1));
     }
+
+    #[test]
+    fn symbol_location_from_reference_conversion() {
+        // Test the new From<&SymbolLocation> for (u32, u32) implementation
+        let location = SymbolLocation {
+            line: 5,
+            column: 10,
+        };
+        let tuple: (u32, u32) = (&location).into();
+        assert_eq!(tuple, (5, 10));
+
+        // Test with zero values
+        let location_zero = SymbolLocation { line: 0, column: 0 };
+        let tuple_zero: (u32, u32) = (&location_zero).into();
+        assert_eq!(tuple_zero, (0, 0));
+
+        // Test with maximum values
+        let location_max = SymbolLocation {
+            line: u32::MAX,
+            column: u32::MAX,
+        };
+        let tuple_max: (u32, u32) = (&location_max).into();
+        assert_eq!(tuple_max, (u32::MAX, u32::MAX));
+    }
+
+    #[test]
+    fn token_new_lexicographic_span_ordering() {
+        // Test the updated span monotonicity check with lexicographic ordering
+
+        // Same line: start.column <= end.column should work
+        let start = SymbolLocation { line: 1, column: 5 };
+        let end = SymbolLocation {
+            line: 1,
+            column: 10,
+        };
+        let token = Token::new(
+            TokenType::Identifier("test".to_string()),
+            (start.line, start.column),
+            (end.line, end.column),
+        );
+        assert_eq!(token.span().start, start);
+        assert_eq!(token.span().end, end);
+    }
+
+    #[test]
+    fn token_new_lexicographic_span_same_position() {
+        // Same line, same column should be valid
+        let location = SymbolLocation { line: 1, column: 5 };
+        let token = Token::new(
+            TokenType::Identifier("x".to_string()),
+            (location.line, location.column),
+            (location.line, location.column),
+        );
+        assert_eq!(token.span().start, location);
+        assert_eq!(token.span().end, location);
+    }
+
+    #[test]
+    fn token_new_lexicographic_span_different_lines() {
+        // Different lines: start.line < end.line should work regardless of columns
+        let start = SymbolLocation {
+            line: 1,
+            column: 20,
+        };
+        let end = SymbolLocation { line: 2, column: 5 };
+        let token = Token::new(
+            TokenType::Literal("multiline".to_string()),
+            (start.line, start.column),
+            (end.line, end.column),
+        );
+        assert_eq!(token.span().start, start);
+        assert_eq!(token.span().end, end);
+    }
+
+    #[test]
+    #[should_panic(expected = "span should be monotonically increasing")]
+    fn token_new_lexicographic_span_invalid_same_line() {
+        // Same line but end column before start column should panic
+        let start = SymbolLocation {
+            line: 1,
+            column: 10,
+        };
+        let end = SymbolLocation { line: 1, column: 5 };
+        let _ = Token::new(
+            TokenType::Identifier("invalid".to_string()),
+            (start.line, start.column),
+            (end.line, end.column),
+        );
+    }
+
+    #[test]
+    #[should_panic(expected = "span should be monotonically increasing")]
+    fn token_new_lexicographic_span_invalid_reverse_lines() {
+        // End line before start line should panic
+        let start = SymbolLocation { line: 2, column: 5 };
+        let end = SymbolLocation {
+            line: 1,
+            column: 10,
+        };
+        let _ = Token::new(
+            TokenType::Identifier("invalid".to_string()),
+            (start.line, start.column),
+            (end.line, end.column),
+        );
+    }
 }
