Commit a6d04c6

docs: handle some edge cases and add error messages (#101)

1 parent 6fee75a
File tree

5 files changed: +355 -58 lines changed

pipeline/cli.py

Lines changed: 83 additions & 50 deletions
@@ -10,21 +10,19 @@
 import sys
 from pathlib import Path
 
-from tqdm import tqdm
-
 from pipeline.commands.build import build_command
 from pipeline.commands.dev import dev_command
 from pipeline.tools.docusaurus_parser import convert_docusaurus_to_mintlify
 from pipeline.tools.links import drop_suffix_from_links, move_file_with_link_updates
 from pipeline.tools.notebook.convert import convert_notebook
-from pipeline.tools.parser import to_mint
+from pipeline.tools.parser import ParseError, to_mint
 
 
 def setup_logging() -> None:
     """Configure logging for the CLI application."""
     logging.basicConfig(
         level=logging.INFO,
-        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+        format="%(levelname)s - %(message)s",
         handlers=[logging.StreamHandler(sys.stdout)],
     )
 

@@ -68,51 +66,65 @@ def _find_files_to_migrate(
 
 def _process_single_file(
     file_path: Path, output_path: Path, *, dry_run: bool, migration_type: str = "mkdocs"
-) -> None:
+) -> bool:
     """Process a single file for migration.
 
     Args:
         file_path: Input file path
         output_path: Output file path
         dry_run: Whether to print to stdout instead of writing
         migration_type: Type of migration ("mkdocs" or "docusaurus")
-    """
-    extension = file_path.suffix.lower()
-    content = file_path.read_text()
 
-    if extension in {".md", ".markdown", ".mdx"}:
-        if migration_type == "docusaurus":
-            mint_markdown = convert_docusaurus_to_mintlify(content, file_path)
-        else:
-            mint_markdown = to_mint(content)
-    elif extension == ".ipynb":
-        markdown = convert_notebook(file_path)
-        if migration_type == "docusaurus":
-            mint_markdown = convert_docusaurus_to_mintlify(markdown, file_path)
+    Returns:
+        True if processing was successful, False if there was an error
+    """
+    try:
+        extension = file_path.suffix.lower()
+        content = file_path.read_text()
+
+        if extension in {".md", ".markdown", ".mdx"}:
+            if migration_type == "docusaurus":
+                mint_markdown = convert_docusaurus_to_mintlify(content, file_path)
+            else:
+                mint_markdown = to_mint(content, str(file_path))
+        elif extension == ".ipynb":
+            markdown = convert_notebook(file_path)
+            if migration_type == "docusaurus":
+                mint_markdown = convert_docusaurus_to_mintlify(markdown, file_path)
+            else:
+                mint_markdown = to_mint(markdown, str(file_path))
         else:
-            mint_markdown = to_mint(markdown)
-    else:
-        logger.warning(
-            "Skipping unsupported file extension %s: %s", extension, file_path
-        )
-        return
+            logger.warning(
+                "Skipping unsupported file extension %s: %s", extension, file_path
+            )
+            return True  # Not an error, just unsupported
 
-    _, mint_markdown = drop_suffix_from_links(mint_markdown)
+        _, mint_markdown = drop_suffix_from_links(mint_markdown)
 
-    if dry_run:
-        # Print the converted markdown to stdout
-        print(f"=== {file_path} ===")  # noqa: T201 (OK to use print)
-        print(mint_markdown)  # noqa: T201 (OK to use print)
-        print()  # noqa: T201 (OK to use print)
-    else:
-        # Ensure output directory exists
-        output_path.parent.mkdir(parents=True, exist_ok=True)
+        if dry_run:
+            # Print the converted markdown to stdout
+            print(f"=== {file_path} ===")  # noqa: T201 (OK to use print)
+            print(mint_markdown)  # noqa: T201 (OK to use print)
+            print()  # noqa: T201 (OK to use print)
+        else:
+            # Ensure output directory exists
+            output_path.parent.mkdir(parents=True, exist_ok=True)
 
-        # Write the converted content
-        with output_path.open("w", encoding="utf-8") as file:
-            file.write(mint_markdown)
+            # Write the converted content
+            with output_path.open("w", encoding="utf-8") as file:
+                file.write(mint_markdown)
 
-    logger.info("Converted %s -> %s", file_path, output_path)
+        logger.info("Converted %s -> %s", file_path, output_path)
+    except ParseError as e:
+        # We want to use logger.error rather than exception here. We do not need the
+        # full stack trace! ParseError should have a nice message.
+        logger.error("Parse error while processing file: %s", str(e))  # noqa: TRY400
+        return False
+    except Exception:
+        logger.exception("Unexpected error while processing file %s", file_path)
+        return False
+
+    return True
 
 
 def _determine_output_path(
@@ -167,6 +179,8 @@ def migrate_command(args) -> None: # noqa: ANN001
         logger.info("No %s files found in %s", file_types, input_path)
         return
 
+    logger.info("Found %d files to migrate", len(files_to_migrate))
+
     if input_path.is_dir() and args.output and not args.output.exists():
         # Create output directory if it doesn't exist
         args.output.mkdir(parents=True, exist_ok=True)
@@ -175,24 +189,43 @@ def migrate_command(args) -> None: # noqa: ANN001
     if len(files_to_migrate) > 1:
         logger.info("Processing %d files...", len(files_to_migrate))
 
-    with tqdm(
-        files_to_migrate, desc="Migrating files", disable=len(files_to_migrate) == 1
-    ) as pbar:
-        for file_path in pbar:
-            pbar.set_description(f"Processing {file_path.name}")
+    successful_files = 0
+    failed_files = 0
 
-            output_path = _determine_output_path(
-                input_path, file_path, args, migration_type
-            )
+    for file_path in files_to_migrate:
+        logger.info("Processing %s", file_path.name)
 
-            _process_single_file(
-                file_path,
-                output_path,
-                dry_run=args.dry_run,
-                migration_type=migration_type,
-            )
+        output_path = _determine_output_path(
+            input_path, file_path, args, migration_type
+        )
+
+        success = _process_single_file(
+            file_path,
+            output_path,
+            dry_run=args.dry_run,
+            migration_type=migration_type,
+        )
 
+        if success:
+            successful_files += 1
             _cleanup_original_file(file_path, args, dry_run=args.dry_run)
+        else:
+            failed_files += 1
+
+    # Report final results
+    if len(files_to_migrate) > 1:
+        logger.info(
+            "Migration completed: %d successful, %d failed out of %d total files",
+            successful_files,
+            failed_files,
+            len(files_to_migrate),
+        )
+        if failed_files > 0:
+            logger.warning(
+                "%d files failed to migrate. Check the error messages above for "
+                "details.",
+                failed_files,
+            )
 
 
 def main() -> None:
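
For a concrete sense of the change, a hypothetical run over three files where one fails to parse might log roughly the following (file names, line number, and token details are invented; the message templates and the "%(levelname)s - %(message)s" layout come from the diff above):

    INFO - Found 3 files to migrate
    INFO - Processing 3 files...
    INFO - Processing intro.md
    INFO - Converted docs/intro.md -> out/intro.md
    INFO - Processing broken.md
    ERROR - Parse error while processing file: 'docs/broken.md': Unclosed code block, at line 12, found token FENCE '```', expected closing fence '```', but found end of file
    INFO - Processing guide.md
    INFO - Converted docs/guide.md -> out/guide.md
    INFO - Migration completed: 2 successful, 1 failed out of 3 total files
    WARNING - 1 files failed to migrate. Check the error messages above for details.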

pipeline/preprocessors/link_map.py

Lines changed: 0 additions & 2 deletions
@@ -100,8 +100,6 @@
     "START": "reference/variables/langgraph.START.html",
     "CompiledStateGraph.stream": "reference/classes/langgraph.CompiledStateGraph.html#stream",
     "task": "reference/functions/langgraph.task.html",
-    ## TODO (hntrl): export Topic from langgraphjs
-    # "Topic": "reference/classes/langgraph_channels.Topic.html",
     "update_state": "reference/classes/langgraph.CompiledStateGraph.html#updateState",
 }

pipeline/tools/parser.py

Lines changed: 122 additions & 6 deletions
@@ -21,6 +21,54 @@
 from collections.abc import Iterator
 
 
+class ParseError(Exception):
+    """Exception raised when parsing fails with detailed context information."""
+
+    def __init__(  # noqa: PLR0913
+        self,
+        message: str,
+        *,
+        line: int | None = None,
+        token: Token | None = None,
+        expected: str | None = None,
+        found: str | None = None,
+        file_path: str | None = None,
+    ) -> None:
+        """Initialize ParseError with detailed context information."""
+        self.message = message
+        self.line = line
+        self.token = token
+        self.expected = expected
+        self.found = found
+        self.file_path = file_path
+
+        # Build detailed error message
+        error_parts = []
+
+        if file_path is not None:
+            error_parts.append(f"'{file_path}':")
+
+        error_parts.append(message)
+
+        if line is not None:
+            error_parts.append(f"at line {line}")
+
+        if token is not None:
+            error_parts.append(f"found token {token.type.name} '{token.value}'")
+
+        if expected:
+            error_parts.append(f"expected {expected}")
+
+        if found:
+            error_parts.append(f"but found {found}")
+
+        super().__init__(", ".join(error_parts))
+
+    def __str__(self) -> str:
+        """Return string representation of the exception."""
+        return super().__str__()
+
+
 @dataclass(kw_only=True)
 class Node:
     """Base-class for all AST nodes."""
@@ -165,7 +213,19 @@ def parse(self) -> Document:
     def _advance(self) -> Token:
         """Consume the current token and return it."""
         previous = self._token
-        self._token = next(self._tokens)
+        try:
+            self._token = next(self._tokens)
+        except StopIteration:
+            # This should not happen if the lexer is working correctly
+            # (it should always end with an EOF token), but handle it gracefully
+            msg = "Unexpected end of input"
+            raise ParseError(
+                msg,
+                line=previous.line,
+                token=previous,
+                expected="more tokens",
+                found="end of input",
+            ) from None
         return previous
 
     def _check(self, *kinds: TokenType) -> bool:
@@ -218,6 +278,40 @@ def _parse_blocks_until_indent(self, min_indent: int) -> list[Node]:
         while not self._check(TokenType.EOF) and (
             self._token.indent > min_indent or self._token.type == TokenType.BLANK
         ):
+            # Check for unexpected structural tokens that shouldn't appear in
+            # this context
+            if self._check(TokenType.CONDITIONAL_BLOCK_CLOSE, TokenType.FRONT_MATTER):
+                token_descriptions = {
+                    TokenType.CONDITIONAL_BLOCK_CLOSE: "conditional block close ':::'",
+                    TokenType.FRONT_MATTER: "front matter delimiter '---'",
+                }
+                found_desc = token_descriptions[self._token.type]
+
+                # Special message for conditional block close with indentation info
+                if self._token.type == TokenType.CONDITIONAL_BLOCK_CLOSE:
+                    msg = (
+                        "Conditional block close ':::' has mismatched indentation - "
+                        "check that opening and closing tags have the same "
+                        "indentation level"
+                    )
+                    raise ParseError(
+                        msg,
+                        line=self._token.line,
+                        token=self._token,
+                        expected=f"content with indent > {min_indent} or properly "
+                        f"indented closing tag",
+                        found=f"conditional block close ':::' at indent "
+                        f"{self._token.indent} (should match opening tag indent)",
+                    )
+                token_type_desc = self._token.type.name.lower().replace("_", " ")
+                msg = f"Unexpected {token_type_desc} token"
+                raise ParseError(
+                    msg,
+                    line=self._token.line,
+                    token=self._token,
+                    expected="content or block end",
+                    found=found_desc,
+                )
             if self._match(TokenType.BLANK):
                 continue  # skip blank lines at this level
             blocks.append(self._parse_block())
@@ -267,6 +361,15 @@ def _parse_code_block(self) -> CodeBlock:
         fence_indent = open_token.indent
         body_lines: list[str] = []
         while not self._check(TokenType.FENCE):
+            if self._check(TokenType.EOF):
+                msg = "Unclosed code block"
+                raise ParseError(
+                    msg,
+                    line=open_token.line,
+                    token=open_token,
+                    expected="closing fence '```'",
+                    found="end of file",
+                )
             tok = self._advance()
             # Preserve **relative** indentation of the code block
             rel_ident = max(0, tok.indent - fence_indent)
@@ -704,11 +807,24 @@ def _visit_conditionalblock(self, node: ConditionalBlock) -> None:
         self._add_line(":::")
 
 
-def to_mint(markdown: str) -> str:
+def to_mint(markdown: str, file_path: str | None = None) -> str:
     """Convenience function to print an AST node as Mintlify markdown."""
     if not markdown:
         return ""
-    parser = Parser(markdown)
-    doc = parser.parse()
-    printer = MintPrinter()
-    return printer.print(doc)
+    try:
+        parser = Parser(markdown)
+        doc = parser.parse()
+        printer = MintPrinter()
+        return printer.print(doc)
+    except ParseError as e:
+        # Re-raise with file path context if not already present
+        if e.file_path is None and file_path is not None:
+            raise ParseError(
+                e.message,
+                line=e.line,
+                token=e.token,
+                expected=e.expected,
+                found=e.found,
+                file_path=file_path,
+            ) from e
+        raise

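As a usage sketch (not part of the commit): with the new optional file_path argument, callers of to_mint get errors that name the offending file, while the structured fields stay available on the exception. The input below is a made-up document with an unclosed fence; the exact line and token details in the rendered message depend on the lexer.

    from pipeline.tools.parser import ParseError, to_mint

    broken = "```python\nprint('hi')\n"  # opening fence is never closed

    try:
        to_mint(broken, "docs/example.md")
    except ParseError as e:
        assert e.file_path == "docs/example.md"  # attached by to_mint on re-raise
        assert e.message == "Unclosed code block"
        print(e)  # e.g. "'docs/example.md': Unclosed code block, at line 1, ..."
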
tests/unit_tests/test_lexer.py

Lines changed: 33 additions & 0 deletions
@@ -241,3 +241,36 @@ def test_indented_conditional_block() -> None:
     assert tokens[0].indent == 4
     assert tokens[1].indent == 4
     assert tokens[2].indent == 4
+
+
+def test_conditional_block_with_whitespace_before_close() -> None:
+    """Test lexing conditional block with whitespace before closing tag."""
+    # This is the problematic case that causes infinite loop in parser
+    test_content = """\
+:::python
+some text here
+1. blah
+2. moove
+ :::"""
+
+    tokens = list(lex(test_content))
+
+    # Expected token sequence
+    expected_types = [
+        TokenType.CONDITIONAL_BLOCK_OPEN,  # :::python
+        TokenType.TEXT,  # some text here
+        TokenType.OL_MARKER,  # 1. blah
+        TokenType.OL_MARKER,  # 2. moove
+        TokenType.CONDITIONAL_BLOCK_CLOSE,  # :::
+        TokenType.EOF,
+    ]
+
+    actual_types = [token.type for token in tokens]
+
+    assert actual_types == expected_types
+
+    # Verify the closing tag has correct indent and value
+    close_token = tokens[4]  # The closing ::: token
+    assert close_token.type == TokenType.CONDITIONAL_BLOCK_CLOSE
+    assert close_token.value == ":::"
+    assert close_token.indent == 1  # Should have 1 space of indent