Commit a6d04c6

docs: handle some edge cases and add error messages (#101)

1 parent 6fee75a
File tree

5 files changed: +355 -58 lines changed

pipeline/cli.py

Lines changed: 83 additions & 50 deletions
@@ -10,21 +10,19 @@
 import sys
 from pathlib import Path
 
-from tqdm import tqdm
-
 from pipeline.commands.build import build_command
 from pipeline.commands.dev import dev_command
 from pipeline.tools.docusaurus_parser import convert_docusaurus_to_mintlify
 from pipeline.tools.links import drop_suffix_from_links, move_file_with_link_updates
 from pipeline.tools.notebook.convert import convert_notebook
-from pipeline.tools.parser import to_mint
+from pipeline.tools.parser import ParseError, to_mint
 
 
 def setup_logging() -> None:
     """Configure logging for the CLI application."""
     logging.basicConfig(
         level=logging.INFO,
-        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+        format="%(levelname)s - %(message)s",
         handlers=[logging.StreamHandler(sys.stdout)],
     )
 

@@ -68,51 +66,65 @@ def _find_files_to_migrate(
 
 def _process_single_file(
     file_path: Path, output_path: Path, *, dry_run: bool, migration_type: str = "mkdocs"
-) -> None:
+) -> bool:
     """Process a single file for migration.
 
     Args:
         file_path: Input file path
         output_path: Output file path
         dry_run: Whether to print to stdout instead of writing
         migration_type: Type of migration ("mkdocs" or "docusaurus")
-    """
-    extension = file_path.suffix.lower()
-    content = file_path.read_text()
 
-    if extension in {".md", ".markdown", ".mdx"}:
-        if migration_type == "docusaurus":
-            mint_markdown = convert_docusaurus_to_mintlify(content, file_path)
-        else:
-            mint_markdown = to_mint(content)
-    elif extension == ".ipynb":
-        markdown = convert_notebook(file_path)
-        if migration_type == "docusaurus":
-            mint_markdown = convert_docusaurus_to_mintlify(markdown, file_path)
+    Returns:
+        True if processing was successful, False if there was an error
+    """
+    try:
+        extension = file_path.suffix.lower()
+        content = file_path.read_text()
+
+        if extension in {".md", ".markdown", ".mdx"}:
+            if migration_type == "docusaurus":
+                mint_markdown = convert_docusaurus_to_mintlify(content, file_path)
+            else:
+                mint_markdown = to_mint(content, str(file_path))
+        elif extension == ".ipynb":
+            markdown = convert_notebook(file_path)
+            if migration_type == "docusaurus":
+                mint_markdown = convert_docusaurus_to_mintlify(markdown, file_path)
+            else:
+                mint_markdown = to_mint(markdown, str(file_path))
         else:
-            mint_markdown = to_mint(markdown)
-    else:
-        logger.warning(
-            "Skipping unsupported file extension %s: %s", extension, file_path
-        )
-        return
+            logger.warning(
+                "Skipping unsupported file extension %s: %s", extension, file_path
+            )
+            return True  # Not an error, just unsupported
 
-    _, mint_markdown = drop_suffix_from_links(mint_markdown)
+        _, mint_markdown = drop_suffix_from_links(mint_markdown)
 
-    if dry_run:
-        # Print the converted markdown to stdout
-        print(f"=== {file_path} ===")  # noqa: T201 (OK to use print)
-        print(mint_markdown)  # noqa: T201 (OK to use print)
-        print()  # noqa: T201 (OK to use print)
-    else:
-        # Ensure output directory exists
-        output_path.parent.mkdir(parents=True, exist_ok=True)
+        if dry_run:
+            # Print the converted markdown to stdout
+            print(f"=== {file_path} ===")  # noqa: T201 (OK to use print)
+            print(mint_markdown)  # noqa: T201 (OK to use print)
+            print()  # noqa: T201 (OK to use print)
+        else:
+            # Ensure output directory exists
+            output_path.parent.mkdir(parents=True, exist_ok=True)
 
-        # Write the converted content
-        with output_path.open("w", encoding="utf-8") as file:
-            file.write(mint_markdown)
+            # Write the converted content
+            with output_path.open("w", encoding="utf-8") as file:
+                file.write(mint_markdown)
 
-    logger.info("Converted %s -> %s", file_path, output_path)
+        logger.info("Converted %s -> %s", file_path, output_path)
+    except ParseError as e:
+        # We want to use logger.error rather than exception here. We do not need the
+        # full stack trace! ParseError should have a nice message.
+        logger.error("Parse error while processing file: %s", str(e))  # noqa: TRY400
+        return False
+    except Exception:
+        logger.exception("Unexpected error while processing file %s", file_path)
+        return False
+
+    return True
 
 
 def _determine_output_path(
@@ -167,6 +179,8 @@ def migrate_command(args) -> None: # noqa: ANN001
         logger.info("No %s files found in %s", file_types, input_path)
         return
 
+    logger.info("Found %d files to migrate", len(files_to_migrate))
+
     if input_path.is_dir() and args.output and not args.output.exists():
         # Create output directory if it doesn't exist
         args.output.mkdir(parents=True, exist_ok=True)
@@ -175,24 +189,43 @@ def migrate_command(args) -> None: # noqa: ANN001
     if len(files_to_migrate) > 1:
         logger.info("Processing %d files...", len(files_to_migrate))
 
-    with tqdm(
-        files_to_migrate, desc="Migrating files", disable=len(files_to_migrate) == 1
-    ) as pbar:
-        for file_path in pbar:
-            pbar.set_description(f"Processing {file_path.name}")
+    successful_files = 0
+    failed_files = 0
 
-            output_path = _determine_output_path(
-                input_path, file_path, args, migration_type
-            )
+    for file_path in files_to_migrate:
+        logger.info("Processing %s", file_path.name)
 
-            _process_single_file(
-                file_path,
-                output_path,
-                dry_run=args.dry_run,
-                migration_type=migration_type,
-            )
+        output_path = _determine_output_path(
+            input_path, file_path, args, migration_type
+        )
+
+        success = _process_single_file(
+            file_path,
+            output_path,
+            dry_run=args.dry_run,
+            migration_type=migration_type,
+        )
 
+        if success:
+            successful_files += 1
             _cleanup_original_file(file_path, args, dry_run=args.dry_run)
+        else:
+            failed_files += 1
+
+    # Report final results
+    if len(files_to_migrate) > 1:
+        logger.info(
+            "Migration completed: %d successful, %d failed out of %d total files",
+            successful_files,
+            failed_files,
+            len(files_to_migrate),
+        )
+        if failed_files > 0:
+            logger.warning(
+                "%d files failed to migrate. Check the error messages above for "
+                "details.",
+                failed_files,
+            )
 
 
 def main() -> None:
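
For a concrete sense of the change, a hypothetical run over three files where one fails to parse might log roughly the following (file names, line number, and token details are invented; the message templates and the "%(levelname)s - %(message)s" layout come from the diff above):

    INFO - Found 3 files to migrate
    INFO - Processing 3 files...
    INFO - Processing intro.md
    INFO - Converted docs/intro.md -> out/intro.md
    INFO - Processing broken.md
    ERROR - Parse error while processing file: 'docs/broken.md': Unclosed code block, at line 12, found token FENCE '```', expected closing fence '```', but found end of file
    INFO - Processing guide.md
    INFO - Converted docs/guide.md -> out/guide.md
    INFO - Migration completed: 2 successful, 1 failed out of 3 total files
    WARNING - 1 files failed to migrate. Check the error messages above for details.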

pipeline/preprocessors/link_map.py

Lines changed: 0 additions & 2 deletions
@@ -100,8 +100,6 @@
     "START": "reference/variables/langgraph.START.html",
     "CompiledStateGraph.stream": "reference/classes/langgraph.CompiledStateGraph.html#stream",
     "task": "reference/functions/langgraph.task.html",
-    ## TODO (hntrl): export Topic from langgraphjs
-    # "Topic": "reference/classes/langgraph_channels.Topic.html",
     "update_state": "reference/classes/langgraph.CompiledStateGraph.html#updateState",
 }

pipeline/tools/parser.py

Lines changed: 122 additions & 6 deletions
@@ -21,6 +21,54 @@
 from collections.abc import Iterator
 
 
+class ParseError(Exception):
+    """Exception raised when parsing fails with detailed context information."""
+
+    def __init__(  # noqa: PLR0913
+        self,
+        message: str,
+        *,
+        line: int | None = None,
+        token: Token | None = None,
+        expected: str | None = None,
+        found: str | None = None,
+        file_path: str | None = None,
+    ) -> None:
+        """Initialize ParseError with detailed context information."""
+        self.message = message
+        self.line = line
+        self.token = token
+        self.expected = expected
+        self.found = found
+        self.file_path = file_path
+
+        # Build detailed error message
+        error_parts = []
+
+        if file_path is not None:
+            error_parts.append(f"'{file_path}':")
+
+        error_parts.append(message)
+
+        if line is not None:
+            error_parts.append(f"at line {line}")
+
+        if token is not None:
+            error_parts.append(f"found token {token.type.name} '{token.value}'")
+
+        if expected:
+            error_parts.append(f"expected {expected}")
+
+        if found:
+            error_parts.append(f"but found {found}")
+
+        super().__init__(", ".join(error_parts))
+
+    def __str__(self) -> str:
+        """Return string representation of the exception."""
+        return super().__str__()
+
+
 @dataclass(kw_only=True)
 class Node:
     """Base-class for all AST nodes."""
@@ -165,7 +213,19 @@ def parse(self) -> Document:
     def _advance(self) -> Token:
         """Consume the current token and return it."""
         previous = self._token
-        self._token = next(self._tokens)
+        try:
+            self._token = next(self._tokens)
+        except StopIteration:
+            # This should not happen if the lexer is working correctly
+            # (it should always end with an EOF token), but handle it gracefully
+            msg = "Unexpected end of input"
+            raise ParseError(
+                msg,
+                line=previous.line,
+                token=previous,
+                expected="more tokens",
+                found="end of input",
+            ) from None
         return previous
 
     def _check(self, *kinds: TokenType) -> bool:
@@ -218,6 +278,40 @@ def _parse_blocks_until_indent(self, min_indent: int) -> list[Node]:
         while not self._check(TokenType.EOF) and (
             self._token.indent > min_indent or self._token.type == TokenType.BLANK
         ):
+            # Check for unexpected structural tokens that shouldn't appear in
+            # this context
+            if self._check(TokenType.CONDITIONAL_BLOCK_CLOSE, TokenType.FRONT_MATTER):
+                token_descriptions = {
+                    TokenType.CONDITIONAL_BLOCK_CLOSE: "conditional block close ':::'",
+                    TokenType.FRONT_MATTER: "front matter delimiter '---'",
+                }
+                found_desc = token_descriptions[self._token.type]
+
+                # Special message for conditional block close with indentation info
+                if self._token.type == TokenType.CONDITIONAL_BLOCK_CLOSE:
+                    msg = (
+                        "Conditional block close ':::' has mismatched indentation - "
+                        "check that opening and closing tags have the same "
+                        "indentation level"
+                    )
+                    raise ParseError(
+                        msg,
+                        line=self._token.line,
+                        token=self._token,
+                        expected=f"content with indent > {min_indent} or properly "
+                        f"indented closing tag",
+                        found=f"conditional block close ':::' at indent "
+                        f"{self._token.indent} (should match opening tag indent)",
+                    )
+                token_type_desc = self._token.type.name.lower().replace("_", " ")
+                msg = f"Unexpected {token_type_desc} token"
+                raise ParseError(
+                    msg,
+                    line=self._token.line,
+                    token=self._token,
+                    expected="content or block end",
+                    found=found_desc,
+                )
             if self._match(TokenType.BLANK):
                 continue  # skip blank lines at this level
             blocks.append(self._parse_block())
@@ -267,6 +361,15 @@ def _parse_code_block(self) -> CodeBlock:
         fence_indent = open_token.indent
         body_lines: list[str] = []
         while not self._check(TokenType.FENCE):
+            if self._check(TokenType.EOF):
+                msg = "Unclosed code block"
+                raise ParseError(
+                    msg,
+                    line=open_token.line,
+                    token=open_token,
+                    expected="closing fence '```'",
+                    found="end of file",
+                )
             tok = self._advance()
             # Preserve **relative** indentation of the code block
             rel_ident = max(0, tok.indent - fence_indent)
@@ -704,11 +807,24 @@ def _visit_conditionalblock(self, node: ConditionalBlock) -> None:
         self._add_line(":::")
 
 
-def to_mint(markdown: str) -> str:
+def to_mint(markdown: str, file_path: str | None = None) -> str:
     """Convenience function to print an AST node as Mintlify markdown."""
     if not markdown:
         return ""
-    parser = Parser(markdown)
-    doc = parser.parse()
-    printer = MintPrinter()
-    return printer.print(doc)
+    try:
+        parser = Parser(markdown)
+        doc = parser.parse()
+        printer = MintPrinter()
+        return printer.print(doc)
+    except ParseError as e:
+        # Re-raise with file path context if not already present
+        if e.file_path is None and file_path is not None:
+            raise ParseError(
+                e.message,
+                line=e.line,
+                token=e.token,
+                expected=e.expected,
+                found=e.found,
+                file_path=file_path,
+            ) from e
+        raise

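As a usage sketch (not part of the commit): with the new optional file_path argument, callers of to_mint get errors that name the offending file, while the structured fields stay available on the exception. The input below is a made-up document with an unclosed fence; the exact line and token details in the rendered message depend on the lexer.

    from pipeline.tools.parser import ParseError, to_mint

    broken = "```python\nprint('hi')\n"  # opening fence is never closed

    try:
        to_mint(broken, "docs/example.md")
    except ParseError as e:
        assert e.file_path == "docs/example.md"  # attached by to_mint on re-raise
        assert e.message == "Unclosed code block"
        print(e)  # e.g. "'docs/example.md': Unclosed code block, at line 1, ..."
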
tests/unit_tests/test_lexer.py

Lines changed: 33 additions & 0 deletions
@@ -241,3 +241,36 @@ def test_indented_conditional_block() -> None:
     assert tokens[0].indent == 4
     assert tokens[1].indent == 4
     assert tokens[2].indent == 4
+
+
+def test_conditional_block_with_whitespace_before_close() -> None:
+    """Test lexing conditional block with whitespace before closing tag."""
+    # This is the problematic case that causes infinite loop in parser
+    test_content = """\
+:::python
+some text here
+1. blah
+2. moove
+ :::"""
+
+    tokens = list(lex(test_content))
+
+    # Expected token sequence
+    expected_types = [
+        TokenType.CONDITIONAL_BLOCK_OPEN,  # :::python
+        TokenType.TEXT,  # some text here
+        TokenType.OL_MARKER,  # 1. blah
+        TokenType.OL_MARKER,  # 2. moove
+        TokenType.CONDITIONAL_BLOCK_CLOSE,  # :::
+        TokenType.EOF,
+    ]
+
+    actual_types = [token.type for token in tokens]
+
+    assert actual_types == expected_types
+
+    # Verify the closing tag has correct indent and value
+    close_token = tokens[4]  # The closing ::: token
+    assert close_token.type == TokenType.CONDITIONAL_BLOCK_CLOSE
+    assert close_token.value == ":::"
+    assert close_token.indent == 1  # Should have 1 space of indent