|
| 1 | +"""Generate llms.txt, llms-full.txt, and per-page markdown (https://llmstxt.org/). |
| 2 | +
|
| 3 | +The hook publishes three artifacts into the built site: |
| 4 | +
|
| 5 | +- `llms.txt`: a markdown index of the documentation, one link per page, |
| 6 | + grouped by nav section. |
| 7 | +- a `.md` rendition of every prose page next to its HTML (e.g. |
| 8 | + `server/index.md`), which is what the llms.txt links point at. |
| 9 | +- `llms-full.txt`: every prose page concatenated for single-fetch consumption. |
| 10 | +
|
| 11 | +Page markdown is the source markdown with `--8<--` snippet includes resolved |
| 12 | +and relative links rewritten to absolute URLs. The API reference page |
| 13 | +(`api.md`) is a mkdocstrings stub with no markdown source, so it is linked as |
| 14 | +rendered HTML from an Optional section instead of being embedded. |
| 15 | +
|
| 16 | +Incremental builds (`mkdocs build --dirty`) are rejected: they skip unmodified |
| 17 | +pages, which would silently truncate the generated artifacts. |
| 18 | +""" |
| 19 | + |
| 20 | +from __future__ import annotations |
| 21 | + |
| 22 | +import posixpath |
| 23 | +import re |
| 24 | +from dataclasses import dataclass, field |
| 25 | +from pathlib import Path |
| 26 | + |
| 27 | +from mkdocs.config.defaults import MkDocsConfig |
| 28 | +from mkdocs.exceptions import PluginError |
| 29 | +from mkdocs.structure.files import File, Files |
| 30 | +from mkdocs.structure.nav import Navigation, Section |
| 31 | +from mkdocs.structure.pages import Page |
| 32 | + |
# Pages with no markdown source, linked as HTML under "## Optional".
# Each entry is a (source URI, link title, link description) tuple.
_OPTIONAL_PAGES = [
    ("api.md", "API reference", "Auto-generated API reference for the mcp package (rendered HTML)"),
]

# A whole line made of optional indentation followed by a `--8<-- "path"`
# snippet include; the indentation and the quoted path are captured by name.
_SNIPPET_LINE = re.compile(r'^(?P<indent>[ \t]*)--8<-- "(?P<path>[^"\n]+)"$', flags=re.MULTILINE)
# An inline markdown link whose target ends in `.md`. The five groups are, in
# order: the opening `](`, the target, an optional `#anchor`, an optional
# ` "title"`, and the closing `)`.
_MD_LINK = re.compile(r'(\]\()([^)\s]+\.md)(#[^)\s]*)?( +"[^"]*")?(\))')
| 40 | + |
| 41 | + |
@dataclass
class _State:
    """Data shared between the hook functions over the course of one build."""

    # Rewritten markdown of each processed page, keyed by the page's source URI.
    page_markdown: dict[str, str] = field(default_factory=dict)
    # Source URIs of the pages that get a `.md` rendition.
    rendition_uris: set[str] = field(default_factory=set)
    # Navigation and file collection captured in `on_nav`; None until then.
    nav: Navigation | None = None
    files: Files | None = None


# Single module-level instance; `on_config` resets it at the start of each build.
_state = _State()
| 51 | + |
| 52 | + |
def _site_url(config: MkDocsConfig) -> str:
    """Return the configured `site_url`, normalised to end in exactly one slash.

    Raises:
        PluginError: if `site_url` is not set, since absolute links cannot be
            generated without it.
    """
    # An explicit check instead of `assert`: asserts are stripped under
    # `python -O`, and a PluginError matches how the rest of this hook
    # reports problems.
    if config.site_url is None:
        raise PluginError("llms_txt: `site_url` must be set in mkdocs.yml to generate absolute links")
    return config.site_url.rstrip("/") + "/"
| 56 | + |
| 57 | + |
def _md_uri(file: File) -> str:
    """Return the URI of the markdown rendition that sits next to `file`'s HTML.

    A trailing `.html` in the destination URI is swapped for `.md`; a URI
    without that suffix is returned unchanged.
    """
    html_suffix = re.compile(r"\.html$")
    return html_suffix.sub(".md", file.dest_uri)
| 60 | + |
| 61 | + |
def on_config(config: MkDocsConfig) -> None:
    """Reset the module-level build state before a build starts."""
    # The module stays imported across `mkdocs serve` rebuilds, so whatever the
    # previous build collected has to be dropped here.
    for collected in (_state.page_markdown, _state.rendition_uris):
        collected.clear()
    _state.nav = None
    _state.files = None
| 67 | + |
| 68 | + |
def on_nav(nav: Navigation, config: MkDocsConfig, files: Files) -> None:
    """Record the nav and file collection and pick the pages that get a `.md` rendition.

    Every page in the nav gets a rendition except the ones listed in
    `_OPTIONAL_PAGES`, which have no markdown source and are linked as HTML.
    """
    _state.nav = nav
    _state.files = files
    # Derive the exclusions from `_OPTIONAL_PAGES` rather than repeating a
    # literal file name, so adding an optional page cannot leave the two lists
    # out of sync.
    optional_uris = {src_uri for src_uri, _title, _description in _OPTIONAL_PAGES}
    _state.rendition_uris.update(
        page.file.src_uri for page in nav.pages if page.file.src_uri not in optional_uris
    )
| 73 | + |
| 74 | + |
def on_page_markdown(markdown: str, page: Page, config: MkDocsConfig, files: Files) -> str | None:
    """Store the standalone markdown rendition of `page` for `on_post_build`.

    Pages that are not in `_state.rendition_uris` are ignored. For the rest,
    every `--8<-- "path"` snippet line is replaced by the contents of the
    referenced file, and every relative `.md` link is rewritten to an absolute
    URL under `site_url`. The result is stored in `_state.page_markdown` under
    the page's source URI.

    Returns:
        Always None: this hook never hands replacement markdown back to MkDocs.

    Raises:
        PluginError: if a snippet path escapes the repo root or cannot be
            read, if a line mentioning `--8<--` was left unresolved, or if a
            relative `.md` link target is not a known file.
    """
    if page.file.src_uri not in _state.rendition_uris:
        return None

    # Same anchor as the pymdownx.snippets `base_path` in mkdocs.yml.
    repo_root = Path(config.config_file_path).parent
    # Resolved once per page instead of once per snippet line.
    resolved_root = repo_root.resolve()

    def include(match: re.Match[str]) -> str:
        """Return the snippet file's contents, indented like the include line."""
        indent, path = match["indent"], match["path"]
        # Mirror the snippets extension's restrict_base_path: reject paths
        # that resolve outside the repo root.
        resolved_path = (repo_root / path).resolve()
        if not resolved_path.is_relative_to(resolved_root):
            raise PluginError(f"llms_txt: snippet path {path!r} in {page.file.src_uri} escapes the repo root")
        try:
            content = resolved_path.read_text(encoding="utf-8").rstrip("\n")
        except OSError as exc:
            raise PluginError(f"llms_txt: cannot read snippet {path!r} in {page.file.src_uri}") from exc
        # Keep a pointer to the embedded file so readers can find it on disk.
        if path.endswith(".py"):
            content = f"# {path}\n{content}"
        if indent:
            # Blank lines stay empty so no trailing whitespace is introduced.
            content = "\n".join(indent + line if line else line for line in content.split("\n"))
        return content

    resolved, substitutions = _SNIPPET_LINE.subn(include, markdown)
    # Any line that mentions the snippet marker but did not match the strict
    # pattern (e.g. an unsupported include form) would leak into the output
    # verbatim, so treat a count mismatch as an error.
    if substitutions != sum("--8<--" in line for line in markdown.splitlines()):
        raise PluginError(f"llms_txt: unresolved snippet include in {page.file.src_uri}")

    site_url = _site_url(config)
    src_dir = posixpath.dirname(page.file.src_uri)

    def rewrite(match: re.Match[str]) -> str:
        """Return the link with its relative `.md` target made absolute."""
        opening, target, anchor, title, closing = match.groups()
        # Targets that already carry a scheme are left untouched.
        if "://" in target:
            return match.group(0)
        linked = files.get_file_from_path(posixpath.normpath(posixpath.join(src_dir, target)))
        if linked is None:
            raise PluginError(f"llms_txt: cannot resolve link target {target!r} in {page.file.src_uri}")
        # Pages without a markdown rendition (the api.md stub) link to their HTML instead.
        url = _md_uri(linked) if linked.src_uri in _state.rendition_uris else linked.url
        return f"{opening}{site_url}{url}{anchor or ''}{title or ''}{closing}"

    _state.page_markdown[page.file.src_uri] = _MD_LINK.sub(rewrite, resolved)
    return None
| 120 | + |
| 121 | + |
def _section_pages(section: Section) -> list[Page]:
    """Collect the pages under `section` that have a markdown rendition.

    Nested sections are flattened depth-first, so the result follows nav order.
    """
    collected: list[Page] = []
    for entry in section.children:
        if isinstance(entry, Section):
            collected += _section_pages(entry)
            continue
        if isinstance(entry, Page) and entry.file.src_uri in _state.rendition_uris:
            collected.append(entry)
    return collected
| 130 | + |
| 131 | + |
def on_post_build(config: MkDocsConfig) -> None:
    """Write the per-page `.md` renditions, `llms.txt`, and `llms-full.txt` into the site directory.

    Raises:
        PluginError: if the nav was never captured, if a page with a rendition
            was not processed during this build, if a page's markdown does not
            start with an H1, or if an optional page is missing from the files.
    """
    nav, files = _state.nav, _state.files
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if nav is None or files is None:
        raise PluginError("llms_txt: navigation was not captured before post-build (did on_nav run?)")
    missing = _state.rendition_uris - _state.page_markdown.keys()
    if missing:
        raise PluginError(f"llms_txt: pages skipped this build (is this a --dirty build?): {sorted(missing)}")

    site_dir = Path(config.site_dir)
    site_url = _site_url(config)

    # Pages listed directly at the top of the nav are grouped under "Docs";
    # every nav section then contributes its own group, flattened.
    top_level = [item for item in nav.items if isinstance(item, Page) and item.file.src_uri in _state.rendition_uris]
    sections: list[tuple[str, list[Page]]] = [("Docs", top_level)] if top_level else []
    for item in nav.items:
        if isinstance(item, Section):
            pages = _section_pages(item)
            if pages:
                sections.append((item.title, pages))

    index = [f"# {config.site_name}", ""]
    # Emit the summary blockquote only when a description is configured;
    # otherwise the index would literally read "> None".
    if config.site_description:
        index += [f"> {config.site_description}", ""]

    full: list[str] = []
    for title, pages in sections:
        index += [f"## {title}", ""]
        for page in pages:
            markdown = _state.page_markdown[page.file.src_uri]
            # The rendition is written next to the page's HTML output.
            (site_dir / _md_uri(page.file)).write_text(markdown, encoding="utf-8")

            description = page.meta.get("description")
            tail = f": {description}" if description else ""
            index.append(f"- [{page.title}]({site_url}{_md_uri(page.file)}){tail}")

            # Drop the page's own H1 so the full file can use a uniform
            # "# <title>" + "Source:" header for every page.
            body, h1_found = re.subn(r"\A\s*# .+\n", "", markdown)
            if not h1_found:
                raise PluginError(f"llms_txt: page {page.file.src_uri} does not start with an H1")
            full += [f"# {page.title}", "", f"Source: {page.canonical_url}", "", body.strip(), ""]
        index.append("")

    # Skip the heading entirely when there is nothing to list under it.
    if _OPTIONAL_PAGES:
        index += ["## Optional", ""]
        for src_uri, title, description in _OPTIONAL_PAGES:
            linked = files.get_file_from_path(src_uri)
            if linked is None:
                raise PluginError(f"llms_txt: optional page {src_uri} not found")
            index.append(f"- [{title}]({site_url}{linked.url}): {description}")
        index.append("")

    (site_dir / "llms.txt").write_text("\n".join(index), encoding="utf-8")
    (site_dir / "llms-full.txt").write_text("\n".join(full), encoding="utf-8")
0 commit comments