|
| 1 | +# Author: Felix Fontein <[email protected]> |
| 2 | +# GNU General Public License v3.0+ (see LICENSES/GPL-3.0-or-later.txt or |
| 3 | +# https://www.gnu.org/licenses/gpl-3.0.txt) |
| 4 | +# SPDX-License-Identifier: GPL-3.0-or-later |
| 5 | +# SPDX-FileCopyrightText: 2024, Ansible Project |
| 6 | + |
| 7 | +""" |
| 8 | +Find code blocks in RST files. |
| 9 | +""" |
| 10 | + |
| 11 | +from __future__ import annotations |
| 12 | + |
| 13 | +import io |
| 14 | +import os |
| 15 | +import typing as t |
| 16 | +from collections.abc import Mapping |
| 17 | +from dataclasses import dataclass |
| 18 | + |
| 19 | +from docutils import nodes |
| 20 | +from docutils.core import Publisher |
| 21 | +from docutils.io import StringInput |
| 22 | +from docutils.parsers.rst import Directive |
| 23 | +from docutils.parsers.rst.directives import register_directive |
| 24 | +from docutils.parsers.rst.directives import unchanged as directive_param_unchanged |
| 25 | +from docutils.utils import Reporter, SystemMessage |
| 26 | + |
# Node attributes used internally for bookkeeping on marked code blocks.
# They are set by mark_antsibull_code_block() and are excluded from the
# attributes reported in CodeBlockInfo.attributes.
_SPECIAL_ATTRIBUTES = (
    "antsibull-code-language",
    "antsibull-code-block",
    "antsibull-code-lineno",
)
| 32 | + |
| 33 | + |
class IgnoreDirective(Directive):
    """
    Directive that accepts content and discards it.

    ``run()`` produces no nodes, so nothing of the directive ends up in the
    document tree.
    """

    has_content = True

    def run(self) -> list:
        # An empty node list means the directive contributes nothing.
        produced: list = []
        return produced
| 43 | + |
| 44 | + |
def mark_antsibull_code_block(
    node: nodes.literal_block,
    *,
    language: str | None,
    line: int,
    other: dict[str, t.Any] | None = None,
) -> None:
    """
    Tag a literal block as an Antsibull code block.

    The language and line number are stored in the node's attributes together
    with a marker flag. Every entry ``key: value`` of ``other`` is additionally
    stored under the attribute name ``antsibull-other-{key}``.
    """
    bookkeeping = (
        ("antsibull-code-language", language),
        ("antsibull-code-block", True),
        ("antsibull-code-lineno", line),
    )
    for attribute, value in bookkeeping:
        node[attribute] = value

    # ``other`` may be None or empty; in both cases nothing more is stored.
    for key, value in (other or {}).items():
        node[f"antsibull-other-{key}"] = value
| 64 | + |
| 65 | + |
class CodeBlockDirective(Directive):
    """
    Stand-in for Sphinx's code block directives.

    Produces a literal block from the directive's content, like Sphinx's code
    block directives, and additionally passes it to
    ``mark_antsibull_code_block()`` with the language argument (if given) and
    the directive's line number.
    """

    has_content = True
    optional_arguments = 1

    # All options Sphinx allows for code blocks. They must be declared here so
    # that docutils successfully parses documents that use them; their values
    # are accepted unchanged.
    option_spec = dict.fromkeys(
        (
            "caption",
            "class",
            "dedent",
            "emphasize-lines",
            "name",
            "force",
            "linenos",
            "lineno-start",
        ),
        directive_param_unchanged,
    )

    def run(self) -> list[nodes.literal_block]:
        # The language is the optional first argument of the directive.
        language = self.arguments[0] if self.arguments else None
        source = "\n".join(self.content)

        block = nodes.literal_block(source, source)
        block["classes"].append("code-block")
        mark_antsibull_code_block(block, language=language, line=self.lineno)
        return [block]
| 100 | + |
| 101 | + |
class CodeBlockVisitor(nodes.SparseNodeVisitor):
    """
    Visitor that calls callbacks for all code blocks.

    For every literal block that carries the ``antsibull-code-block`` marker,
    ``callback`` is called with ``(language, row_offset, col_offset,
    position_exact, found_in_code, content, node)``. Literal blocks without
    the marker are skipped; if they have classes set, ``warn_unknown_block``
    is called with ``(line, col, node)`` first.
    """

    def __init__(
        self,
        document: nodes.document,
        content: str,
        callback: t.Callable[
            [str | None, int, int, bool, bool, str, nodes.literal_block], None
        ],
        warn_unknown_block: t.Callable[[int | str, int, nodes.literal_block], None],
    ):
        super().__init__(document)
        # The raw document, split into lines, is used to locate code blocks.
        self.__content_lines = content.splitlines()
        self.__callback = callback
        self.__warn_unknown_block = warn_unknown_block

    def visit_system_message(self, node: nodes.system_message) -> None:
        """
        Ignore system messages.
        """
        raise nodes.SkipNode

    def visit_error(self, node: nodes.error) -> None:
        """
        Ignore errors.
        """
        raise nodes.SkipNode

    @staticmethod
    def _find_indent(content: str) -> int | None:
        """
        Given concatenated lines, find the minimum indent if possible.

        Lines that are empty or consist only of whitespace are not considered.
        If there is no other line, ``None`` is returned.
        """
        min_indent = None
        for line in content.split("\n"):
            stripped_line = line.lstrip()
            if stripped_line:
                indent = len(line) - len(stripped_line)
                if min_indent is None or min_indent > indent:
                    min_indent = indent
        return min_indent

    def _find_offset(self, lineno: int, content: str) -> tuple[int, int, bool]:
        """
        Try to identify the row/col offset of the code in ``content`` in the document.

        ``lineno`` is assumed to be the line where the code-block starts.
        This function looks for an empty line, followed by the right pattern of
        empty and non-empty lines.

        Returns a tuple ``(row_offset, col_offset, complete)`` where ``complete``
        is ``True`` if as many document lines as ``content`` has lines were
        consumed.
        """
        row_offset = lineno
        found_empty_line = False
        found_content_lines = False
        # Number of lines of ``content`` that still have to be accounted for.
        content_lines = content.count("\n") + 1
        min_indent = None
        for offset, line in enumerate(self.__content_lines[lineno:]):
            stripped_line = line.strip()
            if not stripped_line:
                if not found_empty_line:
                    # Tentatively assume the content starts right after the
                    # first empty line.
                    row_offset = lineno + offset + 1
                    found_empty_line = True
            elif not found_content_lines:
                # NOTE(review): any non-empty line starts the content here,
                # even before the first empty line. Directive option lines
                # would then seem to be counted as content - confirm that
                # this is intended.
                found_content_lines = True
                row_offset = lineno + offset

            if found_content_lines and content_lines > 0:
                if stripped_line:
                    indent = len(line) - len(line.lstrip())
                    if min_indent is None or min_indent > indent:
                        min_indent = indent
                content_lines -= 1
            elif not content_lines:
                break

        # The column offset is the difference between the indentation found in
        # the document and the indentation the content itself already has.
        min_source_indent = self._find_indent(content)
        col_offset = max(0, (min_indent or 0) - (min_source_indent or 0))
        return row_offset, col_offset, content_lines == 0

    def _find_in_code(self, row_offset: int, col_offset: int, content: str) -> bool:
        """
        Check whether the code can be found at the given row/col offset in a way
        that makes it easy to replace.

        That is, every line of ``content`` appears in the document at index
        ``row_offset`` and following, starting at column ``col_offset``, and is
        surrounded only by whitespace.
        """
        for index, line in enumerate(content.split("\n")):
            if row_offset + index >= len(self.__content_lines):
                return False
            found_line = self.__content_lines[row_offset + index]
            # Only whitespace is allowed before the code ...
            if found_line[:col_offset].strip():
                return False
            eol = found_line[col_offset:]
            if eol[: len(line)] != line:
                return False
            # ... and after it.
            if eol[len(line) :].strip():
                return False
        return True

    def visit_literal_block(self, node: nodes.literal_block) -> None:
        """
        Visit a code block.

        Marked blocks are reported through the callback, unmarked blocks with
        classes through the warning callback. The node's children are never
        visited.
        """
        if "antsibull-code-block" not in node.attributes:
            if node.attributes["classes"]:
                # This could be a `::` block, or something else (unknown)
                self.__warn_unknown_block(node.line or "unknown", 0, node)
            raise nodes.SkipNode

        language = node.attributes["antsibull-code-language"]
        lineno = node.attributes["antsibull-code-lineno"]
        row_offset, col_offset, position_exact = self._find_offset(
            lineno, node.rawsource
        )
        # If we think we have the exact position, try to identify the code.
        # ``found_in_code`` indicates that it is easy to replace the code,
        # and at the same time it's easy to identify it. The position is only
        # reported as exact if the code was actually found there.
        found_in_code = position_exact and self._find_in_code(
            row_offset, col_offset, node.rawsource
        )
        position_exact = found_in_code

        # pylint: disable-next=fixme
        # TODO: if the code was not found 'the easy way' (this could be because
        # it is inside a table), search for the content, f.ex. in tables

        self.__callback(
            language,
            row_offset,
            col_offset,
            position_exact,
            found_in_code,
            node.rawsource.rstrip() + "\n",
            node,
        )
        raise nodes.SkipNode
| 245 | + |
| 246 | + |
# Directives that are registered with docutils before a document is parsed.
# find_code_blocks() uses a copy of this mapping, extended by the caller's
# extra directives.
_DIRECTIVES: dict[str, t.Type[Directive]] = {
    # Replace Sphinx code blocks with our code block directive:
    "code": CodeBlockDirective,
    "code-block": CodeBlockDirective,
    "sourcecode": CodeBlockDirective,
    # The following docutils directives should better be ignored:
    "parsed-literal": IgnoreDirective,
}
| 255 | + |
| 256 | + |
def _parse_document(
    content: str,
    *,
    path: str | os.PathLike[str] | None,
    root_prefix: str | os.PathLike[str] | None,
    directives: dict[str, t.Type[Directive]],
) -> nodes.document:
    """
    Parse the RST document ``content`` into a docutils document.

    All ``directives`` are registered with docutils before parsing. Any error
    raised while reading the document is re-raised as ``ValueError``.
    """
    # pylint: disable-next=fixme
    # TODO: figure out how to register a directive only temporarily
    for name in directives:
        register_directive(name, directives[name])

    # We create a Publisher only to have a mechanism which gives us the settings object.
    # Doing this more explicit is a bad idea since the classes used are deprecated and will
    # eventually get replaced. Publisher.get_settings() looks like a stable enough API that
    # we can 'just use'.
    rst_publisher = Publisher(source_class=StringInput)
    rst_publisher.set_components("standalone", "restructuredtext", "pseudoxml")
    settings_overrides = {
        "root_prefix": root_prefix,
        "input_encoding": "utf-8",
        "file_insertion_enabled": False,
        "raw_enabled": False,
        "_disable_config": True,
        "report_level": Reporter.ERROR_LEVEL,
        # Warnings are collected in a throw-away buffer.
        "warning_stream": io.StringIO(),
    }
    rst_publisher.process_programmatic_settings(None, settings_overrides, None)
    rst_publisher.set_source(content, path)

    # Parse the document
    try:
        document = rst_publisher.reader.read(
            rst_publisher.source, rst_publisher.parser, rst_publisher.settings
        )
    except SystemMessage as exc:
        raise ValueError(f"Cannot parse document: {exc}") from exc
    except Exception as exc:
        raise ValueError(f"Unexpected error while parsing document: {exc}") from exc
    return document
| 296 | + |
| 297 | + |
@dataclass
class CodeBlockInfo:
    """
    Information on a code block.

    Instances are produced by ``find_code_blocks()``, one per code block found.
    """

    # The code block's language (if known)
    language: str | None

    # The code block's line and column offset
    row_offset: int
    col_offset: int

    # Whether the position (row/col_offset) is exact.
    # If set to ``False``, the position is approximate and col_offset is often 0.
    position_exact: bool

    # Whether the code block's contents can be found as-is in the RST file,
    # only indented by whitespace, and with potentially trailing whitespace
    directly_replacable_in_content: bool

    # The code block's contents
    content: str

    # The code block's attributes that start with ``antsibull-``.
    # Special attributes used by ``find_code_blocks()`` to keep track of
    # certain properties are not present.
    attributes: dict[str, t.Any]
| 326 | + |
| 327 | + |
def find_code_blocks(
    content: str,
    *,
    path: str | os.PathLike[str] | None = None,
    root_prefix: str | os.PathLike[str] | None = None,
    extra_directives: Mapping[str, t.Type[Directive]] | None = None,
    warn_unknown_block: t.Callable[[int | str, int, str], None] | None = None,
) -> t.Generator[CodeBlockInfo, None, None]:
    """
    Given a RST document, finds all code blocks.

    ``extra_directives`` are registered in addition to the default directives
    and take precedence over them. If ``warn_unknown_block`` is provided, it is
    called with ``(line, col, raw_source)`` for literal blocks that are not
    marked as code blocks but have classes set.

    Yields one ``CodeBlockInfo`` per code block found. Raises ``ValueError``
    if the document cannot be parsed or processed. If processing fails, the
    code blocks found up to that point are still yielded before the error
    is raised.
    """
    directives = _DIRECTIVES.copy()
    if extra_directives:
        directives.update(extra_directives)

    doc = _parse_document(
        content, directives=directives, path=path, root_prefix=root_prefix
    )

    # The visitor reports code blocks through a callback while ``doc.walk()``
    # is running, so the results are collected in a list and yielded afterwards.
    results: list[CodeBlockInfo] = []

    def callback(  # pylint: disable=too-many-arguments,too-many-positional-arguments
        language: str | None,
        row_offset: int,
        col_offset: int,
        position_exact: bool,
        directly_replacable_in_content: bool,
        content: str,
        node: nodes.literal_block,
    ) -> None:
        results.append(
            CodeBlockInfo(
                language=language,
                row_offset=row_offset,
                col_offset=col_offset,
                position_exact=position_exact,
                directly_replacable_in_content=directly_replacable_in_content,
                content=content,
                # Only report ``antsibull-`` attributes that are not used for
                # internal bookkeeping.
                attributes={
                    key: value
                    for key, value in node.attributes.items()
                    if key not in _SPECIAL_ATTRIBUTES and key.startswith("antsibull-")
                },
            )
        )

    def warn_unknown_block_cb(
        line: int | str,
        col: int,
        node: nodes.literal_block,
    ) -> None:
        if warn_unknown_block:
            warn_unknown_block(line, col, node.rawsource)

    # Process the document
    try:
        visitor = CodeBlockVisitor(doc, content, callback, warn_unknown_block_cb)
        doc.walk(visitor)
    except Exception as exc:
        raise ValueError(f"Cannot process document: {exc}") from exc
    finally:
        # Yield whatever has been collected, also when processing failed;
        # in that case the error propagates once the results are exhausted.
        yield from results
| 392 | + |
| 393 | + |
# Names exported by ``from <module> import *``.
__all__ = ("CodeBlockInfo", "mark_antsibull_code_block", "find_code_blocks")
0 commit comments