Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions changelogs/fragments/12-rst-find-code.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
minor_changes:
- "Add helper ``antsibull_docutils.rst_code_finder.find_code_blocks()`` that finds code blocks in RST files.
This is useful for linters and also for code that wants to modify a code block's contents
(https://github.com/ansible-community/antsibull-docutils/pull/12)."
394 changes: 394 additions & 0 deletions src/antsibull_docutils/rst_code_finder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,394 @@
# Author: Felix Fontein <[email protected]>
# GNU General Public License v3.0+ (see LICENSES/GPL-3.0-or-later.txt or
# https://www.gnu.org/licenses/gpl-3.0.txt)
# SPDX-License-Identifier: GPL-3.0-or-later
# SPDX-FileCopyrightText: 2024, Ansible Project

"""
Find code blocks in RST files.
"""

from __future__ import annotations

import io
import os
import typing as t
from collections.abc import Mapping
from dataclasses import dataclass

from docutils import nodes
from docutils.core import Publisher
from docutils.io import StringInput
from docutils.parsers.rst import Directive
from docutils.parsers.rst.directives import register_directive
from docutils.parsers.rst.directives import unchanged as directive_param_unchanged
from docutils.utils import Reporter, SystemMessage

# Names of the node attributes that ``mark_antsibull_code_block()`` always sets
# on a literal block. ``find_code_blocks()`` leaves these keys out of
# ``CodeBlockInfo.attributes``.
_SPECIAL_ATTRIBUTES = (
    "antsibull-code-language",
    "antsibull-code-block",
    "antsibull-code-lineno",
)


class IgnoreDirective(Directive):
    """
    Directive that accepts a content block and contributes nothing to the document.
    """

    # The directive may be followed by content; ``run()`` never looks at it.
    has_content = True

    def run(self) -> list:
        """
        Return an empty node list, which drops the directive's content.
        """
        return []


def mark_antsibull_code_block(
    node: nodes.literal_block,
    *,
    language: str | None,
    line: int,
    other: dict[str, t.Any] | None = None,
) -> None:
    """
    Tag a literal block so that it is recognized as an Antsibull code block.

    The language and the line number are stored in the node's attributes, together
    with a marker flag. Every entry ``key: value`` of ``other`` is additionally
    stored in the node's attributes under the name ``antsibull-other-{key}``.
    """
    node["antsibull-code-language"] = language
    node["antsibull-code-block"] = True
    node["antsibull-code-lineno"] = line

    # ``None`` and an empty mapping both mean "no extra attributes".
    extra_attributes = other if other else {}
    for name, attribute_value in extra_attributes.items():
        node[f"antsibull-other-{name}"] = attribute_value


class CodeBlockDirective(Directive):
    """
    Fake code block directive.

    Acts similar to Sphinx's code block directives, except that it calls
    ``mark_antsibull_code_block()`` on the generated literal blocks.

    The single optional argument is taken as the code's language. All options
    are accepted but their values are not interpreted.
    """

    has_content = True
    optional_arguments = 1

    # These are all options Sphinx allows for code blocks, plus ``number-lines``,
    # which the ``code`` directive of docutils and Sphinx accepts.
    # We need to have them here so that docutils successfully parses this extension;
    # a directive using an option missing from this table is rejected by docutils
    # and its code block would never be reported.
    option_spec = {
        "caption": directive_param_unchanged,
        "class": directive_param_unchanged,
        "dedent": directive_param_unchanged,
        "emphasize-lines": directive_param_unchanged,
        "name": directive_param_unchanged,
        "force": directive_param_unchanged,
        "linenos": directive_param_unchanged,
        "lineno-start": directive_param_unchanged,
        "number-lines": directive_param_unchanged,
    }

    def run(self) -> list[nodes.literal_block]:
        """
        Create one literal block from the directive's content and mark it.

        The block gets the class ``code-block`` and is marked with the language
        (``None`` if no argument was given) and the directive's line number.
        """
        code = "\n".join(self.content)
        literal = nodes.literal_block(code, code)
        literal["classes"].append("code-block")
        mark_antsibull_code_block(
            literal,
            language=self.arguments[0] if self.arguments else None,
            line=self.lineno,
        )
        return [literal]


class CodeBlockVisitor(nodes.SparseNodeVisitor):
    """
    Document visitor that reports every marked code block to a callback.

    Literal blocks carrying the ``antsibull-code-block`` attribute are passed to
    ``callback``. Other literal blocks that have classes are passed to
    ``warn_unknown_block``. System messages and error nodes are skipped.
    """

    def __init__(
        self,
        document: nodes.document,
        content: str,
        callback: t.Callable[
            [str, int, int, bool, bool, str, nodes.literal_block], None
        ],
        warn_unknown_block: t.Callable[[int | str, int, nodes.literal_block], None],
    ):
        """
        Store the callbacks and split ``content`` (the document's text) into lines.

        ``callback`` receives: language, row offset, column offset, whether the
        position is exact, whether the code was found verbatim in the document,
        the code (right-stripped, with one trailing newline), and the node.

        ``warn_unknown_block`` receives: the node's line (or ``"unknown"``),
        the column ``0``, and the node.
        """
        super().__init__(document)
        self.__content_lines = content.splitlines()
        self.__callback = callback
        self.__warn_unknown_block = warn_unknown_block

    def visit_system_message(self, node: nodes.system_message) -> None:
        """
        Skip system messages together with their children.
        """
        raise nodes.SkipNode

    def visit_error(self, node: nodes.error) -> None:
        """
        Skip error nodes together with their children.
        """
        raise nodes.SkipNode

    @staticmethod
    def _find_indent(content: str) -> int | None:
        """
        Return the smallest indentation among the non-blank lines of ``content``.

        Lines are separated by ``\\n``. ``None`` is returned if no line has
        non-whitespace content.
        """
        indents = [
            len(text) - len(text.lstrip())
            for text in content.split("\n")
            if text.lstrip()
        ]
        return min(indents) if indents else None

    def _find_offset(self, lineno: int, content: str) -> tuple[int, int, bool]:
        """
        Estimate where the code ``content`` is located in the document.

        The document's lines are scanned starting at index ``lineno``. The row
        offset is first moved behind the first blank line; the first non-blank
        line marks the begin of the consumed lines. From there on, as many
        document lines are consumed as ``content`` has lines.

        Returns the tuple ``(row_offset, col_offset, exact)``. ``row_offset`` is
        an index into the document's lines, ``col_offset`` is the difference
        between the smallest indent of the consumed document lines and the
        smallest indent of ``content`` (never negative), and ``exact`` tells
        whether all lines of ``content`` could be consumed.
        """
        start_row = lineno
        seen_blank = False
        in_content = False
        remaining = content.count("\n") + 1
        smallest_indent = None
        for row, text in enumerate(self.__content_lines[lineno:], start=lineno):
            is_blank = not text.strip()
            if is_blank:
                if not seen_blank:
                    # Only the first blank line moves the start row
                    seen_blank = True
                    start_row = row + 1
            elif not in_content:
                in_content = True
                start_row = row

            if in_content and remaining > 0:
                if not is_blank:
                    width = len(text) - len(text.lstrip())
                    if smallest_indent is None or width < smallest_indent:
                        smallest_indent = width
                remaining -= 1
            elif not remaining:
                # All lines of the code have been consumed
                break

        code_indent = self._find_indent(content)
        col_offset = max(0, (smallest_indent or 0) - (code_indent or 0))
        return start_row, col_offset, remaining == 0

    def _find_in_code(self, row_offset: int, col_offset: int, content: str) -> bool:
        """
        Tell whether ``content`` appears verbatim at the given row/col offset.

        Every line of ``content`` must be found in the corresponding document
        line starting at column ``col_offset``, with nothing but whitespace
        before and after it. This makes the code easy to replace.
        """
        document_lines = self.__content_lines
        for row, expected in enumerate(content.split("\n"), start=row_offset):
            if row >= len(document_lines):
                return False
            actual = document_lines[row]
            prefix, rest = actual[:col_offset], actual[col_offset:]
            if prefix.strip():
                return False
            if not rest.startswith(expected):
                return False
            if rest[len(expected) :].strip():
                return False
        return True

    def visit_literal_block(self, node: nodes.literal_block) -> None:
        """
        Report a literal block to the appropriate callback, then skip it.
        """
        attributes = node.attributes
        if "antsibull-code-block" not in attributes:
            if attributes["classes"]:
                # This could be a `::` block, or something else (unknown)
                self.__warn_unknown_block(node.line or "unknown", 0, node)
            raise nodes.SkipNode

        language = attributes["antsibull-code-language"]
        lineno = attributes["antsibull-code-lineno"]
        source = node.rawsource
        row_offset, col_offset, position_exact = self._find_offset(lineno, source)

        # If we think we have the exact position, try to identify the code.
        # ``found_in_code`` indicates that it is easy to replace the code,
        # and at the same time it's easy to identify it. The position only
        # counts as exact if the code was actually found there.
        found_in_code = position_exact and self._find_in_code(
            row_offset, col_offset, source
        )
        position_exact = found_in_code

        # If the code was not found 'the easy way', this could be because
        # it is inside a table.
        # pylint: disable-next=fixme
        # TODO search for the content, f.ex. in tables

        self.__callback(
            language,
            row_offset,
            col_offset,
            position_exact,
            found_in_code,
            source.rstrip() + "\n",
            node,
        )
        raise nodes.SkipNode


# Directives that ``find_code_blocks()`` always registers before parsing.
# Entries passed as ``extra_directives`` are merged on top of a copy of this mapping.
_DIRECTIVES: dict[str, t.Type[Directive]] = {
    # Replace Sphinx code blocks with our code block directive:
    "code": CodeBlockDirective,
    "code-block": CodeBlockDirective,
    "sourcecode": CodeBlockDirective,
    # The following docutils directives are better ignored (their content is dropped):
    "parsed-literal": IgnoreDirective,
}


def _parse_document(
    content: str,
    *,
    path: str | os.PathLike[str] | None,
    root_prefix: str | os.PathLike[str] | None,
    directives: dict[str, t.Type[Directive]],
) -> nodes.document:
    """
    Parse the RST text ``content`` with docutils and return the document tree.

    All entries of ``directives`` are registered with docutils first; the
    registration is not undone afterwards. ``path`` is passed on as the source
    path and ``root_prefix`` as the ``root_prefix`` setting. File insertion and
    raw content are disabled, configuration files are not read, and warnings
    are written to a throwaway stream.

    Raises ``ValueError`` if reading the document fails.
    """
    # pylint: disable-next=fixme
    # TODO: figure out how to register a directive only temporarily
    for name, directive in directives.items():
        register_directive(name, directive)

    # The Publisher is only created as a mechanism that provides the settings object.
    # Building the settings more explicitly is a bad idea, since the classes involved
    # are deprecated and will eventually get replaced. Publisher.get_settings() looks
    # like a stable enough API that we can 'just use'.
    publisher = Publisher(source_class=StringInput)
    publisher.set_components("standalone", "restructuredtext", "pseudoxml")
    settings_overrides = {
        "root_prefix": root_prefix,
        "input_encoding": "utf-8",
        "file_insertion_enabled": False,
        "raw_enabled": False,
        "_disable_config": True,
        "report_level": Reporter.ERROR_LEVEL,
        "warning_stream": io.StringIO(),
    }
    publisher.process_programmatic_settings(None, settings_overrides, None)
    publisher.set_source(content, path)

    # Parse the document
    try:
        document = publisher.reader.read(
            publisher.source, publisher.parser, publisher.settings
        )
    except SystemMessage as exc:
        raise ValueError(f"Cannot parse document: {exc}") from exc
    except Exception as exc:
        raise ValueError(f"Unexpected error while parsing document: {exc}") from exc
    return document


@dataclass
class CodeBlockInfo:
    """
    Description of one code block found by ``find_code_blocks()``.
    """

    # Language of the code block; ``None`` if it is not known
    language: str | None

    # Row (line) and column offset of the code block
    row_offset: int
    col_offset: int

    # ``True`` if row_offset/col_offset are exact.
    # Otherwise the position is only approximate, and col_offset is often 0.
    position_exact: bool

    # ``True`` if the code block's contents appear as-is in the RST file,
    # preceded only by whitespace indentation and possibly followed by
    # trailing whitespace
    directly_replacable_in_content: bool

    # Contents of the code block
    content: str

    # Attributes of the code block whose names start with ``antsibull-``.
    # The special attributes ``find_code_blocks()`` uses internally to keep
    # track of certain properties are not included.
    attributes: dict[str, t.Any]


def find_code_blocks(
    content: str,
    *,
    path: str | os.PathLike[str] | None = None,
    root_prefix: str | os.PathLike[str] | None = None,
    extra_directives: Mapping[str, t.Type[Directive]] | None = None,
    warn_unknown_block: t.Callable[[int | str, int, str], None] | None = None,
) -> t.Generator[CodeBlockInfo, None, None]:
    """
    Given a RST document, finds all code blocks.

    ``content`` is the RST document's text. ``path`` and ``root_prefix`` are
    passed on to the docutils parser as source path and ``root_prefix`` setting.

    ``extra_directives`` is merged on top of the default directives; all of them
    are registered with docutils before parsing.

    ``warn_unknown_block``, if provided, is called with the line (or ``"unknown"``),
    the column ``0``, and the raw source of every literal block that has classes
    but was not marked as a code block.

    Yields one ``CodeBlockInfo`` per code block. Since this is a generator,
    nothing happens before the first item is requested.

    Raises ``ValueError`` if the document cannot be parsed or processed. If
    processing fails, the code blocks found up to that point are yielded first,
    and the error is raised afterwards.
    """
    directives = _DIRECTIVES.copy()
    if extra_directives:
        directives.update(extra_directives)

    doc = _parse_document(
        content, directives=directives, path=path, root_prefix=root_prefix
    )

    # The visitor reports code blocks through a callback, so they are
    # collected in a list before they can be yielded.
    results: list[CodeBlockInfo] = []

    def callback(  # pylint: disable=too-many-arguments,too-many-positional-arguments
        language: str | None,
        row_offset: int,
        col_offset: int,
        position_exact: bool,
        directly_replacable_in_content: bool,
        code: str,
        node: nodes.literal_block,
    ) -> None:
        results.append(
            CodeBlockInfo(
                language=language,
                row_offset=row_offset,
                col_offset=col_offset,
                position_exact=position_exact,
                directly_replacable_in_content=directly_replacable_in_content,
                content=code,
                attributes={
                    key: value
                    for key, value in node.attributes.items()
                    if key not in _SPECIAL_ATTRIBUTES and key.startswith("antsibull-")
                },
            )
        )

    def warn_unknown_block_cb(
        line: int | str,
        col: int,
        node: nodes.literal_block,
    ) -> None:
        if warn_unknown_block:
            warn_unknown_block(line, col, node.rawsource)

    # Process the document. An error is remembered instead of yielding from a
    # ``finally`` clause: that way only regular errors are deferred until the
    # partial results have been delivered, while KeyboardInterrupt and
    # SystemExit propagate immediately.
    failure: Exception | None = None
    try:
        visitor = CodeBlockVisitor(doc, content, callback, warn_unknown_block_cb)
        doc.walk(visitor)
    except Exception as exc:  # pylint: disable=broad-exception-caught
        failure = exc

    yield from results

    if failure is not None:
        raise ValueError(f"Cannot process document: {failure}") from failure


# Names exported by ``from antsibull_docutils.rst_code_finder import *``.
__all__ = ("CodeBlockInfo", "mark_antsibull_code_block", "find_code_blocks")
Loading