Skip to content

Commit e3570da

Browse files
authored
Add utility to find code blocks in RST files (#12)
* Add utility to find code blocks in RST files. * Add more tests. Add exactness result. * Fix typo, make pylint shut up about TODO. * Add changelog fragment. * Fix tests.
1 parent 9214ee7 commit e3570da

File tree

3 files changed

+642
-0
lines changed

3 files changed

+642
-0
lines changed
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
minor_changes:
2+
- "Add helper ``antsibull_docutils.rst_code_finder.find_code_blocks()`` that allows finding code blocks in RST files.
3+
This is useful for linters and also code that wants to modify the code block's contents.
4+
(https://github.com/ansible-community/antsibull-docutils/pull/12)."
Lines changed: 394 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,394 @@
1+
# Author: Felix Fontein <[email protected]>
2+
# GNU General Public License v3.0+ (see LICENSES/GPL-3.0-or-later.txt or
3+
# https://www.gnu.org/licenses/gpl-3.0.txt)
4+
# SPDX-License-Identifier: GPL-3.0-or-later
5+
# SPDX-FileCopyrightText: 2024, Ansible Project
6+
7+
"""
8+
Find code blocks in RST files.
9+
"""
10+
11+
from __future__ import annotations
12+
13+
import io
14+
import os
15+
import typing as t
16+
from collections.abc import Mapping
17+
from dataclasses import dataclass
18+
19+
from docutils import nodes
20+
from docutils.core import Publisher
21+
from docutils.io import StringInput
22+
from docutils.parsers.rst import Directive
23+
from docutils.parsers.rst.directives import register_directive
24+
from docutils.parsers.rst.directives import unchanged as directive_param_unchanged
25+
from docutils.utils import Reporter, SystemMessage
26+
27+
# Node attributes that ``mark_antsibull_code_block()`` sets for bookkeeping.
# ``find_code_blocks()`` leaves them out of the attributes it reports.
_SPECIAL_ATTRIBUTES = (
    "antsibull-code-language",
    "antsibull-code-block",
    "antsibull-code-lineno",
)
32+
33+
34+
class IgnoreDirective(Directive):
    """
    Directive that accepts content, but does not produce any nodes for it.
    """

    # The directive may have content; ``run()`` simply discards it.
    has_content = True

    def run(self) -> list:
        """
        Return an empty list of nodes.
        """
        return []
43+
44+
45+
def mark_antsibull_code_block(
    node: nodes.literal_block,
    *,
    language: str | None,
    line: int,
    other: dict[str, t.Any] | None = None,
) -> None:
    """
    Tag a literal block as an Antsibull code block.

    The language and the line number are stored in the node's attributes, together
    with a flag that identifies the node as an Antsibull code block.

    Every entry ``key: value`` of ``other`` is additionally stored as the node
    attribute ``antsibull-other-{key}``.
    """
    for attribute, value in (
        ("antsibull-code-language", language),
        ("antsibull-code-block", True),
        ("antsibull-code-lineno", line),
    ):
        node[attribute] = value
    for key, value in (other or {}).items():
        node[f"antsibull-other-{key}"] = value
64+
65+
66+
class CodeBlockDirective(Directive):
    """
    Fake code block directive.

    Acts similar to Sphinx's code block directives, except that it calls
    ``mark_antsibull_code_block()`` on the generated literal blocks.
    """

    has_content = True
    # The only (optional) argument is the language.
    optional_arguments = 1

    # These are all options Sphinx allows for code blocks, plus ``number-lines``
    # which the ``code`` directive accepts.
    # We need to have them here so that docutils successfully parses this extension:
    # a directive with an option that is not listed here is rejected by docutils,
    # and its code block would not be found. The option values themselves are
    # not used.
    option_spec = {
        "caption": directive_param_unchanged,
        "class": directive_param_unchanged,
        "dedent": directive_param_unchanged,
        "emphasize-lines": directive_param_unchanged,
        "name": directive_param_unchanged,
        "force": directive_param_unchanged,
        "linenos": directive_param_unchanged,
        "lineno-start": directive_param_unchanged,
        "number-lines": directive_param_unchanged,
    }

    def run(self) -> list[nodes.literal_block]:
        """
        Create a literal block from the directive's content and mark it.

        The first argument (if present) is used as the language, and the
        directive's line number is recorded as the code block's line.
        """
        code = "\n".join(self.content)
        literal = nodes.literal_block(code, code)
        literal["classes"].append("code-block")
        mark_antsibull_code_block(
            literal,
            language=self.arguments[0] if self.arguments else None,
            line=self.lineno,
        )
        return [literal]
100+
101+
102+
class CodeBlockVisitor(nodes.SparseNodeVisitor):
    """
    Visitor that reports every marked code block through a callback.

    Literal blocks that have not been marked as Antsibull code blocks are skipped.
    If such a block has classes set, it is reported through ``warn_unknown_block``.
    """

    def __init__(
        self,
        document: nodes.document,
        content: str,
        callback: t.Callable[
            [str, int, int, bool, bool, str, nodes.literal_block], None
        ],
        warn_unknown_block: t.Callable[[int | str, int, nodes.literal_block], None],
    ):
        """
        Create the visitor.

        ``content`` is the text in which the code blocks are searched for; it is
        expected to be the source of ``document``.

        ``callback`` is called for every marked code block with the language, the
        row offset, the column offset, whether the position is exact, whether the
        code was found as-is at that position, the code (with trailing whitespace
        replaced by a single newline), and the node itself.

        ``warn_unknown_block`` is called with a line (or ``"unknown"``), a column,
        and the node.
        """
        super().__init__(document)
        self.__content_lines = content.splitlines()
        self.__callback = callback
        self.__warn_unknown_block = warn_unknown_block

    def visit_system_message(self, node: nodes.system_message) -> None:
        """
        Skip system messages.
        """
        raise nodes.SkipNode

    def visit_error(self, node: nodes.error) -> None:
        """
        Skip errors.
        """
        raise nodes.SkipNode

    @staticmethod
    def _find_indent(content: str) -> int | None:
        """
        Determine the smallest indentation of all non-blank lines in ``content``.

        Returns ``None`` if every line is empty or consists only of whitespace.
        """
        indents = [
            len(line) - len(line.lstrip())
            for line in content.split("\n")
            if line.strip()
        ]
        return min(indents) if indents else None

    def _find_offset(self, lineno: int, content: str) -> tuple[int, int, bool]:
        """
        Try to determine the row/col offset of the code ``content`` in the document.

        ``lineno`` is assumed to be the line where the code-block starts.
        Starting there, the document is scanned for an empty line and for
        non-empty lines, and as many lines as ``content`` has are consumed.

        Returns the row offset, the column offset, and whether as many lines as
        ``content`` has could be consumed.
        """
        row_offset = lineno
        seen_empty_line = False
        in_content = False
        remaining = content.count("\n") + 1
        document_indent = None
        for index, raw_line in enumerate(self.__content_lines[lineno:], start=lineno):
            is_blank = not raw_line.strip()
            if is_blank:
                if not seen_empty_line:
                    # The first empty line moves the start to the line following it.
                    row_offset = index + 1
                    seen_empty_line = True
            elif not in_content:
                # The first non-empty line starts the counting of lines.
                in_content = True
                row_offset = index

            if in_content and remaining > 0:
                if not is_blank:
                    indent = len(raw_line) - len(raw_line.lstrip())
                    if document_indent is None or indent < document_indent:
                        document_indent = indent
                remaining -= 1
            elif not remaining:
                break

        # The column offset is the difference between the indentation found in the
        # document and the indentation that is part of the code itself.
        source_indent = self._find_indent(content)
        col_offset = max(0, (document_indent or 0) - (source_indent or 0))
        return row_offset, col_offset, remaining == 0

    def _find_in_code(self, row_offset: int, col_offset: int, content: str) -> bool:
        """
        Check whether ``content`` is present at the given row/col offset in a way
        that makes it easy to replace.

        This is the case if every line of ``content`` appears in the corresponding
        document line at ``col_offset``, with nothing but whitespace before and
        after it.
        """
        document_lines = self.__content_lines
        for row, expected in enumerate(content.split("\n"), start=row_offset):
            if row >= len(document_lines):
                return False
            actual = document_lines[row]
            before = actual[:col_offset]
            rest = actual[col_offset:]
            if before.strip() or not rest.startswith(expected):
                return False
            if rest[len(expected) :].strip():
                return False
        return True

    def visit_literal_block(self, node: nodes.literal_block) -> None:
        """
        Handle a literal block.

        Marked code blocks are located in the content and passed on to the callback.
        The node's children are never visited.
        """
        attributes = node.attributes
        if "antsibull-code-block" not in attributes:
            if attributes["classes"]:
                # Not one of our code blocks, but it has classes set:
                # report it as an unknown block.
                self.__warn_unknown_block(node.line or "unknown", 0, node)
            raise nodes.SkipNode

        language = attributes["antsibull-code-language"]
        lineno = attributes["antsibull-code-lineno"]
        source = node.rawsource
        row_offset, col_offset, position_exact = self._find_offset(lineno, source)

        # Only if the position seems to be exact we try to identify the code there.
        # ``found_in_code`` indicates that the code is both easy to identify
        # and easy to replace.
        found_in_code = position_exact and self._find_in_code(
            row_offset, col_offset, source
        )
        # If the code cannot be found at the position, the position is not exact.
        # This could be because the code is inside a table.
        position_exact = found_in_code
        # pylint: disable-next=fixme
        # TODO search for the content, f.ex. in tables

        self.__callback(
            language,
            row_offset,
            col_offset,
            position_exact,
            found_in_code,
            source.rstrip() + "\n",
            node,
        )
        raise nodes.SkipNode
245+
246+
247+
# Directives that ``find_code_blocks()`` always registers before parsing.
_DIRECTIVES: dict[str, t.Type[Directive]] = {
    # Code blocks are handled by our own directive, which marks the literal
    # blocks it creates:
    "code": CodeBlockDirective,
    "code-block": CodeBlockDirective,
    "sourcecode": CodeBlockDirective,
    # The content of the following docutils directive is better ignored:
    "parsed-literal": IgnoreDirective,
}
255+
256+
257+
def _parse_document(
    content: str,
    *,
    path: str | os.PathLike[str] | None,
    root_prefix: str | os.PathLike[str] | None,
    directives: dict[str, t.Type[Directive]],
) -> nodes.document:
    """
    Parse the RST document ``content`` and return the resulting docutils document.

    All ``directives`` are registered with docutils before parsing; the
    registration is not undone afterwards. ``path`` is passed on to docutils as
    the source path, and ``root_prefix`` as the ``root_prefix`` setting.

    Raises ``ValueError`` if parsing fails.
    """
    # pylint: disable-next=fixme
    # TODO: figure out how to register a directive only temporarily
    for name, directive in directives.items():
        register_directive(name, directive)

    # The Publisher is only created because it provides the settings object.
    # Creating the settings more explicitly is a bad idea, since the classes needed
    # for that are deprecated and will eventually get replaced.
    # Publisher.get_settings() looks like a stable enough API that we can 'just use'.
    publisher = Publisher(source_class=StringInput)
    publisher.set_components("standalone", "restructuredtext", "pseudoxml")
    publisher.process_programmatic_settings(
        None,
        {
            "root_prefix": root_prefix,
            "input_encoding": "utf-8",
            "file_insertion_enabled": False,
            "raw_enabled": False,
            "_disable_config": True,
            "report_level": Reporter.ERROR_LEVEL,
            "warning_stream": io.StringIO(),
        },
        None,
    )
    publisher.set_source(content, path)

    # Parse the document
    try:
        document = publisher.reader.read(
            publisher.source, publisher.parser, publisher.settings
        )
    except SystemMessage as exc:
        raise ValueError(f"Cannot parse document: {exc}") from exc
    except Exception as exc:
        raise ValueError(f"Unexpected error while parsing document: {exc}") from exc
    return document
296+
297+
298+
@dataclass
class CodeBlockInfo:
    """
    Describes a single code block found in a RST document.
    """

    # Language of the code block; ``None`` if it is not known.
    language: str | None

    # Line and column offset of the code block.
    row_offset: int
    col_offset: int

    # ``True`` if row_offset/col_offset are exact.
    # Otherwise the position is only approximate, and col_offset is often 0.
    position_exact: bool

    # ``True`` if the code block's contents appear as-is in the RST file,
    # only indented by whitespace, and potentially followed by trailing whitespace.
    directly_replacable_in_content: bool

    # Contents of the code block.
    content: str

    # Attributes of the code block whose names start with ``antsibull-``.
    # The special attributes ``find_code_blocks()`` uses to keep track of
    # certain properties are not included.
    attributes: dict[str, t.Any]
326+
327+
328+
def find_code_blocks(
    content: str,
    *,
    path: str | os.PathLike[str] | None = None,
    root_prefix: str | os.PathLike[str] | None = None,
    extra_directives: Mapping[str, t.Type[Directive]] | None = None,
    warn_unknown_block: t.Callable[[int | str, int, str], None] | None = None,
) -> t.Generator[CodeBlockInfo]:
    """
    Given a RST document, find all code blocks and yield them as ``CodeBlockInfo``.

    ``extra_directives`` are registered in addition to the default directives and
    take precedence over them. If ``warn_unknown_block`` is provided, it is called
    with a line (or ``"unknown"``), a column, and the raw source of every literal
    block that has classes set but has not been marked as an Antsibull code block.

    Raises ``ValueError`` if the document cannot be parsed or processed. If
    processing fails, the code blocks collected up to that point are still yielded
    before the error is raised.
    """
    directives = dict(_DIRECTIVES)
    if extra_directives:
        directives.update(extra_directives)

    doc = _parse_document(
        content, directives=directives, path=path, root_prefix=root_prefix
    )

    # The visitor reports code blocks through a callback, from which we cannot
    # yield directly. Therefore the code blocks are collected in a list first.
    collected = []

    def record_block(  # pylint: disable=too-many-arguments,too-many-positional-arguments
        language: str,
        row_offset: int,
        col_offset: int,
        position_exact: bool,
        directly_replacable_in_content: bool,
        block_content: str,
        node: nodes.literal_block,
    ) -> None:
        # Only pass on ``antsibull-`` attributes that are not used for bookkeeping.
        public_attributes = {
            key: value
            for key, value in node.attributes.items()
            if key not in _SPECIAL_ATTRIBUTES and key.startswith("antsibull-")
        }
        collected.append(
            CodeBlockInfo(
                language=language,
                row_offset=row_offset,
                col_offset=col_offset,
                position_exact=position_exact,
                directly_replacable_in_content=directly_replacable_in_content,
                content=block_content,
                attributes=public_attributes,
            )
        )

    def report_unknown_block(
        line: int | str,
        col: int,
        node: nodes.literal_block,
    ) -> None:
        if warn_unknown_block:
            warn_unknown_block(line, col, node.rawsource)

    # Process the document
    try:
        doc.walk(CodeBlockVisitor(doc, content, record_block, report_unknown_block))
    except Exception as exc:
        raise ValueError(f"Cannot process document: {exc}") from exc
    finally:
        yield from collected
392+
393+
394+
# Public API of this module.
__all__ = (
    "CodeBlockInfo",
    "mark_antsibull_code_block",
    "find_code_blocks",
)

0 commit comments

Comments
 (0)