Skip to content

Commit b5551e7

Browse files
authored
Split code block finding up into smaller parts to allow reusing the parsed document tree (#14)
* Code block finding: separate RST loading from document processing. * Increase testability, add first basic tests. * Extend tests. * Add changelog fragment.
1 parent b7bd16b commit b5551e7

File tree

5 files changed

+608
-146
lines changed

5 files changed

+608
-146
lines changed
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
minor_changes:
2+
- "Add functionality to parse documents, and to search for code blocks in parsed documents.
3+
This allows performing other operations on the parsed document, instead of having to parse
4+
it multiple times
5+
(https://github.com/ansible-community/antsibull-docutils/pull/14)."

src/antsibull_docutils/rst_code_finder.py

Lines changed: 151 additions & 143 deletions
Original file line numberDiff line numberDiff line change
@@ -10,19 +10,16 @@
1010

1111
from __future__ import annotations
1212

13-
import io
1413
import os
1514
import typing as t
1615
from collections.abc import Mapping
1716
from dataclasses import dataclass
1817

1918
from docutils import nodes
20-
from docutils.core import Publisher
21-
from docutils.io import StringInput
2219
from docutils.parsers.rst import Directive
23-
from docutils.parsers.rst.directives import register_directive
2420
from docutils.parsers.rst.directives import unchanged as directive_param_unchanged
25-
from docutils.utils import Reporter, SystemMessage
21+
22+
from .utils import parse_document
2623

2724
_SPECIAL_ATTRIBUTES = (
2825
"antsibull-code-language",
@@ -99,6 +96,89 @@ def run(self) -> list[nodes.literal_block]:
9996
return [literal]
10097

10198

99+
def _find_indent(content: str) -> int | None:
100+
"""
101+
Given concatenated lines, find the minimum indent if possible.
102+
103+
If all lines consist only out of whitespace (or are empty),
104+
``None`` is returned.
105+
"""
106+
min_indent = None
107+
for line in content.split("\n"):
108+
stripped_line = line.lstrip()
109+
if stripped_line:
110+
indent = len(line) - len(line.lstrip())
111+
if min_indent is None or min_indent > indent:
112+
min_indent = indent
113+
return min_indent
114+
115+
116+
def _find_offset(
    lineno: int, content: str, *, document_content_lines: list[str]
) -> tuple[int, int, bool]:
    """
    Estimate the row/column position of the code ``content`` in the document.

    ``lineno`` is taken to be the line on which the code block starts.
    Scanning begins at ``document_content_lines[lineno]``. The first
    non-blank line encountered is treated as the first line of the code;
    from there on, as many document lines as ``content`` has lines are
    consumed.

    Returns a tuple ``(row_offset, col_offset, position_exact)``:

    * ``row_offset`` is the index of the first non-blank line found. If
      there is none, it is the index following the first blank line, or
      ``lineno`` if no line was scanned at all.
    * ``col_offset`` is the minimum indent of the consumed non-blank
      document lines minus the minimum indent of ``content``, but never
      less than zero.
    * ``position_exact`` is ``True`` if the document had enough lines to
      consume all lines of ``content``.
    """
    remaining = content.count("\n") + 1
    row_offset = lineno
    seen_blank = False
    in_content = False
    doc_indent = None

    for row, text in enumerate(document_content_lines[lineno:], start=lineno):
        is_blank = not text.strip()
        if is_blank:
            # Only the first blank line moves the tentative start row.
            if not seen_blank:
                seen_blank = True
                row_offset = row + 1
        elif not in_content:
            # First non-blank line: this is where the code is assumed to begin.
            in_content = True
            row_offset = row

        if in_content and remaining > 0:
            if not is_blank:
                width = len(text) - len(text.lstrip())
                if doc_indent is None or width < doc_indent:
                    doc_indent = width
            # Blank lines inside the code count as consumed lines, too.
            remaining -= 1
        elif not remaining:
            break

    source_indent = _find_indent(content)
    col_offset = max(0, (doc_indent or 0) - (source_indent or 0))
    return row_offset, col_offset, remaining == 0
153+
154+
155+
def _find_in_code(
156+
row_offset: int,
157+
col_offset: int,
158+
content: str,
159+
*,
160+
document_content_lines: list[str],
161+
) -> bool:
162+
"""
163+
Check whether the code can be found at the given row/col offset in a way
164+
that makes it easy to replace.
165+
166+
That is, it is surrounded only by whitespace.
167+
"""
168+
for index, line in enumerate(content.split("\n")):
169+
if row_offset + index >= len(document_content_lines):
170+
return False
171+
found_line = document_content_lines[row_offset + index]
172+
if found_line[:col_offset].strip():
173+
return False
174+
eol = found_line[col_offset:]
175+
if eol[: len(line)] != line:
176+
return False
177+
if eol[len(line) :].strip():
178+
return False
179+
return True
180+
181+
102182
class CodeBlockVisitor(nodes.SparseNodeVisitor):
103183
"""
104184
Visitor that calls callbacks for all code blocks.
@@ -130,100 +210,31 @@ def visit_error(self, node: nodes.error) -> None:
130210
"""
131211
raise nodes.SkipNode
132212

133-
@staticmethod
134-
def _find_indent(content: str) -> int | None:
135-
"""
136-
Given concatenated lines, find the minimum indent if possible.
137-
138-
If all lines consist only out of whitespace (or are empty),
139-
``None`` is returned.
140-
"""
141-
min_indent = None
142-
for line in content.split("\n"):
143-
stripped_line = line.lstrip()
144-
if stripped_line:
145-
indent = len(line) - len(line.lstrip())
146-
if min_indent is None or min_indent > indent:
147-
min_indent = indent
148-
return min_indent
149-
150-
def _find_offset(self, lineno: int, content: str) -> tuple[int, int, bool]:
151-
"""
152-
Try to identify the row/col offset of the code in ``content`` in the document.
153-
154-
``lineno`` is assumed to be the line where the code-block starts.
155-
This function looks for an empty line, followed by the right pattern of
156-
empty and non-empty lines.
157-
"""
158-
row_offset = lineno
159-
found_empty_line = False
160-
found_content_lines = False
161-
content_lines = content.count("\n") + 1
162-
min_indent = None
163-
for offset, line in enumerate(self.__content_lines[lineno:]):
164-
stripped_line = line.strip()
165-
if not stripped_line:
166-
if not found_empty_line:
167-
row_offset = lineno + offset + 1
168-
found_empty_line = True
169-
elif not found_content_lines:
170-
found_content_lines = True
171-
row_offset = lineno + offset
172-
173-
if found_content_lines and content_lines > 0:
174-
if stripped_line:
175-
indent = len(line) - len(line.lstrip())
176-
if min_indent is None or min_indent > indent:
177-
min_indent = indent
178-
content_lines -= 1
179-
elif not content_lines:
180-
break
181-
182-
min_source_indent = self._find_indent(content)
183-
col_offset = max(0, (min_indent or 0) - (min_source_indent or 0))
184-
return row_offset, col_offset, content_lines == 0
185-
186-
def _find_in_code(self, row_offset: int, col_offset: int, content: str) -> bool:
187-
"""
188-
Check whether the code can be found at the given row/col offset in a way
189-
that makes it easy to replace.
190-
191-
That is, it is surrounded only by whitespace.
192-
"""
193-
for index, line in enumerate(content.split("\n")):
194-
if row_offset + index >= len(self.__content_lines):
195-
return False
196-
found_line = self.__content_lines[row_offset + index]
197-
if found_line[:col_offset].strip():
198-
return False
199-
eol = found_line[col_offset:]
200-
if eol[: len(line)] != line:
201-
return False
202-
if eol[len(line) :].strip():
203-
return False
204-
return True
205-
206213
def visit_literal_block(self, node: nodes.literal_block) -> None:
207214
"""
208215
Visit a code block.
209216
"""
210217
if "antsibull-code-block" not in node.attributes:
211-
if node.attributes["classes"]:
212-
# This could be a `::` block, or something else (unknown)
213-
self.__warn_unknown_block(node.line or "unknown", 0, node)
218+
# This could be a `::` block, or something else (unknown)
219+
self.__warn_unknown_block(node.line or "unknown", 0, node)
214220
raise nodes.SkipNode
215221

216222
language = node.attributes["antsibull-code-language"]
217223
lineno = node.attributes["antsibull-code-lineno"]
218-
row_offset, col_offset, position_exact = self._find_offset(
219-
lineno, node.rawsource
224+
row_offset, col_offset, position_exact = _find_offset(
225+
lineno, node.rawsource, document_content_lines=self.__content_lines
220226
)
221227
found_in_code = False
222228
if position_exact:
223229
# If we think we have the exact position, try to identify the code.
224230
# ``found_in_code`` indicates that it is easy to replace the code,
225231
# and at the same time it's easy to identify it.
226-
found_in_code = self._find_in_code(row_offset, col_offset, node.rawsource)
232+
found_in_code = _find_in_code(
233+
row_offset,
234+
col_offset,
235+
node.rawsource,
236+
document_content_lines=self.__content_lines,
237+
)
227238
if not found_in_code:
228239
position_exact = False
229240
if not found_in_code:
@@ -254,51 +265,6 @@ def visit_literal_block(self, node: nodes.literal_block) -> None:
254265
}
255266

256267

257-
def _parse_document(
258-
content: str,
259-
*,
260-
path: str | os.PathLike[str] | None,
261-
root_prefix: str | os.PathLike[str] | None,
262-
directives: dict[str, t.Type[Directive]],
263-
) -> nodes.document:
264-
# pylint: disable-next=fixme
265-
# TODO: figure out how to register a directive only temporarily
266-
for directive_name, directive_class in directives.items():
267-
register_directive(directive_name, directive_class)
268-
269-
# We create a Publisher only to have a mechanism which gives us the settings object.
270-
# Doing this more explicit is a bad idea since the classes used are deprecated and will
271-
# eventually get replaced. Publisher.get_settings() looks like a stable enough API that
272-
# we can 'just use'.
273-
publisher = Publisher(source_class=StringInput)
274-
publisher.set_components("standalone", "restructuredtext", "pseudoxml")
275-
override = {
276-
"root_prefix": str(root_prefix),
277-
"input_encoding": "utf-8",
278-
"file_insertion_enabled": False,
279-
"raw_enabled": False,
280-
"_disable_config": True,
281-
"report_level": Reporter.ERROR_LEVEL,
282-
"warning_stream": io.StringIO(),
283-
}
284-
publisher.process_programmatic_settings(None, override, None)
285-
publisher.set_source(content, str(path))
286-
287-
# Parse the document
288-
try:
289-
# mypy gives errors for the next line, but this is literally what docutils itself
290-
# is also doing. So we're going to ignore this error...
291-
return publisher.reader.read(
292-
publisher.source,
293-
publisher.parser,
294-
publisher.settings, # type: ignore
295-
)
296-
except SystemMessage as exc:
297-
raise ValueError(f"Cannot parse document: {exc}") from exc
298-
except Exception as exc:
299-
raise ValueError(f"Unexpected error while parsing document: {exc}") from exc
300-
301-
302268
@dataclass
303269
class CodeBlockInfo:
304270
"""
@@ -329,25 +295,35 @@ class CodeBlockInfo:
329295
attributes: dict[str, t.Any]
330296

331297

332-
def find_code_blocks(
333-
content: str,
298+
def get_code_block_directives(
334299
*,
335-
path: str | os.PathLike[str] | None = None,
336-
root_prefix: str | os.PathLike[str] | None = None,
337300
extra_directives: Mapping[str, t.Type[Directive]] | None = None,
338-
warn_unknown_block: t.Callable[[int | str, int, str], None] | None = None,
339-
) -> t.Generator[CodeBlockInfo]:
301+
) -> Mapping[str, t.Type[Directive]]:
340302
"""
341-
Given a RST document, finds all code blocks.
303+
Return directives needed to find all code blocks.
304+
305+
You can pass an optional mapping with directives that will be added
306+
to the result.
342307
"""
343308
directives = _DIRECTIVES.copy()
344309
if extra_directives:
345310
directives.update(extra_directives)
311+
return directives
346312

347-
doc = _parse_document(
348-
content, directives=directives, path=path, root_prefix=root_prefix
349-
)
350313

314+
def find_code_blocks_in_document(
315+
*,
316+
document: nodes.document,
317+
content: str,
318+
warn_unknown_block: t.Callable[[int | str, int, str], None] | None = None,
319+
) -> t.Generator[CodeBlockInfo]:
320+
"""
321+
Given a parsed RST document, finds all code blocks.
322+
323+
All code blocks must be parsed with special directives
324+
(see ``get_code_block_directives()``) that have appropriate metadata
325+
registered with ``mark_antsibull_code_block()``.
326+
"""
351327
# If someone can figure out how to yield from a sub-function, we can avoid
352328
# using this ugly list
353329
results = []
@@ -387,12 +363,44 @@ def warn_unknown_block_cb(
387363

388364
# Process the document
389365
try:
390-
visitor = CodeBlockVisitor(doc, content, callback, warn_unknown_block_cb)
391-
doc.walk(visitor)
392-
except Exception as exc:
393-
raise ValueError(f"Cannot process document: {exc}") from exc
366+
visitor = CodeBlockVisitor(document, content, callback, warn_unknown_block_cb)
367+
document.walk(visitor)
368+
except Exception as exc: # pragma: no cover
369+
raise ValueError(f"Cannot process document: {exc}") from exc # pragma: no cover
394370
finally:
395371
yield from results
396372

397373

374+
def find_code_blocks(
    content: str,
    *,
    path: str | os.PathLike[str] | None = None,
    root_prefix: str | os.PathLike[str] | None = None,
    extra_directives: Mapping[str, t.Type[Directive]] | None = None,
    warn_unknown_block: t.Callable[[int | str, int, str], None] | None = None,
) -> t.Generator[CodeBlockInfo]:
    """
    Parse the RST document ``content`` and yield all code blocks found in it.

    The document is parsed with the directives returned by
    ``get_code_block_directives()``, and the parsed document is then searched
    with ``find_code_blocks_in_document()``.

    To add support for your own types of code blocks, pass them as
    ``extra_directives``. Use ``mark_antsibull_code_block()`` to mark them
    so that they are found by ``find_code_blocks()``.
    """
    document = parse_document(
        content,
        parser_name="restructuredtext",
        path=path,
        root_prefix=root_prefix,
        rst_directives=get_code_block_directives(extra_directives=extra_directives),
    )

    yield from find_code_blocks_in_document(
        document=document,
        content=content,
        warn_unknown_block=warn_unknown_block,
    )
404+
405+
398406
# Public API of this module. ``find_code_blocks_in_document`` and
# ``get_code_block_directives`` are public (no leading underscore) building
# blocks of ``find_code_blocks`` and are therefore exported as well.
__all__ = (
    "CodeBlockInfo",
    "mark_antsibull_code_block",
    "find_code_blocks",
    "find_code_blocks_in_document",
    "get_code_block_directives",
)

0 commit comments

Comments
 (0)