|
10 | 10 |
|
11 | 11 | from __future__ import annotations
|
12 | 12 |
|
13 |
| -import io |
14 | 13 | import os
|
15 | 14 | import typing as t
|
16 | 15 | from collections.abc import Mapping
|
17 | 16 | from dataclasses import dataclass
|
18 | 17 |
|
19 | 18 | from docutils import nodes
|
20 |
| -from docutils.core import Publisher |
21 |
| -from docutils.io import StringInput |
22 | 19 | from docutils.parsers.rst import Directive
|
23 |
| -from docutils.parsers.rst.directives import register_directive |
24 | 20 | from docutils.parsers.rst.directives import unchanged as directive_param_unchanged
|
25 |
| -from docutils.utils import Reporter, SystemMessage |
| 21 | + |
| 22 | +from .utils import parse_document |
26 | 23 |
|
27 | 24 | _SPECIAL_ATTRIBUTES = (
|
28 | 25 | "antsibull-code-language",
|
@@ -99,6 +96,89 @@ def run(self) -> list[nodes.literal_block]:
|
99 | 96 | return [literal]
|
100 | 97 |
|
101 | 98 |
|
| 99 | +def _find_indent(content: str) -> int | None: |
| 100 | + """ |
| 101 | + Given concatenated lines, find the minimum indent if possible. |
| 102 | +
|
| 103 | + If all lines consist only out of whitespace (or are empty), |
| 104 | + ``None`` is returned. |
| 105 | + """ |
| 106 | + min_indent = None |
| 107 | + for line in content.split("\n"): |
| 108 | + stripped_line = line.lstrip() |
| 109 | + if stripped_line: |
| 110 | + indent = len(line) - len(line.lstrip()) |
| 111 | + if min_indent is None or min_indent > indent: |
| 112 | + min_indent = indent |
| 113 | + return min_indent |
| 114 | + |
| 115 | + |
def _find_offset(
    lineno: int, content: str, *, document_content_lines: list[str]
) -> tuple[int, int, bool]:
    """
    Try to identify the row/col offset of the code in ``content`` in the document.

    ``lineno`` is assumed to be the line where the code-block starts; scanning
    begins at ``document_content_lines[lineno]``.

    Returns a tuple ``(row_offset, col_offset, position_exact)``:

    * ``row_offset`` starts out as ``lineno``. It is set to the index of the
      first non-blank line scanned, and to the index following the first blank
      line scanned; whichever of these two assignments happens last before the
      scan stops determines the result.
    * ``col_offset`` is the smallest indentation of the non-blank document
      lines that were counted as content, minus the smallest indentation found
      in ``content`` itself, but never less than zero.
    * ``position_exact`` is ``True`` if, starting at the first non-blank line,
      as many document lines could be counted as ``content`` has lines.
    """
    first_row = lineno
    seen_blank = False
    seen_content = False
    remaining = content.count("\n") + 1
    smallest_indent = None
    for row, text in enumerate(document_content_lines[lineno:], start=lineno):
        is_blank = not text.strip()
        if is_blank:
            # Only the very first blank line moves the row offset. This also
            # happens when content lines have already been seen.
            # NOTE(review): this looks like what lets option lines directly
            # below the directive get skipped -- confirm before changing.
            if not seen_blank:
                first_row = row + 1
                seen_blank = True
        elif not seen_content:
            seen_content = True
            first_row = row

        if seen_content and remaining > 0:
            # Count this line (blank or not) as one line of the code block
            if not is_blank:
                indent = len(text) - len(text.lstrip())
                if smallest_indent is None or indent < smallest_indent:
                    smallest_indent = indent
            remaining -= 1
        elif not remaining:
            # All lines of the code block have been accounted for
            break

    source_indent = _find_indent(content)
    col_offset = max(0, (smallest_indent or 0) - (source_indent or 0))
    return first_row, col_offset, remaining == 0
| 153 | + |
| 154 | + |
| 155 | +def _find_in_code( |
| 156 | + row_offset: int, |
| 157 | + col_offset: int, |
| 158 | + content: str, |
| 159 | + *, |
| 160 | + document_content_lines: list[str], |
| 161 | +) -> bool: |
| 162 | + """ |
| 163 | + Check whether the code can be found at the given row/col offset in a way |
| 164 | + that makes it easy to replace. |
| 165 | +
|
| 166 | + That is, it is surrounded only by whitespace. |
| 167 | + """ |
| 168 | + for index, line in enumerate(content.split("\n")): |
| 169 | + if row_offset + index >= len(document_content_lines): |
| 170 | + return False |
| 171 | + found_line = document_content_lines[row_offset + index] |
| 172 | + if found_line[:col_offset].strip(): |
| 173 | + return False |
| 174 | + eol = found_line[col_offset:] |
| 175 | + if eol[: len(line)] != line: |
| 176 | + return False |
| 177 | + if eol[len(line) :].strip(): |
| 178 | + return False |
| 179 | + return True |
| 180 | + |
| 181 | + |
102 | 182 | class CodeBlockVisitor(nodes.SparseNodeVisitor):
|
103 | 183 | """
|
104 | 184 | Visitor that calls callbacks for all code blocks.
|
@@ -130,100 +210,31 @@ def visit_error(self, node: nodes.error) -> None:
|
130 | 210 | """
|
131 | 211 | raise nodes.SkipNode
|
132 | 212 |
|
133 |
| - @staticmethod |
134 |
| - def _find_indent(content: str) -> int | None: |
135 |
| - """ |
136 |
| - Given concatenated lines, find the minimum indent if possible. |
137 |
| -
|
138 |
| - If all lines consist only out of whitespace (or are empty), |
139 |
| - ``None`` is returned. |
140 |
| - """ |
141 |
| - min_indent = None |
142 |
| - for line in content.split("\n"): |
143 |
| - stripped_line = line.lstrip() |
144 |
| - if stripped_line: |
145 |
| - indent = len(line) - len(line.lstrip()) |
146 |
| - if min_indent is None or min_indent > indent: |
147 |
| - min_indent = indent |
148 |
| - return min_indent |
149 |
| - |
150 |
| - def _find_offset(self, lineno: int, content: str) -> tuple[int, int, bool]: |
151 |
| - """ |
152 |
| - Try to identify the row/col offset of the code in ``content`` in the document. |
153 |
| -
|
154 |
| - ``lineno`` is assumed to be the line where the code-block starts. |
155 |
| - This function looks for an empty line, followed by the right pattern of |
156 |
| - empty and non-empty lines. |
157 |
| - """ |
158 |
| - row_offset = lineno |
159 |
| - found_empty_line = False |
160 |
| - found_content_lines = False |
161 |
| - content_lines = content.count("\n") + 1 |
162 |
| - min_indent = None |
163 |
| - for offset, line in enumerate(self.__content_lines[lineno:]): |
164 |
| - stripped_line = line.strip() |
165 |
| - if not stripped_line: |
166 |
| - if not found_empty_line: |
167 |
| - row_offset = lineno + offset + 1 |
168 |
| - found_empty_line = True |
169 |
| - elif not found_content_lines: |
170 |
| - found_content_lines = True |
171 |
| - row_offset = lineno + offset |
172 |
| - |
173 |
| - if found_content_lines and content_lines > 0: |
174 |
| - if stripped_line: |
175 |
| - indent = len(line) - len(line.lstrip()) |
176 |
| - if min_indent is None or min_indent > indent: |
177 |
| - min_indent = indent |
178 |
| - content_lines -= 1 |
179 |
| - elif not content_lines: |
180 |
| - break |
181 |
| - |
182 |
| - min_source_indent = self._find_indent(content) |
183 |
| - col_offset = max(0, (min_indent or 0) - (min_source_indent or 0)) |
184 |
| - return row_offset, col_offset, content_lines == 0 |
185 |
| - |
186 |
| - def _find_in_code(self, row_offset: int, col_offset: int, content: str) -> bool: |
187 |
| - """ |
188 |
| - Check whether the code can be found at the given row/col offset in a way |
189 |
| - that makes it easy to replace. |
190 |
| -
|
191 |
| - That is, it is surrounded only by whitespace. |
192 |
| - """ |
193 |
| - for index, line in enumerate(content.split("\n")): |
194 |
| - if row_offset + index >= len(self.__content_lines): |
195 |
| - return False |
196 |
| - found_line = self.__content_lines[row_offset + index] |
197 |
| - if found_line[:col_offset].strip(): |
198 |
| - return False |
199 |
| - eol = found_line[col_offset:] |
200 |
| - if eol[: len(line)] != line: |
201 |
| - return False |
202 |
| - if eol[len(line) :].strip(): |
203 |
| - return False |
204 |
| - return True |
205 |
| - |
206 | 213 | def visit_literal_block(self, node: nodes.literal_block) -> None:
|
207 | 214 | """
|
208 | 215 | Visit a code block.
|
209 | 216 | """
|
210 | 217 | if "antsibull-code-block" not in node.attributes:
|
211 |
| - if node.attributes["classes"]: |
212 |
| - # This could be a `::` block, or something else (unknown) |
213 |
| - self.__warn_unknown_block(node.line or "unknown", 0, node) |
| 218 | + # This could be a `::` block, or something else (unknown) |
| 219 | + self.__warn_unknown_block(node.line or "unknown", 0, node) |
214 | 220 | raise nodes.SkipNode
|
215 | 221 |
|
216 | 222 | language = node.attributes["antsibull-code-language"]
|
217 | 223 | lineno = node.attributes["antsibull-code-lineno"]
|
218 |
| - row_offset, col_offset, position_exact = self._find_offset( |
219 |
| - lineno, node.rawsource |
| 224 | + row_offset, col_offset, position_exact = _find_offset( |
| 225 | + lineno, node.rawsource, document_content_lines=self.__content_lines |
220 | 226 | )
|
221 | 227 | found_in_code = False
|
222 | 228 | if position_exact:
|
223 | 229 | # If we think we have the exact position, try to identify the code.
|
224 | 230 | # ``found_in_code`` indicates that it is easy to replace the code,
|
225 | 231 | # and at the same time it's easy to identify it.
|
226 |
| - found_in_code = self._find_in_code(row_offset, col_offset, node.rawsource) |
| 232 | + found_in_code = _find_in_code( |
| 233 | + row_offset, |
| 234 | + col_offset, |
| 235 | + node.rawsource, |
| 236 | + document_content_lines=self.__content_lines, |
| 237 | + ) |
227 | 238 | if not found_in_code:
|
228 | 239 | position_exact = False
|
229 | 240 | if not found_in_code:
|
@@ -254,51 +265,6 @@ def visit_literal_block(self, node: nodes.literal_block) -> None:
|
254 | 265 | }
|
255 | 266 |
|
256 | 267 |
|
257 |
| -def _parse_document( |
258 |
| - content: str, |
259 |
| - *, |
260 |
| - path: str | os.PathLike[str] | None, |
261 |
| - root_prefix: str | os.PathLike[str] | None, |
262 |
| - directives: dict[str, t.Type[Directive]], |
263 |
| -) -> nodes.document: |
264 |
| - # pylint: disable-next=fixme |
265 |
| - # TODO: figure out how to register a directive only temporarily |
266 |
| - for directive_name, directive_class in directives.items(): |
267 |
| - register_directive(directive_name, directive_class) |
268 |
| - |
269 |
| - # We create a Publisher only to have a mechanism which gives us the settings object. |
270 |
| - # Doing this more explicit is a bad idea since the classes used are deprecated and will |
271 |
| - # eventually get replaced. Publisher.get_settings() looks like a stable enough API that |
272 |
| - # we can 'just use'. |
273 |
| - publisher = Publisher(source_class=StringInput) |
274 |
| - publisher.set_components("standalone", "restructuredtext", "pseudoxml") |
275 |
| - override = { |
276 |
| - "root_prefix": str(root_prefix), |
277 |
| - "input_encoding": "utf-8", |
278 |
| - "file_insertion_enabled": False, |
279 |
| - "raw_enabled": False, |
280 |
| - "_disable_config": True, |
281 |
| - "report_level": Reporter.ERROR_LEVEL, |
282 |
| - "warning_stream": io.StringIO(), |
283 |
| - } |
284 |
| - publisher.process_programmatic_settings(None, override, None) |
285 |
| - publisher.set_source(content, str(path)) |
286 |
| - |
287 |
| - # Parse the document |
288 |
| - try: |
289 |
| - # mypy gives errors for the next line, but this is literally what docutils itself |
290 |
| - # is also doing. So we're going to ignore this error... |
291 |
| - return publisher.reader.read( |
292 |
| - publisher.source, |
293 |
| - publisher.parser, |
294 |
| - publisher.settings, # type: ignore |
295 |
| - ) |
296 |
| - except SystemMessage as exc: |
297 |
| - raise ValueError(f"Cannot parse document: {exc}") from exc |
298 |
| - except Exception as exc: |
299 |
| - raise ValueError(f"Unexpected error while parsing document: {exc}") from exc |
300 |
| - |
301 |
| - |
302 | 268 | @dataclass
|
303 | 269 | class CodeBlockInfo:
|
304 | 270 | """
|
@@ -329,25 +295,35 @@ class CodeBlockInfo:
|
329 | 295 | attributes: dict[str, t.Any]
|
330 | 296 |
|
331 | 297 |
|
332 |
| -def find_code_blocks( |
333 |
| - content: str, |
| 298 | +def get_code_block_directives( |
334 | 299 | *,
|
335 |
| - path: str | os.PathLike[str] | None = None, |
336 |
| - root_prefix: str | os.PathLike[str] | None = None, |
337 | 300 | extra_directives: Mapping[str, t.Type[Directive]] | None = None,
|
338 |
| - warn_unknown_block: t.Callable[[int | str, int, str], None] | None = None, |
339 |
| -) -> t.Generator[CodeBlockInfo]: |
| 301 | +) -> Mapping[str, t.Type[Directive]]: |
340 | 302 | """
|
341 |
| - Given a RST document, finds all code blocks. |
| 303 | + Return directives needed to find all code blocks. |
| 304 | +
|
| 305 | + You can pass an optional mapping with directives that will be added |
| 306 | + to the result. |
342 | 307 | """
|
343 | 308 | directives = _DIRECTIVES.copy()
|
344 | 309 | if extra_directives:
|
345 | 310 | directives.update(extra_directives)
|
| 311 | + return directives |
346 | 312 |
|
347 |
| - doc = _parse_document( |
348 |
| - content, directives=directives, path=path, root_prefix=root_prefix |
349 |
| - ) |
350 | 313 |
|
| 314 | +def find_code_blocks_in_document( |
| 315 | + *, |
| 316 | + document: nodes.document, |
| 317 | + content: str, |
| 318 | + warn_unknown_block: t.Callable[[int | str, int, str], None] | None = None, |
| 319 | +) -> t.Generator[CodeBlockInfo]: |
| 320 | + """ |
| 321 | + Given a parsed RST document, finds all code blocks. |
| 322 | +
|
| 323 | + All code blocks must be parsed with special directives |
| 324 | + (see ``get_code_block_directives()``) that have appropriate metadata |
| 325 | + registered with ``mark_antsibull_code_block()``. |
| 326 | + """ |
351 | 327 | # If someone can figure out how to yield from a sub-function, we can avoid
|
352 | 328 | # using this ugly list
|
353 | 329 | results = []
|
@@ -387,12 +363,44 @@ def warn_unknown_block_cb(
|
387 | 363 |
|
388 | 364 | # Process the document
|
389 | 365 | try:
|
390 |
| - visitor = CodeBlockVisitor(doc, content, callback, warn_unknown_block_cb) |
391 |
| - doc.walk(visitor) |
392 |
| - except Exception as exc: |
393 |
| - raise ValueError(f"Cannot process document: {exc}") from exc |
| 366 | + visitor = CodeBlockVisitor(document, content, callback, warn_unknown_block_cb) |
| 367 | + document.walk(visitor) |
| 368 | + except Exception as exc: # pragma: no cover |
| 369 | + raise ValueError(f"Cannot process document: {exc}") from exc # pragma: no cover |
394 | 370 | finally:
|
395 | 371 | yield from results
|
396 | 372 |
|
397 | 373 |
|
def find_code_blocks(
    content: str,
    *,
    path: str | os.PathLike[str] | None = None,
    root_prefix: str | os.PathLike[str] | None = None,
    extra_directives: Mapping[str, t.Type[Directive]] | None = None,
    warn_unknown_block: t.Callable[[int | str, int, str], None] | None = None,
) -> t.Generator[CodeBlockInfo]:
    """
    Parse the RST document given in ``content`` and yield all its code blocks.

    The document is parsed with the directives returned by
    ``get_code_block_directives()``. Support for your own types of code
    blocks can be added by passing them as ``extra_directives``; use
    ``mark_antsibull_code_block()`` in them so that their code blocks are
    found by ``find_code_blocks()``.

    ``path`` and ``root_prefix`` are handed to ``parse_document()``, and
    ``warn_unknown_block`` is handed to ``find_code_blocks_in_document()``.

    Since this is a generator, nothing is parsed before the first item
    is requested.
    """
    document = parse_document(
        content,
        parser_name="restructuredtext",
        path=path,
        root_prefix=root_prefix,
        rst_directives=get_code_block_directives(extra_directives=extra_directives),
    )

    yield from find_code_blocks_in_document(
        document=document,
        content=content,
        warn_unknown_block=warn_unknown_block,
    )
| 404 | + |
| 405 | + |
# Names exported by ``from <module> import *``. The two lower-level helpers
# ``get_code_block_directives`` and ``find_code_blocks_in_document`` are listed
# as well, so that callers who parse the document themselves can use them.
__all__ = (
    "CodeBlockInfo",
    "mark_antsibull_code_block",
    "find_code_blocks",
    "find_code_blocks_in_document",
    "get_code_block_directives",
)
|
0 commit comments