|
19 | 19 | from collections import defaultdict |
20 | 20 | import binascii |
21 | 21 | import os |
| 22 | +import re |
22 | 23 |
|
23 | 24 | from docutils import nodes |
24 | 25 | from docutils.parsers.rst import directives |
@@ -206,6 +207,14 @@ def setup(app): |
206 | 207 | app.add_config_value("generate_component_labels", True, "env") |
207 | 208 |
|
208 | 209 |
|
| 210 | +# Roughly what an XML ID allows, per the spec for an `XML Name <https://www.w3.org/TR/REC-xml/#NT-Name>`_. However: |
| 211 | +# |
| 212 | +# - This regex does allow identifiers which begin with a number, which isn't allowed by XML. I don't know of easy ways to implement an AND operation in a regex, so code which uses this regex checks this separately. |
| 213 | +# - This disallows characters that need escaping for CSS to avoid problems there, such as ``.`` and ``:`` |
| 214 | +# - I'm not certain how closely Python's definition of a "word character (\w)" matches XML's definition. |
| 215 | +xml_id_regex = re.compile(r"\w[\w-]*", re.UNICODE) |
| 216 | + |
| 217 | + |
209 | 218 | # A base class for all Runestone directives. |
210 | 219 | class RunestoneDirective(Directive): |
211 | 220 | option_spec = { |
@@ -268,23 +277,12 @@ def __init__(self, *args, **kwargs): |
268 | 277 |
|
269 | 278 | self.explain_text = [] |
270 | 279 |
|
271 | | - # Check for a `valid HTML5 divid <https://html.spec.whatwg.org/multipage/dom.html#the-id-attribute>`_. |
| 280 | + # Check for a valid XML id. This is more restrictive than checking for a `valid HTML5 divid <https://html.spec.whatwg.org/multipage/dom.html#the-id-attribute>`_, so we don't bother with a separate HTML ID check. |
272 | 281 | def validate_divid(self, divid): |
273 | 282 | if ( |
274 | | - # Per the spec, a divid must not contain `whitespace <https://infra.spec.whatwg.org/#ascii-whitespace>`_ (see also `Python string escape sequences <https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals>`_). |
275 | | - "\t" in divid |
276 | | - or "\n" in divid |
277 | | - or "\f" in divid |
278 | | - or "\r" in divid |
279 | | - or " " in divid |
280 | | - or |
281 | | - # Also avoid characters that need escaping for CSS to avoid problems there. |
282 | | - "." in divid |
283 | | - or "#" in divid |
284 | | - or ":" in divid |
285 | | - or |
286 | | - # It must also be at least one character long. This is probably taken care of by the existence of ``self.arguments[0]``, but here's a bit of paranoia: |
287 | | - len(divid) == 0 |
| 283 | + # Look for invalid XML IDs (they must not begin with a number, which the regex doesn't catch). Use ``fullmatch`` since the entire string must match the regex for a valid id. |
| 284 | + (divid[0] >= "0" and (divid[0] <= "9")) |
| 285 | + or not re.fullmatch(xml_id_regex, divid) |
288 | 286 | ): |
289 | 287 | logger.error( |
290 | 288 | f"Invalid divid '{divid}'.", |
|
0 commit comments