Skip to content
This repository was archived by the owner on Jun 7, 2023. It is now read-only.

Commit ed6656e

Browse files
committed
Fix: Check for invalid XML IDs as well.
1 parent 5b5d1d7 commit ed6656e

File tree

1 file changed

+13
-15
lines changed

1 file changed

+13
-15
lines changed

runestone/common/runestonedirective.py

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from collections import defaultdict
2020
import binascii
2121
import os
22+
import re
2223

2324
from docutils import nodes
2425
from docutils.parsers.rst import directives
@@ -206,6 +207,14 @@ def setup(app):
206207
app.add_config_value("generate_component_labels", True, "env")
207208

208209

210+
# Roughly what an XML ID allows, per the spec for an `XML Name <https://www.w3.org/TR/REC-xml/#NT-Name>`_. However:
211+
#
212+
# - This regex does allow identifiers which begin with a number, which isn't allowed by XML. I don't know of easy ways to implement an AND operation in a regex, so code which uses this regex checks this separately.
213+
# - This disallows characters that need escaping for CSS to avoid problems there, such as ``.`` and ``:``.
214+
# - I'm not certain how closely Python's definition of a "word character (\w)" matches XML's definition.
215+
xml_id_regex = re.compile(r"\w[\w-]*", re.UNICODE)
216+
217+
209218
# A base class for all Runestone directives.
210219
class RunestoneDirective(Directive):
211220
option_spec = {
@@ -268,23 +277,12 @@ def __init__(self, *args, **kwargs):
268277

269278
self.explain_text = []
270279

271-
# Check for a `valid HTML5 divid <https://html.spec.whatwg.org/multipage/dom.html#the-id-attribute>`_.
280+
# Check for a valid XML id. This is more restrictive than checking for a `valid HTML5 divid <https://html.spec.whatwg.org/multipage/dom.html#the-id-attribute>`_, so we don't bother with a separate HTML ID check.
272281
def validate_divid(self, divid):
273282
if (
274-
# Per the spec, a divid must not contain `whitespace <https://infra.spec.whatwg.org/#ascii-whitespace>`_ (see also `Python string escape sequences <https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals>`_).
275-
"\t" in divid
276-
or "\n" in divid
277-
or "\f" in divid
278-
or "\r" in divid
279-
or " " in divid
280-
or
281-
# Also avoid characters that need escaping for CSS to avoid problems there.
282-
"." in divid
283-
or "#" in divid
284-
or ":" in divid
285-
or
286-
# It must also be at least one character long. This is probably taken care of by the existence of ``self.arguments[0]``, but here's a bit of paranoia:
287-
len(divid) == 0
283+
# Look for invalid XML IDs (they must not begin with a number, which the regex doesn't catch). Use ``fullmatch`` since the entire string must match the regex for a valid id.
284+
(divid[0] >= "0" and (divid[0] <= "9"))
285+
or not re.fullmatch(xml_id_regex, divid)
288286
):
289287
logger.error(
290288
f"Invalid divid '{divid}'.",

0 commit comments

Comments
 (0)