Skip to content

Commit 0b69f4f

Browse files
authored
Replace html5lib with html5lib-modern (#2911)
* Replace html5lib with html5lib-modern; this removes another source of the `six` dependency.
* Fix import sorting.
* Remove "html" extras installation from tests.
1 parent a21b96d commit 0b69f4f

File tree

7 files changed

+27
-67
lines changed

7 files changed

+27
-67
lines changed

.readthedocs.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ build:
2121
# the readthedocs environment.
2222
- pip install -r devtools/requirements-poetry.in
2323
post_install:
24-
- poetry export --only=main --only=docs --extras=html -o requirements.txt
24+
- poetry export --only=main --only=docs -o requirements.txt
2525
- pip install --no-cache-dir -r requirements.txt
2626
- pip install .
2727
- python -c "from rdflib import Graph; print(Graph)"

docker/latest/requirements.txt

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
#
55
# pip-compile --config=pyproject.toml docker/latest/requirements.in
66
#
7-
html5lib==1.1
7+
html5lib-modern==1.2
88
# via -r docker/latest/requirements.in
99
isodate==0.6.1
1010
# via rdflib
@@ -14,7 +14,4 @@ rdflib==7.0.0
1414
# via -r docker/latest/requirements.in
1515
six==1.16.0
1616
# via
17-
# html5lib
1817
# isodate
19-
webencodings==0.5.1
20-
# via html5lib

poetry.lock

Lines changed: 11 additions & 28 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ isodate = "^0.6.0"
4343
pyparsing = ">=2.1.0,<4"
4444
berkeleydb = {version = "^18.1.0", optional = true}
4545
networkx = {version = ">=2,<4", optional = true}
46-
html5lib = {version = "^1.0", optional = true}
46+
html5lib-modern = "^1.2"
4747
lxml = {version = ">=4.3,<6.0", optional = true}
4848
orjson = {version = ">=3.9.14,<4", optional = true}
4949

@@ -73,7 +73,6 @@ ruff = ">=0.0.286,<0.7.0"
7373
[tool.poetry.extras]
7474
berkeleydb = ["berkeleydb"]
7575
networkx = ["networkx"]
76-
html = ["html5lib"]
7776
lxml = ["lxml"]
7877
orjson = ["orjson"]
7978

rdflib/term.py

Lines changed: 10 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@
6666
from urllib.parse import urldefrag, urljoin, urlparse
6767
from uuid import uuid4
6868

69+
import html5lib
6970
from isodate import (
7071
Duration,
7172
duration_isoformat,
@@ -83,14 +84,6 @@
8384
from .namespace import NamespaceManager
8485
from .paths import AlternativePath, InvPath, NegatedPath, Path, SequencePath
8586

86-
_HAS_HTML5LIB = False
87-
88-
try:
89-
import html5lib
90-
91-
_HAS_HTML5LIB = True
92-
except ImportError:
93-
html5lib = None
9487

9588
_SKOLEM_DEFAULT_AUTHORITY = "https://rdflib.github.io"
9689

@@ -1677,7 +1670,11 @@ def _parse_html(lexical_form: str) -> xml.dom.minidom.DocumentFragment:
16771670
parser = html5lib.HTMLParser(
16781671
tree=html5lib.treebuilders.getTreeBuilder("dom"), strict=True
16791672
)
1680-
result: xml.dom.minidom.DocumentFragment = parser.parseFragment(lexical_form)
1673+
try:
1674+
result: xml.dom.minidom.DocumentFragment = parser.parseFragment(lexical_form)
1675+
except html5lib.html5parser.ParseError as e:
1676+
logger.info(f"Failed to parse HTML: {e}")
1677+
raise e
16811678
result.normalize()
16821679
return result
16831680

@@ -2007,20 +2004,13 @@ def _castPythonToLiteral( # noqa: N802
20072004
(Duration, (lambda i: duration_isoformat(i), _XSD_DURATION)),
20082005
(timedelta, (lambda i: duration_isoformat(i), _XSD_DAYTIMEDURATION)),
20092006
(xml.dom.minidom.Document, (_writeXML, _RDF_XMLLITERAL)),
2010-
(Fraction, (None, _OWL_RATIONAL)),
2011-
]
2012-
2013-
if html5lib is not None:
20142007
# This is a bit dirty, by accident the html5lib parser produces
20152008
# DocumentFragments, and the xml parser Documents, letting this
20162009
# decide what datatype to use makes roundtripping easier, but it is a
20172010
# bit random.
2018-
#
2019-
# This must happen before _GenericPythonToXSDRules is assigned to
2020-
# _OriginalGenericPythonToXSDRules.
2021-
_GenericPythonToXSDRules.append(
2022-
(xml.dom.minidom.DocumentFragment, (_write_html, _RDF_HTMLLITERAL))
2023-
)
2011+
(xml.dom.minidom.DocumentFragment, (_write_html, _RDF_HTMLLITERAL)),
2012+
(Fraction, (None, _OWL_RATIONAL)),
2013+
]
20242014

20252015
_OriginalGenericPythonToXSDRules = list(_GenericPythonToXSDRules)
20262016

@@ -2071,14 +2061,10 @@ def _castPythonToLiteral( # noqa: N802
20712061
URIRef(_XSD_PFX + "double"): float,
20722062
URIRef(_XSD_PFX + "base64Binary"): b64decode,
20732063
URIRef(_XSD_PFX + "anyURI"): None,
2064+
_RDF_HTMLLITERAL: _parse_html,
20742065
_RDF_XMLLITERAL: _parseXML,
20752066
}
20762067

2077-
if html5lib is not None:
2078-
# It is probably best to keep this close to the definition of
2079-
# _GenericPythonToXSDRules so nobody misses it.
2080-
XSDToPython[_RDF_HTMLLITERAL] = _parse_html
2081-
20822068
_check_well_formed_types: Dict[URIRef, Callable[[Union[str, bytes], Any], bool]] = {
20832069
URIRef(_XSD_PFX + "boolean"): _well_formed_boolean,
20842070
URIRef(_XSD_PFX + "nonPositiveInteger"): _well_formed_non_positive_integer,

test/test_literal/test_literal_html5lib.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import xml.dom.minidom
22
from typing import Callable
33

4+
import html5lib # noqa: F401
45
import pytest
56

67
import rdflib.term
@@ -9,14 +10,8 @@
910
from test.utils.literal import LiteralChecker
1011
from test.utils.outcome import OutcomeChecker, OutcomePrimitives
1112

12-
try:
13-
import html5lib as _ # noqa: F401
14-
except ImportError:
15-
pytest.skip("html5lib not installed", allow_module_level=True)
16-
1713

1814
def test_has_html5lib() -> None:
19-
assert rdflib.term._HAS_HTML5LIB is True
2015
assert RDF.HTML in rdflib.term.XSDToPython
2116
rule = next(
2217
(

tox.ini

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ setenv =
1515
COVERAGE_FILE = {env:COVERAGE_FILE:{toxinidir}/.coverage.{envname}}
1616
MYPY_CACHE_DIR = {envdir}/.mypy_cache
1717
docs: POETRY_ARGS_docs = --only=docs
18-
extensive: POETRY_ARGS_extensive = --extras=berkeleydb --extras=networkx --extras=html --extras=orjson
18+
extensive: POETRY_ARGS_extensive = --extras=berkeleydb --extras=networkx --extras=orjson
1919
lxml: POETRY_ARGS_lxml = --extras=lxml
2020
commands_pre =
2121
py3{8,9,10,11}: python -c 'import os; print("\n".join(f"{key}={value}" for key, value in os.environ.items()))'
@@ -59,7 +59,7 @@ setenv =
5959
PYTHONHASHSEED = 0
6060
commands_pre =
6161
poetry lock --check
62-
poetry install --only=main --only=docs --extras=html
62+
poetry install --only=main --only=docs
6363
poetry env info
6464
commands =
6565
poetry run sphinx-build -T -W -b html -d {envdir}/doctree docs docs/_build/html

0 commit comments

Comments
 (0)