Skip to content

Commit bc17fb7

Browse files
authored
jsonld: Do not merge nodes with different invalid URIs (#3011)
When parsing JSON-LD with invalid URIs in the `@id`, the `generalized_rdf: True` option allows parsing these nodes as blank nodes instead of outright rejecting the document. However, all nodes with invalid URIs were mapped to the same blank node, resulting in incorrect data.

For example, without this patch, the new test fails with:

```
AssertionError: Expected:
@prefix schema: <https://schema.org/> .

<https://example.org/root-object> schema:author [ schema:familyName "Doe" ;
            schema:givenName "Jane" ;
            schema:name "Jane Doe" ],
        [ schema:familyName "Doe" ;
            schema:givenName "John" ;
            schema:name "John Doe" ] .

Got:
@prefix schema: <https://schema.org/> .

<https://example.org/root-object> schema:author <> .

<> schema:familyName "Doe" ;
    schema:givenName "Jane", "John" ;
    schema:name "Jane Doe", "John Doe" .
```
1 parent 182c3ba commit bc17fb7

File tree

4 files changed

+49
-1
lines changed

4 files changed

+49
-1
lines changed

rdflib/plugins/parsers/jsonld.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
# we should consider streaming the input to deal with arbitrarily large graphs.
3535
from __future__ import annotations
3636

37+
import secrets
3738
import warnings
3839
from collections.abc import Iterable
3940
from typing import TYPE_CHECKING, Any, Union
@@ -215,6 +216,7 @@ def __init__(
215216
if allow_lists_of_lists is not None
216217
else ALLOW_LISTS_OF_LISTS
217218
)
219+
self.invalid_uri_to_bnode: dict[str, BNode] = {}
218220

219221
def parse(self, data: Any, context: Context, dataset: Graph) -> Graph:
220222
topcontext = False
@@ -623,7 +625,12 @@ def _to_rdf_id(self, context: Context, id_val: str) -> IdentifiedNode | None:
623625
uri = context.resolve(id_val)
624626
if not self.generalized_rdf and ":" not in uri:
625627
return None
626-
return URIRef(uri)
628+
node: IdentifiedNode = URIRef(uri)
629+
if not str(node):
630+
if id_val not in self.invalid_uri_to_bnode:
631+
self.invalid_uri_to_bnode[id_val] = BNode(secrets.token_urlsafe(20))
632+
node = self.invalid_uri_to_bnode[id_val]
633+
return node
627634

628635
def _get_bnodeid(self, ref: str) -> str | None:
629636
if not ref.startswith("_:"):

test/jsonld/local-suite/manifest.jsonld

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,17 @@
2727
"purpose": "Multiple @id aliases. Issue #2164",
2828
"input": "toRdf-twoimports-in.jsonld",
2929
"expect": "toRdf-twoimports-out.nq"
30+
},
31+
{
32+
"@id": "#toRdf-two-invalid-ids",
33+
"@type": ["jld:PositiveEvaluationTest", "jld:ToRDFTest"],
34+
"name": "Two invalid identifiers",
35+
"purpose": "Multiple nodes with invalid @ids are not merged together.",
36+
"option": {
37+
"produceGeneralizedRdf": true
38+
},
39+
"input": "toRdf-twoinvalidids-in.jsonld",
40+
"expect": "toRdf-twoinvalidids-out.nq"
3041
}
3142
]
3243
}
test/jsonld/local-suite/toRdf-twoinvalidids-in.jsonld

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
{
2+
"@id": "https://example.org/root-object",
3+
"https://schema.org/author": [
4+
{
5+
"@id": "https://example.org/ invalid url 1",
6+
"https://schema.org/name": "Jane Doe"
7+
},
8+
{
9+
"@id": "https://example.org/ invalid url 1",
10+
"https://schema.org/givenName": "Jane",
11+
"https://schema.org/familyName": "Doe"
12+
},
13+
{
14+
"@id": "https://example.org/ invalid url 2",
15+
"https://schema.org/name": "John Doe",
16+
"https://schema.org/givenName": "John",
17+
"https://schema.org/familyName": "Doe"
18+
}
19+
]
20+
}
test/jsonld/local-suite/toRdf-twoinvalidids-out.nq

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
2+
<https://example.org/root-object> <https://schema.org/author> _:b1.
3+
<https://example.org/root-object> <https://schema.org/author> _:b2.
4+
5+
_:b1 <https://schema.org/name> "Jane Doe".
6+
_:b1 <https://schema.org/givenName> "Jane".
7+
_:b1 <https://schema.org/familyName> "Doe".
8+
_:b2 <https://schema.org/name> "John Doe".
9+
_:b2 <https://schema.org/givenName> "John".
10+
_:b2 <https://schema.org/familyName> "Doe".

0 commit comments

Comments
 (0)