Skip to content

Commit 4091626

Browse files
committed
(Temporarily?) move crawling to Registry, and explicitly track uncrawled resources.
1 parent 1a8a994 commit 4091626

File tree

2 files changed

+95
-96
lines changed

2 files changed

+95
-96
lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -42,6 +42,7 @@ Issues = "https://github.com/python-jsonschema/referencing/issues/"
4242
Source = "https://github.com/python-jsonschema/referencing"
4343

4444
[tool.isort]
45+
combine_as_imports = true
4546
from_first = true
4647
include_trailing_comma = true
4748
multi_line_output = 3

referencing/_core.py

Lines changed: 94 additions & 96 deletions
Original file line number | Diff line number | Diff line change
@@ -1,12 +1,11 @@
11
from __future__ import annotations
22

3-
from collections.abc import Iterable, Mapping, Sequence
3+
from collections.abc import Mapping, Sequence
44
from typing import TYPE_CHECKING, Any, Union
55
from urllib.parse import unquote, urldefrag, urljoin
66

7-
from pyrsistent import m
8-
from pyrsistent.typing import PMap
9-
import attrs
7+
from pyrsistent import m, s
8+
from pyrsistent.typing import PMap, PSet
109

1110
try:
1211
Mapping[str, str]
@@ -29,16 +28,17 @@ class UnidentifiedResource(Exception):
2928

3029

3130
if TYPE_CHECKING:
32-
from attrs import define, frozen
31+
from attrs import define, evolve, field, frozen
3332
else:
33+
from attrs import define as _define, evolve, field, frozen as _frozen
3434

3535
def define(cls):
3636
cls.__init_subclass__ = UnsupportedSubclassing.complain
37-
return attrs.define(cls)
37+
return _define(cls)
3838

3939
def frozen(cls):
4040
cls.__init_subclass__ = UnsupportedSubclassing.complain
41-
return attrs.frozen(cls)
41+
return _frozen(cls)
4242

4343

4444
Schema = Union[bool, Mapping[str, Any]]
@@ -51,9 +51,6 @@ class Anchor:
5151
name: str
5252
resource: Schema
5353

54-
def added_to(self, registry: Registry):
55-
return registry.with_anchor(anchor=self)
56-
5754

5855
@frozen
5956
class DynamicAnchor:
@@ -62,9 +59,6 @@ class DynamicAnchor:
6259
name: str
6360
resource: Schema
6461

65-
def added_to(self, registry: Registry):
66-
return registry.with_anchor(anchor=self)
67-
6862

6963
AnchorType = Union[Anchor, DynamicAnchor]
7064

@@ -75,26 +69,15 @@ class IdentifiedResource:
7569
uri: str
7670
resource: Schema
7771

78-
def added_to(self, registry: Registry):
79-
return registry.with_identified_resource(
80-
uri=self.uri,
81-
resource=self.resource,
82-
)
83-
8472

8573
@frozen
8674
class Registry:
8775

88-
_contents: PMap[str, tuple[Schema, PMap[str, AnchorType]]] = attrs.field(
76+
_contents: PMap[str, tuple[Schema, PMap[str, AnchorType]]] = field(
8977
default=m(),
9078
repr=lambda value: f"({len(value)} entries)",
9179
)
92-
93-
def resource_at(self, uri) -> Schema:
94-
return self._contents[uri][0]
95-
96-
def anchor_at(self, uri, name) -> AnchorType:
97-
return self._contents[uri][1][name]
80+
_uncrawled: PSet[str] = field(default=s())
9881

9982
def with_resource(self, resource) -> Registry:
10083
uri = id_of(resource)
@@ -106,10 +89,16 @@ def with_identified_resource(self, uri, resource) -> Registry:
10689
return self.with_resources([(uri, resource)])
10790

10891
def update(self, *registries: Registry) -> Registry:
109-
contents = (registry._contents for registry in registries)
110-
return attrs.evolve(self, contents=self._contents.update(*contents))
92+
contents = (each._contents for each in registries)
93+
uncrawled = (each._uncrawled for each in registries)
94+
return evolve(
95+
self,
96+
contents=self._contents.update(*contents),
97+
uncrawled=self._uncrawled.update(*uncrawled),
98+
)
11199

112100
def with_resources(self, pairs) -> Registry:
101+
uncrawled = self._uncrawled
113102
contents = self._contents
114103
for uri, resource in pairs:
115104
assert (
@@ -122,40 +111,92 @@ def with_resources(self, pairs) -> Registry:
122111
id = id_of(resource)
123112
if id is not None:
124113
contents = contents.set(id, (resource, m()))
125-
return attrs.evolve(self, contents=contents)
114+
115+
uncrawled = uncrawled.add(uri)
116+
return evolve(self, contents=contents, uncrawled=uncrawled)
126117

127118
def with_anchor(self, anchor: Anchor | DynamicAnchor) -> Registry:
128-
uri_resource, anchors = self._contents[anchor.uri]
129-
new = uri_resource, anchors.set(anchor.name, anchor)
130-
return attrs.evolve(self, contents=self._contents.set(anchor.uri, new))
119+
resource, anchors = self._contents[anchor.uri]
120+
new = resource, anchors.set(anchor.name, anchor)
121+
return evolve(self, contents=self._contents.set(anchor.uri, new))
122+
123+
def resource_at(self, uri: str) -> tuple[Schema, Registry]:
124+
at_uri = self._contents.get(uri)
125+
if at_uri is not None and at_uri[1]:
126+
registry = self
127+
else:
128+
registry = self.crawl()
129+
return registry._contents[uri][0], registry
130+
131+
def anchor_at(self, uri, name) -> AnchorType:
132+
return self._contents[uri][1][name]
133+
134+
def crawl(self) -> Registry:
135+
registry = self
136+
resources = [(uri, self._contents[uri][0]) for uri in self._uncrawled]
137+
while resources:
138+
base_uri, resource = resources.pop()
139+
if resource is True or resource is False:
140+
continue
141+
142+
uri = urljoin(base_uri, resource.get("$id", ""))
143+
if uri != base_uri:
144+
registry = registry.with_identified_resource(
145+
uri=uri,
146+
resource=resource,
147+
)
148+
149+
anchor = resource.get("$anchor")
150+
if anchor is not None:
151+
registry = registry.with_anchor(
152+
Anchor(uri=uri, name=anchor, resource=resource),
153+
)
154+
155+
dynamic_anchor = resource.get("$dynamicAnchor")
156+
if dynamic_anchor is not None:
157+
registry = registry.with_anchor(
158+
DynamicAnchor(
159+
uri=uri,
160+
name=dynamic_anchor,
161+
resource=resource,
162+
),
163+
)
164+
165+
resources.extend( # TODO: delay finding anchors in subresources...
166+
(uri, resource[k]) for k in SUBRESOURCE if k in resource
167+
)
168+
resources.extend(
169+
(uri, subresource)
170+
for k in SUBRESOURCE_VALUES
171+
if k in resource
172+
for subresource in resource[k].values()
173+
)
174+
resources.extend(
175+
(uri, subresource)
176+
for k in SUBRESOURCE_ITEMS
177+
if k in resource
178+
for subresource in resource[k]
179+
)
180+
return evolve(registry, uncrawled=s())
131181

132182
def resolver(self, root) -> Resolver:
133183
uri = id_of(root) or ""
134184
registry = self.with_identified_resource(uri=uri, resource=root)
135185
return Resolver(base_uri=uri, registry=registry)
136186

137-
def has_not_crawled(self, uri) -> bool:
138-
at_uri = self._contents.get(uri)
139-
return at_uri is None or not at_uri[1]
140-
141187

142188
@define
143189
class Resolver:
144190

145191
_base_uri: str
146192
_registry: Registry
147193

148-
def lookup(self, ref: str):
194+
def lookup(self, ref: str) -> tuple[Schema, Resolver]:
149195
if ref.startswith("#"):
150196
uri, fragment = self._base_uri, ref[1:]
151197
else:
152198
uri, fragment = urldefrag(urljoin(self._base_uri, ref))
153-
if self._registry.has_not_crawled(uri):
154-
root = self._registry.resource_at(self._base_uri)
155-
for each in find_subresources(base_uri=self._base_uri, root=root):
156-
self._registry = each.added_to(self._registry)
157-
158-
target = self._registry.resource_at(uri)
199+
target, registry = self._registry.resource_at(uri)
159200
if fragment.startswith("/"):
160201
segments = unquote(fragment[1:]).split("/")
161202
for segment in segments:
@@ -165,21 +206,21 @@ def lookup(self, ref: str):
165206
segment = segment.replace("~1", "/").replace("~0", "~")
166207
target = target[segment] # type: ignore # this can't be a bool
167208
elif fragment:
168-
target = self._registry.anchor_at(uri=uri, name=fragment).resource
209+
target = registry.anchor_at(uri=uri, name=fragment).resource
169210

170-
return target, attrs.evolve(self, base_uri=uri)
211+
return target, evolve(self, base_uri=uri, registry=registry)
171212

172213
def with_root(self, root) -> Resolver:
173214
maybe_relative = id_of(root)
174215
if maybe_relative is None:
175-
uri, registry = self._base_uri, self._registry
176-
else:
177-
uri = urljoin(self._base_uri, maybe_relative)
178-
registry = self._registry.with_identified_resource(
179-
uri=uri,
180-
resource=root,
181-
)
182-
return attrs.evolve(self, base_uri=uri, registry=registry)
216+
return self
217+
218+
uri = urljoin(self._base_uri, maybe_relative)
219+
registry = self._registry.with_identified_resource(
220+
uri=uri,
221+
resource=root,
222+
)
223+
return evolve(self, base_uri=uri, registry=registry)
183224

184225

185226
SUBRESOURCE = {"items", "not"}
@@ -191,46 +232,3 @@ def id_of(resource) -> str | None:
191232
if resource is True or resource is False:
192233
return None
193234
return resource.get("$id")
194-
195-
196-
def find_subresources(
197-
root: Schema,
198-
base_uri: str,
199-
) -> Iterable[Anchor | DynamicAnchor | IdentifiedResource]:
200-
resources = [(base_uri, root)]
201-
while resources:
202-
base_uri, resource = resources.pop()
203-
if resource is True or resource is False:
204-
continue
205-
206-
uri = urljoin(base_uri, resource.get("$id", ""))
207-
if uri != base_uri:
208-
yield IdentifiedResource(uri=uri, resource=resource)
209-
210-
anchor = resource.get("$anchor")
211-
if anchor is not None:
212-
yield Anchor(uri=uri, name=anchor, resource=resource)
213-
214-
dynamic_anchor = resource.get("$dynamicAnchor")
215-
if dynamic_anchor is not None:
216-
yield DynamicAnchor(
217-
uri=uri,
218-
name=dynamic_anchor,
219-
resource=resource,
220-
)
221-
222-
resources.extend( # TODO: delay finding anchors in subresources...
223-
(uri, resource[k]) for k in SUBRESOURCE if k in resource
224-
)
225-
resources.extend(
226-
(uri, subresource)
227-
for k in SUBRESOURCE_VALUES
228-
if k in resource
229-
for subresource in resource[k].values()
230-
)
231-
resources.extend(
232-
(uri, subresource)
233-
for k in SUBRESOURCE_ITEMS
234-
if k in resource
235-
for subresource in resource[k]
236-
)

0 commit comments

Comments
 (0)