Skip to content

Commit 2b32c25

Browse files
committed
Rewrite/redirect URLs to honor aliases
Specifically: * Alter URL normalization to factor in alias definitions * Introduce external URL handling: * Introduce Alias.{format_external_url, parse_external_url} * Alter Resource.__new__ to accept _external_ok=True. Update upstream APIs.
1 parent b71cb0c commit 2b32c25

File tree

6 files changed

+689
-66
lines changed

6 files changed

+689
-66
lines changed

src/crystal/model.py

Lines changed: 171 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -2652,10 +2652,11 @@ class Resource:
26522652
its project is read-only. If/when the project transitions to
26532653
writable, any unsaved resources will be saved to disk.
26542654
"""
2655-
# Special IDs, all <0
2655+
# Special IDs, all < 0
26562656
_DEFER_ID = -1 # type: Literal[-1]
26572657
_DELETED_ID = -2 # type: Literal[-2]
26582658
_UNSAVED_ID = -3 # type: Literal[-3]
2659+
_EXTERNAL_ID = -4 # type: Literal[-4]
26592660

26602661
# Optimize per-instance memory use, since there may be very many Resource objects
26612662
__slots__ = (
@@ -2688,6 +2689,7 @@ def __new__(cls,
26882689
project: Project,
26892690
url: str,
26902691
_id: Union[None, int]=None,
2692+
*, _external_ok: bool=False,
26912693
) -> Resource:
26922694
"""
26932695
Looks up an existing resource with the specified URL or creates a new
@@ -2705,11 +2707,28 @@ def __new__(cls,
27052707
* url -- absolute URL to this resource (ex: http), or a URI (ex: mailto).
27062708
"""
27072709
# Private API:
2708-
# - If _id == Resource._DEFER_ID, and there is no existing resource
2709-
# corresponding to the URL in the project, the returned Resource
2710-
# will not have a valid ID and report _is_finished_initializing() == False.
2711-
# The caller will then be responsible for populating the ID later
2712-
# using _finish_init().
2710+
# * _id --
2711+
# - If _id is None then any existing resource in the database
2712+
# matching the specified URL will be used. If no matching resource
2713+
# is found then a new Resource will be created in the database.
2714+
# The resulting Resource will always have an _id pointing to
2715+
# a Resource in the database.
2716+
# - If not (_id < 0) then it points to an existing Resource in the
2717+
# database with the specified ID.
2718+
# - If _id == Resource._DEFER_ID, and there is no existing resource
2719+
# corresponding to the URL in the project, the returned Resource
2720+
# will not have a valid ID and report _is_finished_initializing() == False.
2721+
# The caller will then be responsible for populating the ID later
2722+
# using _finish_init().
2723+
# - If _id == Resource._EXTERNAL_ID, then the caller is signaling
2724+
# an explicit intent to create a Resource with an external URL.
2725+
# - No other values for _id are valid.
2726+
# * _external_ok --
2727+
# - whether the caller is prepared for the possibility
2728+
# that the specified URL corresponds to an external URL.
2729+
# In that circumstance the returned Resource will have
2730+
# (external_url is not None) and the caller should check
2731+
# for that condition to do any special handling required.
27132732

27142733
if _id is None or _id == Resource._DEFER_ID:
27152734
url_alternatives = cls.resource_url_alternatives(project, url)
@@ -2731,6 +2750,21 @@ def __new__(cls,
27312750
normalized_url = url
27322751
del url # prevent accidental usage later
27332752

2753+
# Ensure that if an external URL is used then the caller opts in
2754+
# to handling that possibility, so that external Resources aren't created unintentionally
2755+
is_external = Alias.parse_external_url(normalized_url) is not None
2756+
if is_external:
2757+
if not _external_ok:
2758+
raise ValueError(
2759+
f'Cannot create Resource with external URL {normalized_url!r} '
2760+
f'unless caller signals it supports that possibility '
2761+
f'using _external_ok=True')
2762+
if not (_id is None or _id < 0): # non-special ID
2763+
raise ValueError(
2764+
f'Cannot create Resource with external URL {normalized_url!r} '
2765+
f'with in-database id={_id}.')
2766+
_id = Resource._EXTERNAL_ID # reinterpret
2767+
27342768
self = object.__new__(cls)
27352769
self.project = _resolve_proxy(project) # type: ignore[assignment]
27362770
self._url = normalized_url
@@ -2754,6 +2788,10 @@ def __new__(cls,
27542788
# Can't have revisions because it was just created this session
27552789
self._definitely_has_no_revisions = True
27562790

2791+
if _id == Resource._EXTERNAL_ID:
2792+
# External resources are in-memory only and never have revisions
2793+
self._definitely_has_no_revisions = True
2794+
27572795
if _id == Resource._DEFER_ID:
27582796
self._id = None # type: ignore[assignment] # intentionally leave exploding None
27592797
else:
@@ -2776,24 +2814,28 @@ def _finish_init(self, id: int, creating: bool) -> None:
27762814
* creating -- whether this resource is being created and did not exist in the database
27772815
"""
27782816
# Private API:
2779-
# - id may be _UNSAVED_ID
2817+
# - id may be _UNSAVED_ID or _EXTERNAL_ID
27802818
self._id = id
27812819

27822820
if creating:
27832821
project = self.project # cache
27842822

2785-
# Record self in Project
2786-
project._resource_for_url[self._url] = self
2787-
if id == Resource._UNSAVED_ID:
2788-
project._unsaved_resources.append(self)
2823+
if id == Resource._EXTERNAL_ID:
2824+
# External resources are in-memory only, not saved to project
2825+
pass
27892826
else:
2790-
assert id >= 0 # not any other kind of special ID
2791-
project._resource_for_id[id] = self
2792-
if project._sorted_resource_urls is not None:
2793-
project._sorted_resource_urls.add(self._url)
2794-
# NOTE: Don't check invariants here to save a little performance,
2795-
# since this method (_finish_init) is called very many times
2796-
#project._check_url_collection_invariants()
2827+
# Record self in Project
2828+
project._resource_for_url[self._url] = self
2829+
if id == Resource._UNSAVED_ID:
2830+
project._unsaved_resources.append(self)
2831+
else:
2832+
assert id >= 0 # not any other kind of special ID
2833+
project._resource_for_id[id] = self
2834+
if project._sorted_resource_urls is not None:
2835+
project._sorted_resource_urls.add(self._url)
2836+
# NOTE: Don't check invariants here to save a little performance,
2837+
# since this method (_finish_init) is called very many times
2838+
#project._check_url_collection_invariants()
27972839

27982840
# Notify listeners that self did instantiate
27992841
project._resource_did_instantiate(self)
@@ -2831,6 +2873,7 @@ def bulk_get_or_create(cls,
28312873
project: Project,
28322874
urls: list[str],
28332875
origin_url: str,
2876+
*, _external_ok: bool=False,
28342877
) -> list[Resource]:
28352878
"""
28362879
Gets or creates several Resources for the specified list of URLs, in bulk.
@@ -2852,7 +2895,19 @@ def bulk_get_or_create(cls,
28522895
* urls -- absolute URLs.
28532896
* origin_url -- origin URL from which `urls` were obtained. Used for debugging.
28542897
"""
2855-
(already_created, created) = cls._bulk_get_or_create(project, urls, origin_url)
2898+
# Private API:
2899+
# * _external_ok --
2900+
# - whether the caller is prepared for the possibility
2901+
# that any of the specified URLs correspond to an external URL.
2902+
# In that circumstance the returned list
2903+
# may contain one or more Resources where
2904+
# (external_url is not None) and the caller should check
2905+
# for that condition to do any special handling required.
2906+
2907+
(already_created, created) = cls._bulk_get_or_create(
2908+
project, urls, origin_url,
2909+
_external_ok=_external_ok,
2910+
)
28562911
return already_created + created
28572912

28582913
@classmethod
@@ -2883,7 +2938,14 @@ def bulk_create(cls,
28832938
* urls -- absolute URLs.
28842939
* origin_url -- origin URL from which `urls` were obtained. Used for debugging.
28852940
"""
2886-
(already_created, created) = cls._bulk_get_or_create(project, urls, origin_url)
2941+
(_, created) = cls._bulk_get_or_create(
2942+
project, urls, origin_url,
2943+
# NOTE: _external_ok=True is always safe in this context because
2944+
# any created Resources with an external URL won't be exposed
2945+
# to the caller. Therefore the caller cannot observe such
2946+
# Resources and does not need any special handling for them.
2947+
_external_ok=True,
2948+
)
28872949
return created
28882950

28892951
@classmethod
@@ -2892,25 +2954,39 @@ def _bulk_get_or_create(cls,
28922954
project: Project,
28932955
urls: list[str],
28942956
origin_url: str,
2957+
*, _external_ok: bool=False,
28952958
) -> tuple[list[Resource], list[Resource]]:
2959+
# Private API:
2960+
# * _external_ok --
2961+
# - whether the caller is prepared for the possibility
2962+
# that any of the specified URLs correspond to an external URL.
2963+
# In that circumstance the returned list of
2964+
# `resources_already_created` may contain one or more Resources where
2965+
# (external_url is not None) and the caller should check
2966+
# for that condition to do any special handling required.
2967+
28962968
# 1. Create Resources in memory initially, deferring any database INSERTs
28972969
# 2. Identify new resources that need to be inserted in the database
28982970
resource_for_new_url = OrderedDict() # type: Dict[str, Resource]
28992971
resources_already_created = []
29002972
for url in urls:
29012973
# Get/create Resource in memory and normalize its URL
2902-
new_r = Resource(project, url, _id=Resource._DEFER_ID)
2903-
if new_r._is_finished_initializing:
2904-
# Resource with normalized URL already existed in memory
2974+
new_r = Resource(project, url, _id=Resource._DEFER_ID, _external_ok=_external_ok)
2975+
if new_r.external_url is not None:
2976+
# Report external URLs which exist only in memory as being "already created"
29052977
resources_already_created.append(new_r)
29062978
else:
2907-
# Resource with normalized URL needs to be created in database
2908-
if new_r.url in resource_for_new_url:
2909-
# Resource with normalized URL is already scheduled to be created in database
2910-
pass
2979+
if new_r._is_finished_initializing:
2980+
# Resource with normalized URL already existed in memory
2981+
resources_already_created.append(new_r)
29112982
else:
2912-
# Schedule resource with normalized URL to be created in database
2913-
resource_for_new_url[new_r.url] = new_r
2983+
# Resource with normalized URL needs to be created in database
2984+
if new_r.url in resource_for_new_url:
2985+
# Resource with normalized URL is already scheduled to be created in database
2986+
pass
2987+
else:
2988+
# Schedule resource with normalized URL to be created in database
2989+
resource_for_new_url[new_r.url] = new_r
29142990

29152991
if len(resource_for_new_url) > 0:
29162992
if project.readonly:
@@ -3043,6 +3119,27 @@ def resource_url_alternatives(project: Project, url: str) -> list[str]:
30433119
alternatives.append(new_url)
30443120
old_url = new_url # reinterpret
30453121

3122+
# Apply user-defined alias-based normalization,
3123+
# after all other normalizations
3124+
old_url = new_url
3125+
for alias in project.aliases:
3126+
if not old_url.startswith(alias.source_url_prefix):
3127+
continue
3128+
3129+
# Replace source prefix with target prefix
3130+
new_url = alias.target_url_prefix + old_url[len(alias.source_url_prefix):]
3131+
3132+
# If target is external, format as external URL
3133+
if alias.target_is_external:
3134+
new_url = Alias.format_external_url(new_url)
3135+
3136+
if new_url != old_url:
3137+
alternatives.append(new_url)
3138+
old_url = new_url # reinterpret
3139+
3140+
# Only apply the first matching alias
3141+
break
3142+
30463143
return alternatives
30473144

30483145
@property
@@ -3066,6 +3163,18 @@ def url(self) -> str:
30663163
def normalized_url(self) -> str:
30673164
return self.resource_url_alternatives(self.project, self._url)[-1]
30683165

3166+
@property
3167+
def external_url(self) -> str | None:
3168+
"""
3169+
If this Resource points to a live URL on the internet, external to the project,
3170+
returns what that external URL is. Otherwise returns None.
3171+
"""
3172+
if self._id != Resource._EXTERNAL_ID:
3173+
return None
3174+
external_url = Alias.parse_external_url(self._url)
3175+
assert external_url is not None
3176+
return external_url
3177+
30693178
def _get_already_downloaded_this_session(self) -> bool:
30703179
return self._already_downloaded_this_session
30713180
def _set_already_downloaded_this_session(self, value: bool) -> None:
@@ -4766,6 +4875,39 @@ def _set_target_is_external(self, target_is_external: bool) -> None:
47664875
self._target_is_external = target_is_external
47674876
target_is_external = cast(bool, property(_get_target_is_external, _set_target_is_external))
47684877

4878+
# === External URLs ===
4879+
4880+
@staticmethod
4881+
def format_external_url(external_url: str) -> str:
4882+
"""
4883+
Given an external URL (pointing to a live resource on the internet),
4884+
returns the corresponding archive URL that should be used internally
4885+
within the project to represent this external resource.
4886+
4887+
Example:
4888+
>>> Alias.format_external_url('https://example.com/page')
4889+
'crystal://external/https://example.com/page'
4890+
"""
4891+
return f'crystal://external/{external_url}'
4892+
4893+
@staticmethod
4894+
def parse_external_url(archive_url: str) -> str | None:
4895+
"""
4896+
Given an archive URL, returns the corresponding external URL if the
4897+
archive URL represents an external resource, or None otherwise.
4898+
4899+
Example:
4900+
>>> Alias.parse_external_url('crystal://external/https://example.com/page')
4901+
'https://example.com/page'
4902+
>>> Alias.parse_external_url('https://example.com/page') is None
4902+
True
4904+
"""
4905+
prefix = 'crystal://external/'
4906+
if archive_url.startswith(prefix):
4907+
return archive_url[len(prefix):]
4908+
else:
4909+
return None
4910+
47694911
# === Utility ===
47704912

47714913
def __repr__(self):

src/crystal/server/__init__.py

Lines changed: 41 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from crystal.doc.generic import Document, Link
1212
from crystal.doc.html.soup import HtmlDocument
1313
from crystal.model import (
14-
Project, Resource, ResourceGroup, ResourceGroupSource, ResourceRevision,
14+
Alias, Project, Resource, ResourceGroup, ResourceGroupSource, ResourceRevision,
1515
RootResource,
1616
)
1717
from crystal import resources
@@ -720,22 +720,7 @@ def _serve_archive_url(self, archive_url: str) -> Generator[SwitchToThread, None
720720
# or whether it should be created in a different form
721721
resource = self.project.get_resource(archive_url)
722722
if resource is None:
723-
archive_url_alternatives = Resource.resource_url_alternatives(
724-
self.project, archive_url)
725-
if len(archive_url_alternatives) >= 2:
726-
assert archive_url_alternatives[0] == archive_url
727-
# TODO: Optimize to use a bulk version of Project.get_resource()
728-
# rather than making several individual queries
729-
for urla in archive_url_alternatives[1:]:
730-
if self.project.get_resource(urla) is not None:
731-
# Redirect to existing URL in archive
732-
yield SwitchToThread.BACKGROUND
733-
self.send_redirect(self.get_request_url(urla))
734-
return
735-
736-
# Redirect to canonical form of URL in archive
737-
yield SwitchToThread.BACKGROUND
738-
self.send_redirect(self.get_request_url(archive_url_alternatives[-1]))
723+
if (yield from self._try_redirect_to_best_alternative_url(archive_url)):
739724
return
740725
# (Either resource exists at archive_url, or archive_url is in canonical form)
741726

@@ -802,6 +787,12 @@ def get_default_revision() -> ResourceRevision | None:
802787
return
803788

804789
assert revision is None # still
790+
yield SwitchToThread.FOREGROUND
791+
if (yield from self._try_redirect_to_best_alternative_url(archive_url)):
792+
return
793+
# (archive_url is in canonical form)
794+
795+
yield SwitchToThread.BACKGROUND
805796
self.send_resource_not_in_archive(archive_url)
806797
return
807798

@@ -834,7 +825,39 @@ def get_default_revision() -> ResourceRevision | None:
834825

835826
self.send_revision(revision, archive_url)
836827
return
837-
828+
829+
@fg_affinity
830+
def _try_redirect_to_best_alternative_url(self, archive_url: str) -> Generator[SwitchToThread, None, bool]:
831+
"""
832+
Returns whether a redirect was successfully sent.
833+
Fails only if archive_url is already in canonical form.
834+
"""
835+
archive_url_alternatives = Resource.resource_url_alternatives(
836+
self.project, archive_url)
837+
if not (len(archive_url_alternatives) >= 2):
838+
return False
839+
assert archive_url_alternatives[0] == archive_url
840+
841+
# TODO: Optimize to use a bulk version of Project.get_resource()
842+
# rather than making several individual queries
843+
for urla in archive_url_alternatives[1:]:
844+
if self.project.get_resource(urla) is not None:
845+
request_url = self.get_request_url(urla)
846+
elif (external_url := Alias.parse_external_url(urla)) is not None:
847+
request_url = external_url
848+
else:
849+
continue
850+
851+
# Redirect to existing URL in archive, or to the live external URL
852+
yield SwitchToThread.BACKGROUND
853+
self.send_redirect(request_url)
854+
return True
855+
856+
# Redirect to canonical form of URL in archive
857+
yield SwitchToThread.BACKGROUND
858+
self.send_redirect(self.get_request_url(archive_url_alternatives[-1]))
859+
return True
860+
838861
def _find_root_resource_matching_archive_url(self, archive_url: str) -> RootResource | None:
839862
for rr in self.project.root_resources:
840863
if rr.resource.url == archive_url:

0 commit comments

Comments
 (0)