@@ -2652,10 +2652,11 @@ class Resource:
26522652 its project is read-only. If/when the project transitions to
26532653 writable, any unsaved resources will be saved to disk.
26542654 """
2655- # Special IDs, all <0
2655+ # Special IDs, all < 0
26562656 _DEFER_ID = - 1 # type: Literal[-1]
26572657 _DELETED_ID = - 2 # type: Literal[-2]
26582658 _UNSAVED_ID = - 3 # type: Literal[-3]
2659+ _EXTERNAL_ID = - 4 # type: Literal[-4]
26592660
26602661 # Optimize per-instance memory use, since there may be very many Resource objects
26612662 __slots__ = (
@@ -2688,6 +2689,7 @@ def __new__(cls,
26882689 project : Project ,
26892690 url : str ,
26902691 _id : Union [None , int ]= None ,
2692+ * , _external_ok : bool = False ,
26912693 ) -> Resource :
26922694 """
26932695 Looks up an existing resource with the specified URL or creates a new
@@ -2705,11 +2707,28 @@ def __new__(cls,
27052707 * url -- absolute URL to this resource (ex: http), or a URI (ex: mailto).
27062708 """
27072709 # Private API:
2708- # - If _id == Resource._DEFER_ID, and there is no existing resource
2709- # corresponding to the URL in the project, the returned Resource
2710- # will not have a valid ID and report _is_finished_initializing() == False.
2711- # The caller will then be responsible for populating the ID later
2712- # using _finish_init().
2710+ # * _id --
2711+ # - If _id is None then any existing resource in the database
2712+ # matching the specified URL will be used. If no matching resource
2713+ # is found then a new Resource will be created in the database.
2714+ # The resulting Resource will always have an _id pointing to
2715+ # a Resource in the database.
2716+ # - If not (_id < 0) then it points to an existing Resource in the
2717+ # database with the specified ID.
2718+ # - If _id == Resource._DEFER_ID, and there is no existing resource
2719+ # corresponding to the URL in the project, the returned Resource
2720+ # will not have a valid ID and report _is_finished_initializing() == False.
2721+ # The caller will then be responsible for populating the ID later
2722+ # using _finish_init().
2723+ # - If _id == Resource._EXTERNAL_ID, then the caller is signaling
2724+ # an explicit intent to create an external URL.
2725+ # - No other values for _id are valid.
2726+ # * _external_ok --
2727+ # - whether the caller is prepared for the possibility
2728+ # that the specified URL corresponds to an external URL.
2729+ # In that circumstance the returned Resource will have
2730+ # (external_url is not None) and the caller should check
2731+ # for that condition to do any special handling required.
27132732
27142733 if _id is None or _id == Resource ._DEFER_ID :
27152734 url_alternatives = cls .resource_url_alternatives (project , url )
@@ -2731,6 +2750,21 @@ def __new__(cls,
27312750 normalized_url = url
27322751 del url # prevent accidental usage later
27332752
2753+ # Ensure that if an external URL is used then the caller opts in
2754+ # to handling that possibility, so that external Resources aren't created unintentionally
2755+ is_external = Alias .parse_external_url (normalized_url ) is not None
2756+ if is_external :
2757+ if not _external_ok :
2758+ raise ValueError (
2759+ f'Cannot create Resource with external URL { normalized_url !r} '
2760+ f'unless caller signals it supports that possibility '
2761+ f'using _external_ok=True' )
2762+ if not (_id is None or _id < 0 ): # non-special ID
2763+ raise ValueError (
2764+ f'Cannot create Resource with external URL { normalized_url !r} '
2765+ f'with in-database id={ _id } .' )
2766+ _id = Resource ._EXTERNAL_ID # reinterpret
2767+
27342768 self = object .__new__ (cls )
27352769 self .project = _resolve_proxy (project ) # type: ignore[assignment]
27362770 self ._url = normalized_url
@@ -2754,6 +2788,10 @@ def __new__(cls,
27542788 # Can't have revisions because it was just created this session
27552789 self ._definitely_has_no_revisions = True
27562790
2791+ if _id == Resource ._EXTERNAL_ID :
2792+ # External resources are in-memory only and never have revisions
2793+ self ._definitely_has_no_revisions = True
2794+
27572795 if _id == Resource ._DEFER_ID :
27582796 self ._id = None # type: ignore[assignment] # intentionally leave exploding None
27592797 else :
@@ -2776,24 +2814,28 @@ def _finish_init(self, id: int, creating: bool) -> None:
27762814 * creating -- whether this resource is being created and did not exist in the database
27772815 """
27782816 # Private API:
2779- # - id may be _UNSAVED_ID
2817+ # - id may be _UNSAVED_ID or _EXTERNAL_ID
27802818 self ._id = id
27812819
27822820 if creating :
27832821 project = self .project # cache
27842822
2785- # Record self in Project
2786- project ._resource_for_url [self ._url ] = self
2787- if id == Resource ._UNSAVED_ID :
2788- project ._unsaved_resources .append (self )
2823+ if id == Resource ._EXTERNAL_ID :
2824+ # External resources are in-memory only, not saved to project
2825+ pass
27892826 else :
2790- assert id >= 0 # not any other kind of special ID
2791- project ._resource_for_id [id ] = self
2792- if project ._sorted_resource_urls is not None :
2793- project ._sorted_resource_urls .add (self ._url )
2794- # NOTE: Don't check invariants here to save a little performance,
2795- # since this method (_finish_init) is called very many times
2796- #project._check_url_collection_invariants()
2827+ # Record self in Project
2828+ project ._resource_for_url [self ._url ] = self
2829+ if id == Resource ._UNSAVED_ID :
2830+ project ._unsaved_resources .append (self )
2831+ else :
2832+ assert id >= 0 # not any other kind of special ID
2833+ project ._resource_for_id [id ] = self
2834+ if project ._sorted_resource_urls is not None :
2835+ project ._sorted_resource_urls .add (self ._url )
2836+ # NOTE: Don't check invariants here to save a little performance,
2837+ # since this method (_finish_init) is called very many times
2838+ #project._check_url_collection_invariants()
27972839
27982840 # Notify listeners that self did instantiate
27992841 project ._resource_did_instantiate (self )
@@ -2831,6 +2873,7 @@ def bulk_get_or_create(cls,
28312873 project : Project ,
28322874 urls : list [str ],
28332875 origin_url : str ,
2876+ * , _external_ok : bool = False ,
28342877 ) -> list [Resource ]:
28352878 """
28362879 Gets or creates several Resources for the specified list of URLs, in bulk.
@@ -2852,7 +2895,19 @@ def bulk_get_or_create(cls,
28522895 * urls -- absolute URLs.
28532896 * origin_url -- origin URL from which `urls` were obtained. Used for debugging.
28542897 """
2855- (already_created , created ) = cls ._bulk_get_or_create (project , urls , origin_url )
2898+ # Private API:
2899+ # * _external_ok --
2900+ # - whether the caller is prepared for the possibility
2901+ # that any of the specified URLs correspond to an external URL.
2902+ # In that circumstance the returned list
2903+ # may contain one or more Resources where
2904+ # (external_url is not None) and the caller should check
2905+ # for that condition to do any special handling required.
2906+
2907+ (already_created , created ) = cls ._bulk_get_or_create (
2908+ project , urls , origin_url ,
2909+ _external_ok = _external_ok ,
2910+ )
28562911 return already_created + created
28572912
28582913 @classmethod
@@ -2883,7 +2938,14 @@ def bulk_create(cls,
28832938 * urls -- absolute URLs.
28842939 * origin_url -- origin URL from which `urls` were obtained. Used for debugging.
28852940 """
2886- (already_created , created ) = cls ._bulk_get_or_create (project , urls , origin_url )
2941+ (_ , created ) = cls ._bulk_get_or_create (
2942+ project , urls , origin_url ,
2943+ # NOTE: _external_ok=True is always safe in this context because
2944+ # any created Resources with an external URL won't be exposed
2945+ # to the caller. Therefore the caller cannot observe such
2946+ # Resources and does not need any special handling for them.
2947+ _external_ok = True ,
2948+ )
28872949 return created
28882950
28892951 @classmethod
@@ -2892,25 +2954,39 @@ def _bulk_get_or_create(cls,
28922954 project : Project ,
28932955 urls : list [str ],
28942956 origin_url : str ,
2957+ * , _external_ok : bool = False ,
28952958 ) -> tuple [list [Resource ], list [Resource ]]:
2959+ # Private API:
2960+ # * _external_ok --
2961+ # - whether the caller is prepared for the possibility
2962+ # that any of the specified URLs correspond to an external URL.
2963+ # In that circumstance the returned list of
2964+ # `resources_already_created` may contain one or more Resources where
2965+ # (external_url is not None) and the caller should check
2966+ # for that condition to do any special handling required.
2967+
28962968 # 1. Create Resources in memory initially, deferring any database INSERTs
28972969 # 2. Identify new resources that need to be inserted in the database
28982970 resource_for_new_url = OrderedDict () # type: Dict[str, Resource]
28992971 resources_already_created = []
29002972 for url in urls :
29012973 # Get/create Resource in memory and normalize its URL
2902- new_r = Resource (project , url , _id = Resource ._DEFER_ID )
2903- if new_r ._is_finished_initializing :
2904- # Resource with normalized URL already existed in memory
2974+ new_r = Resource (project , url , _id = Resource ._DEFER_ID , _external_ok = _external_ok )
2975+ if new_r .external_url is not None :
2976+ # Report external URLs which exist only in memory as being "already created"
29052977 resources_already_created .append (new_r )
29062978 else :
2907- # Resource with normalized URL needs to be created in database
2908- if new_r .url in resource_for_new_url :
2909- # Resource with normalized URL is already scheduled to be created in database
2910- pass
2979+ if new_r ._is_finished_initializing :
2980+ # Resource with normalized URL already existed in memory
2981+ resources_already_created .append (new_r )
29112982 else :
2912- # Schedule resource with normalized URL to be created in database
2913- resource_for_new_url [new_r .url ] = new_r
2983+ # Resource with normalized URL needs to be created in database
2984+ if new_r .url in resource_for_new_url :
2985+ # Resource with normalized URL is already scheduled to be created in database
2986+ pass
2987+ else :
2988+ # Schedule resource with normalized URL to be created in database
2989+ resource_for_new_url [new_r .url ] = new_r
29142990
29152991 if len (resource_for_new_url ) > 0 :
29162992 if project .readonly :
@@ -3043,6 +3119,27 @@ def resource_url_alternatives(project: Project, url: str) -> list[str]:
30433119 alternatives .append (new_url )
30443120 old_url = new_url # reinterpret
30453121
3122+ # Apply user-defined alias-based normalization,
3123+ # after all other normalizations
3124+ old_url = new_url
3125+ for alias in project .aliases :
3126+ if not old_url .startswith (alias .source_url_prefix ):
3127+ continue
3128+
3129+ # Replace source prefix with target prefix
3130+ new_url = alias .target_url_prefix + old_url [len (alias .source_url_prefix ):]
3131+
3132+ # If target is external, format as external URL
3133+ if alias .target_is_external :
3134+ new_url = Alias .format_external_url (new_url )
3135+
3136+ if new_url != old_url :
3137+ alternatives .append (new_url )
3138+ old_url = new_url # reinterpret
3139+
3140+ # Only apply the first matching alias
3141+ break
3142+
30463143 return alternatives
30473144
30483145 @property
@@ -3066,6 +3163,18 @@ def url(self) -> str:
30663163 def normalized_url (self ) -> str :
30673164 return self .resource_url_alternatives (self .project , self ._url )[- 1 ]
30683165
@property
def external_url(self) -> str | None:
    """
    The live internet URL that this Resource stands for,
    or None if this Resource is not an external one.

    A Resource counts as external only when its ID is the special
    Resource._EXTERNAL_ID marker. In that case the external URL is
    recovered from this Resource's own URL using Alias.parse_external_url().
    """
    if self._id == Resource._EXTERNAL_ID:
        parsed_url = Alias.parse_external_url(self._url)
        # A Resource carrying the external marker ID is expected to have
        # a URL in the external format, so parsing should not fail here
        assert parsed_url is not None
        return parsed_url
    return None
3177+
30693178 def _get_already_downloaded_this_session (self ) -> bool :
30703179 return self ._already_downloaded_this_session
30713180 def _set_already_downloaded_this_session (self , value : bool ) -> None :
@@ -4766,6 +4875,39 @@ def _set_target_is_external(self, target_is_external: bool) -> None:
47664875 self ._target_is_external = target_is_external
47674876 target_is_external = cast (bool , property (_get_target_is_external , _set_target_is_external ))
47684877
4878+ # === External URLs ===
4879+
@staticmethod
def format_external_url(external_url: str) -> str:
    """
    Wraps an external URL (one pointing to a live resource on the internet)
    in the archive URL form used inside a project to stand for that
    external resource.

    The archive URL is the fixed 'crystal://external/' prefix followed by
    the external URL, unmodified.

    Example:
    >>> Alias.format_external_url('https://example.com/page')
    'crystal://external/https://example.com/page'
    """
    prefix = 'crystal://external/'
    return f'{prefix}{external_url}'
4892+
@staticmethod
def parse_external_url(archive_url: str) -> str | None:
    """
    Given an archive URL, returns the corresponding external URL if the
    archive URL represents an external resource, or None otherwise.

    An archive URL represents an external resource exactly when it starts
    with the 'crystal://external/' prefix. The external URL is everything
    after that prefix (which may be the empty string).

    Example:
    >>> Alias.parse_external_url('crystal://external/https://example.com/page')
    'https://example.com/page'
    >>> Alias.parse_external_url('https://example.com/page') is None
    True
    """
    prefix = 'crystal://external/'
    if not archive_url.startswith(prefix):
        # Not in the external format
        return None
    return archive_url[len(prefix):]
4910+
47694911 # === Utility ===
47704912
47714913 def __repr__ (self ):
0 commit comments