Skip to content

Commit 6a5ae2e

Browse files
committed
[a 2/2] Add mirror URI to service /index/file response (#7624)
Expose the file mirror service as a property of the mirror service. Consolidate conditional URI emission as a service method. Consider the file's source when deciding whether to emit a URI in the /index/file response. Previously, the source of a file was only considered when emitting the URI in manifests.
1 parent 331d584 commit 6a5ae2e

File tree

8 files changed

+114
-62
lines changed

8 files changed

+114
-62
lines changed

src/azul/__init__.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,14 @@
2424
from typing import (
2525
Any,
2626
BinaryIO,
27+
Callable,
2728
ClassVar,
29+
Hashable,
2830
IO,
2931
Literal,
3032
NotRequired,
3133
Self,
34+
TYPE_CHECKING,
3235
TextIO,
3336
TypedDict,
3437
final,
@@ -82,7 +85,19 @@
8285

8386
lru_cache = functools.lru_cache
8487

85-
cache = functools.cache
88+
if TYPE_CHECKING:
89+
# Work around https://github.com/python/typeshed/issues/15139
90+
@final
91+
class CacheWrapper[_T]:
92+
93+
def __call__(self, *args: Hashable, **kwargs: Hashable) -> _T:
94+
...
95+
96+
97+
def cache[_T](f: Callable[..., _T], /) -> CacheWrapper[_T]: # noqa: E303
98+
...
99+
else:
100+
cache = functools.cache
86101

87102

88103
def cache_per_thread(f, /):

src/azul/indexer/mirror_service.py

Lines changed: 43 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
SourceSpec,
4242
)
4343
from azul.indexer.mirror_file_service import (
44+
BaseMirrorFileService,
4445
FilePart,
4546
MirrorFileService,
4647
SchemaUrlFunc,
@@ -58,6 +59,7 @@
5859
SourceService,
5960
)
6061
from azul.types import (
62+
JSON,
6163
json_element_strings,
6264
)
6365

@@ -128,6 +130,10 @@ class BaseMirrorService:
128130
Service for queuing mirroring work, e.g., sending action messages.
129131
"""
130132

133+
@cache
134+
def file_service(self, catalog: CatalogName) -> BaseMirrorFileService:
135+
return BaseMirrorFileService(catalog=catalog)
136+
131137
@cached_property
132138
def _queues(self) -> Queues:
133139
return Queues()
@@ -153,8 +159,7 @@ def may_mirror_files_from_source(self,
153159
else:
154160
return False
155161

156-
@classmethod
157-
def may_mirror(cls, catalog: CatalogName, file_size: int = 0) -> bool:
162+
def may_mirror(self, catalog: CatalogName, file_size: int = 0) -> bool:
158163
"""
159164
Test whether it makes sense to request the mirroring of files from the
160165
given catalog if they are of the given size or larger. If this method
@@ -168,6 +173,38 @@ def may_mirror(cls, catalog: CatalogName, file_size: int = 0) -> bool:
168173
else:
169174
return False
170175

176+
def mirror_uri(self,
177+
catalog: CatalogName,
178+
source: SourceSpec,
179+
file_cls: type[File],
180+
file_json: JSON
181+
) -> str | None:
182+
"""
183+
Return the the URI of the mirror copy of the given file from the given
184+
catalog. If this method returns None, the file was not mirrored, and no
185+
such URI exists. Otherwise, a mirror copy of the file may or may not
186+
exist under the returned URI.
187+
188+
:param catalog: The catalog the file is part of
189+
190+
:param source: The source of the file
191+
192+
:param file_cls: The type of the file. This parameter is needed in order
193+
to avoid deserializing a file from a source that was
194+
configured to not be mirrored because the file metadata
195+
in that source is incomplete or broken
196+
197+
:param file_json: the index representation of the file
198+
"""
199+
if self.may_mirror_files_from_source(catalog, source):
200+
file = file_cls.from_index(file_json)
201+
if self.may_mirror(catalog, 0 if file.size is None else file.size):
202+
return self.file_service(catalog).mirror_uri(file)
203+
else:
204+
return None
205+
else:
206+
return None
207+
171208
def mirror_sources(self,
172209
catalog: CatalogName,
173210
sources: Iterable[tuple[SourceRef, SourceConfig]]
@@ -217,7 +254,7 @@ class MirrorService(BaseMirrorService):
217254
_schema_url_func: SchemaUrlFunc
218255

219256
@cache
220-
def _file_service(self, catalog: CatalogName) -> MirrorFileService:
257+
def file_service(self, catalog: CatalogName) -> MirrorFileService:
221258
return MirrorFileService(catalog=catalog,
222259
schema_url_func=self._schema_url_func)
223260

@@ -285,7 +322,7 @@ def _(self, a: MirrorPartitionAction) -> Iterable[MirrorAction]:
285322
@_mirror.register
286323
def _(self, a: MirrorFileAction) -> Iterable[MirrorAction]:
287324
assert a.file.size is not None, R('File size unknown', a.file)
288-
service = self._file_service(a.catalog)
325+
service = self.file_service(a.catalog)
289326
if service.info_exists(a.file):
290327
log.info('File is already mirrored, skipping upload: %r', a.file)
291328
elif service.file_exists(a.file):
@@ -318,7 +355,7 @@ def _(self, a: MirrorFileAction) -> Iterable[MirrorAction]:
318355
@_mirror.register
319356
def _(self, a: MirrorPartAction) -> Iterable[MirrorAction]:
320357
log.info('Uploading part #%d of file %r', a.part.index, a.file)
321-
service = self._file_service(a.catalog)
358+
service = self.file_service(a.catalog)
322359
# Hashers are mutable so we need to make a copy
323360
hasher = a.hasher.copy()
324361
etag = service.mirror_file_part(a.file, a.part, a.upload_id, hasher)
@@ -348,7 +385,7 @@ def _(self, a: MirrorPartAction) -> Iterable[MirrorAction]:
348385
@_mirror.register
349386
def _(self, a: FinalizeFileAction) -> Iterable[MirrorAction]:
350387
assert len(a.etags) > 0
351-
service = self._file_service(a.catalog)
388+
service = self.file_service(a.catalog)
352389
service.finish_mirroring_file(file=a.file,
353390
upload_id=a.upload_id,
354391
etags=a.etags,

src/azul/plugins/metadata/anvil/service/response.py

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@
1414
from azul import (
1515
cached_property,
1616
)
17+
from azul.indexer import (
18+
SourceRef,
19+
SourceSpec,
20+
)
1721
from azul.json import (
1822
copy_any_json,
1923
copy_json,
@@ -171,15 +175,18 @@ def choose_entry(_term) -> AnyMutableJSON:
171175
}
172176

173177
def _make_hit(self, es_hit: JSON) -> MutableJSON:
174-
sources, bundles = es_hit['sources'], es_hit['bundles']
178+
contents = json_mapping(es_hit['contents'])
179+
sources = json_element_mappings(es_hit['sources'])
180+
bundles = json_element_mappings(es_hit['bundles'])
181+
source = SourceRef[SourceSpec].from_json(one(sources)).spec
175182
return {
176183
'entryId': json_str(es_hit['entity_id']),
177184
# Note that there is a brittle coupling that must be maintained
178185
# between the `sources` and `bundles` field paths here and the
179186
# renamed fields in `Plugin.manifest_config`.
180-
'sources': list(map(self._make_source, json_element_mappings(sources))),
181-
'bundles': list(map(self._make_bundle, json_element_mappings(bundles))),
182-
**self._make_contents(json_mapping(es_hit['contents']))
187+
'sources': list(map(self._make_source, sources)),
188+
'bundles': list(map(self._make_bundle, bundles)),
189+
**self._make_contents(source, contents)
183190
}
184191

185192
def _make_source(self, es_source: JSON) -> MutableJSON:
@@ -199,26 +206,31 @@ def _make_bundle(self, es_bundle: JSON) -> MutableJSON:
199206
self._special_fields.bundle_version.name_in_hit: json_str(es_bundle['version'])
200207
}
201208

202-
def _make_contents(self, es_contents: JSON) -> MutableJSON:
209+
def _make_contents(self, source: SourceSpec, es_contents: JSON) -> MutableJSON:
203210
return {
204211
inner_entity_type: (
205-
[self._pivotal_entity(inner_entity_type, json_mapping(one(inner_entities)))]
212+
[
213+
self._pivotal_entity(source,
214+
inner_entity_type,
215+
json_mapping(one(inner_entities)))
216+
]
206217
if inner_entity_type == self.entity_type else
207218
list(map(partial(self._non_pivotal_entity, inner_entity_type), inner_entities))
208219
)
209220
for inner_entity_type, inner_entities in json_item_sequences(es_contents)
210221
}
211222

212223
def _pivotal_entity(self,
224+
source: SourceSpec,
213225
inner_entity_type: str,
214-
inner_entity: JSON
226+
inner_entity: JSON,
215227
) -> MutableJSON:
216228
inner_entity = copy_json(inner_entity)
217229
if inner_entity_type == 'files':
218230
inner_entity['azul_url'] = self._file_url(uuid=json_str(inner_entity['document_id']),
219231
version=json_str(inner_entity['version']),
220232
drs_uri=optional(json_str, inner_entity['drs_uri']))
221-
inner_entity['azul_mirror_uri'] = self._file_mirror_uri(inner_entity)
233+
inner_entity['azul_mirror_uri'] = self._file_mirror_uri(source, inner_entity)
222234
inner_entity.pop('version', None)
223235
return inner_entity
224236

src/azul/plugins/metadata/hca/service/response.py

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,10 @@
2121
from azul import (
2222
cached_property,
2323
)
24+
from azul.indexer import (
25+
SourceRef,
26+
SourceSpec,
27+
)
2428
from azul.plugins import (
2529
SpecialFields,
2630
)
@@ -345,7 +349,7 @@ def make_dates(self, entry) -> MutableJSONs:
345349
for dates in entry['contents']['dates']
346350
]
347351

348-
def make_projects(self, entry) -> MutableJSONs:
352+
def make_projects(self, source: SourceSpec, entry) -> MutableJSONs:
349353
projects = []
350354
contents = entry['contents']
351355
for project in contents['projects']:
@@ -374,34 +378,36 @@ def make_projects(self, entry) -> MutableJSONs:
374378
for key in list(publication.keys()):
375379
publication[to_camel_case(key)] = publication.pop(key)
376380
translated_project['supplementaryLinks'] = project.get('supplementary_links', [None])
377-
translated_project['matrices'] = self.make_matrices_(contents['matrices'])
378-
translated_project['contributedAnalyses'] = self.make_matrices_(contents['contributed_analyses'])
381+
translated_project['matrices'] = self.make_matrices_(source,
382+
contents['matrices'])
383+
translated_project['contributedAnalyses'] = self.make_matrices_(source,
384+
contents['contributed_analyses'])
379385
translated_project['accessions'] = project.get('accessions', [None])
380386
projects.append(translated_project)
381387
return projects
382388

383389
# FIXME: Move this to during aggregation
384390
# https://github.com/DataBiosphere/azul/issues/2415
385391

386-
def make_matrices_(self, matrices: JSONs) -> JSON:
392+
def make_matrices_(self, source: SourceSpec, matrices: JSONs) -> JSON:
387393
files: list[JSON] = []
388394
if matrices:
389395
for file in json_element_mappings(one(matrices)['file']):
390396
translated_file = {
391-
**self.make_file(file),
397+
**self.make_file(source, file),
392398
'strata': json_str(file['strata'])
393399
}
394400
files.append(translated_file)
395401
return make_stratification_tree(files)
396402

397-
def make_files(self, entry: JSON) -> JSONs:
403+
def make_files(self, source: SourceSpec, entry: JSON) -> JSONs:
398404
files = []
399405
for _file in json_element_mappings(json_mapping(entry['contents'])['files']):
400-
translated_file = self.make_file(_file)
406+
translated_file = self.make_file(source, _file)
401407
files.append(translated_file)
402408
return files
403409

404-
def make_file(self, file: JSON) -> JSON:
410+
def make_file(self, source: SourceSpec, file: JSON) -> JSON:
405411
translated_file = {
406412
'contentDescription': file.get('content_description'),
407413
'format': file.get('file_format'),
@@ -417,7 +423,7 @@ def make_file(self, file: JSON) -> JSON:
417423
'azul_url': self._file_url(uuid=json_str(file['uuid']),
418424
version=json_str(file['version']),
419425
drs_uri=optional(json_str, file['drs_uri'])),
420-
'azul_mirror_uri': self._file_mirror_uri(file),
426+
'azul_mirror_uri': self._file_mirror_uri(source, file),
421427
}
422428
return translated_file
423429

@@ -511,10 +517,11 @@ def make_hits(self, hits: JSONs) -> list[SummarizedHit | CompleteHit]:
511517
return list(map(self.make_hit, hits))
512518

513519
def make_hit(self, es_hit) -> SummarizedHit | CompleteHit:
520+
source: SourceSpec = SourceRef.from_json(one(es_hit['sources'])).spec
514521
hit = Hit(protocols=self.make_protocols(es_hit),
515522
entryId=es_hit['entity_id'],
516523
sources=self.make_sources(es_hit),
517-
projects=self.make_projects(es_hit),
524+
projects=self.make_projects(source, es_hit),
518525
samples=self.make_samples(es_hit),
519526
specimens=self.make_specimens(es_hit),
520527
cellLines=self.make_cell_lines(es_hit),
@@ -525,7 +532,7 @@ def make_hit(self, es_hit) -> SummarizedHit | CompleteHit:
525532
if self.entity_type in ('files', 'bundles'):
526533
complete_hit = cast(CompleteHit, hit)
527534
complete_hit['bundles'] = self.make_bundles(es_hit)
528-
complete_hit['files'] = self.make_files(es_hit)
535+
complete_hit['files'] = self.make_files(source, es_hit)
529536
return complete_hit
530537
else:
531538
summarized_hit = cast(SummarizedHit, hit)

src/azul/service/manifest_service.py

Lines changed: 5 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -109,9 +109,6 @@
109109
FieldTypes,
110110
null_str,
111111
)
112-
from azul.indexer.mirror_file_service import (
113-
BaseMirrorFileService,
114-
)
115112
from azul.indexer.mirror_service import (
116113
BaseMirrorService,
117114
)
@@ -827,10 +824,6 @@ def metadata_plugin(self) -> MetadataPlugin:
827824
def mirror_service(self) -> BaseMirrorService:
828825
return BaseMirrorService()
829826

830-
@cached_property
831-
def mirror_file_service(self) -> BaseMirrorFileService:
832-
return BaseMirrorFileService(catalog=self.catalog)
833-
834827
@classmethod
835828
@abstractmethod
836829
def file_name_extension(cls) -> str:
@@ -1167,15 +1160,9 @@ def _azul_file_url(self,
11671160
fetch=False,
11681161
**args))
11691162

1170-
def _azul_mirror_uri(self, file: JSON, source: SourceSpec) -> str | None:
1171-
if self.mirror_service.may_mirror_files_from_source(self.catalog, source):
1172-
file = self.metadata_plugin.file_class.from_index(file)
1173-
if self.mirror_service.may_mirror(self.catalog, file.size):
1174-
return self.mirror_file_service.mirror_uri(file)
1175-
else:
1176-
return None
1177-
else:
1178-
return None
1163+
def _azul_mirror_uri(self, source: SourceSpec, file: JSON) -> str | None:
1164+
file_cls = self.metadata_plugin.file_class
1165+
return self.mirror_service.mirror_uri(self.catalog, source, file_cls, file)
11791166

11801167
@cached_property
11811168
def manifest_content_hash(self) -> int:
@@ -1707,7 +1694,7 @@ def write_page_to(self,
17071694
if 'file_url' in column_mapping:
17081695
file['file_url'] = self._azul_file_url(file)
17091696
if 'file_mirror_uri' in column_mapping:
1710-
file['file_mirror_uri'] = self._azul_mirror_uri(file, source)
1697+
file['file_mirror_uri'] = self._azul_mirror_uri(source, file)
17111698
entities = [file]
17121699
self._extract_fields(field_path=field_path,
17131700
entities=entities,
@@ -1723,7 +1710,7 @@ def write_page_to(self,
17231710
if 'file_url' in column_mapping:
17241711
file['file_url'] = self._azul_file_url(file)
17251712
if 'file_mirror_uri' in column_mapping:
1726-
file['file_mirror_uri'] = self._azul_mirror_uri(file, source)
1713+
file['file_mirror_uri'] = self._azul_mirror_uri(source, file)
17271714
self._extract_fields(field_path=field_path,
17281715
entities=[file],
17291716
column_mapping=column_mapping,

src/azul/service/repository_controller.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,6 @@
4444
pass_thru_bool,
4545
)
4646
from azul.indexer.mirror_file_service import (
47-
BaseMirrorFileService,
4847
MirrorFileDownload,
4948
)
5049
from azul.plugins import (
@@ -85,9 +84,6 @@ class RepositoryController(ServiceAppController):
8584
def service(self) -> RepositoryService:
8685
return RepositoryService()
8786

88-
def mirror_service(self, catalog: CatalogName) -> BaseMirrorFileService:
89-
return self.service.mirror_service(catalog)
90-
9187
def repository_plugin(self, catalog: CatalogName) -> RepositoryPlugin:
9288
return self.service.repository_plugin(catalog)
9389

@@ -262,7 +258,7 @@ def download_file(self,
262258
plugin = self.repository_plugin(catalog)
263259

264260
if config.enable_mirroring:
265-
mirror_service = self.mirror_service(catalog)
261+
mirror_service = self.service.mirror_service.file_service(catalog)
266262
is_mirrored = mirror_service.info_exists(file)
267263
else:
268264
mirror_service, is_mirrored = None, False

0 commit comments

Comments
 (0)