Skip to content

Commit ca3897b

Browse files
committed
[a] Add mirror URI to service /index/file response (#7624)
1 parent a2b36c7 commit ca3897b

File tree

9 files changed

+77
-13
lines changed

9 files changed

+77
-13
lines changed

lambdas/service/app.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@
121121
# changes and reset the minor version to zero. Otherwise, increment only
122122
# the minor version for backwards compatible changes. A backwards
123123
# compatible change is one that does not require updates to clients.
124-
'version': '15.0',
124+
'version': '15.1',
125125
'description': fd(f'''
126126
# Overview
127127

lambdas/service/openapi.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"openapi": "3.0.1",
33
"info": {
44
"title": "azul-service-dev",
5-
"version": "15.0",
5+
"version": "15.1",
66
"description": "\n# Overview\n\nAzul is a REST web service for querying metadata associated with\nboth experimental and analysis data from a data repository. In order\nto deliver response times that make it suitable for interactive use\ncases, the set of metadata properties that it exposes for sorting,\nfiltering, and aggregation is limited. Azul provides a uniform view\nof the metadata over a range of diverse schemas, effectively\nshielding clients from changes in the schemas as they occur over\ntime. It does so, however, at the expense of detail in the set of\nmetadata properties it exposes and in the accuracy with which it\naggregates them.\n\nAzul denormalizes and aggregates metadata into several different\nindices for selected entity types. Metadata entities can be queried\nusing the [Index](#operations-tag-Index) endpoints.\n\nA set of indices forms a catalog. There is a default catalog called\n`dcp2` which will be used unless a\ndifferent catalog name is specified using the `catalog` query\nparameter. Metadata from different catalogs is completely\nindependent: a response obtained by querying one catalog does not\nnecessarily correlate to a response obtained by querying another\none. Two catalogs can contain metadata from the same sources or\ndifferent sources. It is only guaranteed that the body of a\nresponse by any given endpoint adheres to one schema,\nindependently of which catalog was specified in the request.\n\nAzul provides the ability to download data and metadata via the\n[Manifests](#operations-tag-Manifests) endpoints. The\n`curl` format manifests can be used to\ndownload data files. Other formats provide various views of the\nmetadata. Manifests can be generated for a selection of files using\nfilters. 
These filters are interchangeable with the filters used by\nthe [Index](#operations-tag-Index) endpoints.\n\nAzul also provides a [summary](#operations-Index-get_index_summary)\nview of indexed data.\n\n## Data model\n\nAny index, when queried, returns a JSON array of hits. Each hit\nrepresents a metadata entity. Nested in each hit is a summary of the\nproperties of entities associated with the hit. An entity is\nassociated either by a direct edge in the original metadata graph,\nor indirectly as a series of edges. The nested properties are\ngrouped by the type of the associated entity. The properties of all\ndata files associated with a particular sample, for example, are\nlisted under `hits[*].files` in a `/index/samples` response. It is\nimportant to note that while each _hit_ represents a discrete\nentity, the properties nested within that hit are the result of an\naggregation over potentially many associated entities.\n\nTo illustrate this, consider a data file that is part of two\nprojects (a project is a group of related experiments, typically by\none laboratory, institution or consortium). Querying the `files`\nindex for this file yields a hit looking something like:\n\n```\n{\n \"projects\": [\n {\n \"projectTitle\": \"Project One\"\n \"laboratory\": ...,\n ...\n },\n {\n \"projectTitle\": \"Project Two\"\n \"laboratory\": ...,\n ...\n }\n ],\n \"files\": [\n {\n \"format\": \"pdf\",\n \"name\": \"Team description.pdf\",\n ...\n }\n ]\n}\n```\n\nThis example hit contains two kinds of nested entities (a hit in an\nactual Azul response will contain more): There are the two projects\nentities, and the file itself. These nested entities contain\nselected metadata properties extracted in a consistent way. This\nmakes filtering and sorting simple.\n\nAlso notice that there is only one file. 
When querying a particular\nindex, the corresponding entity will always be a singleton like\nthis.\n\n\n## Contact us\n\nFor technical support please file an issue at\n[GitHub](https://github.com/DataBiosphere/azul/issues) or email\n`azul-group@ucsc.edu`. To report a security concern or misconduct please email\n`azul-group@ucsc.edu`.\n"
77
},
88
"tags": [

src/azul/plugins/metadata/anvil/service/response.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,8 @@ def _pivotal_entity(self,
218218
inner_entity['azul_url'] = self._file_url(uuid=json_str(inner_entity['document_id']),
219219
version=json_str(inner_entity['version']),
220220
drs_uri=optional(json_str, inner_entity['drs_uri']))
221+
inner_entity['azul_mirror_uri'] = self._file_mirror_uri(inner_entity)
222+
inner_entity.pop('uuid', None)
221223
inner_entity.pop('version', None)
222224
return inner_entity
223225

src/azul/plugins/metadata/hca/service/response.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -416,7 +416,8 @@ def make_file(self, file: JSON) -> JSON:
416416
'drs_uri': file.get('drs_uri'),
417417
'azul_url': self._file_url(uuid=json_str(file['uuid']),
418418
version=json_str(file['version']),
419-
drs_uri=optional(json_str, file['drs_uri']))
419+
drs_uri=optional(json_str, file['drs_uri'])),
420+
'azul_mirror_uri': self._file_mirror_uri(file),
420421
}
421422
return translated_file
422423

src/azul/service/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
)
3737
from azul.plugins import (
3838
FieldName,
39+
File,
3940
MetadataPlugin,
4041
)
4142
from azul.types import (
@@ -433,3 +434,8 @@ def __call__(self,
433434
fetch: bool = True,
434435
**params: str
435436
) -> mutable_furl: ...
437+
438+
439+
class FileMirrorUriFunc(Protocol):
    """
    Structural type of a callable that takes a `File` and returns a string,
    the URI of the location where that file is mirrored.
    """

    def __call__(self, file: File) -> str: ...

src/azul/service/repository_controller.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ def search(self,
107107
response = self.service.search(catalog=catalog,
108108
entity_type=entity_type,
109109
file_url_func=self.file_url_func,
110+
file_mirror_uri_func=self.mirror_service(catalog).mirror_uri,
110111
item_id=item_id,
111112
filters=filters,
112113
pagination=pagination)

src/azul/service/repository_service.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,17 @@
3232
CatalogName,
3333
config,
3434
)
35+
from azul.indexer.mirror_service import (
36+
BaseMirrorService,
37+
)
3538
from azul.plugins import (
3639
File,
3740
RepositoryPlugin,
3841
dotted,
3942
)
4043
from azul.service import (
4144
BadArgumentException,
45+
FileMirrorUriFunc,
4246
FileUrlFunc,
4347
Filters,
4448
)
@@ -73,6 +77,7 @@ def __init__(self, entity_type: str, entity_id: str):
7377
class SearchResponseStage(_ElasticsearchStage[ResponseTriple, MutableJSON],
7478
metaclass=ABCMeta):
7579
file_url_func: FileUrlFunc
80+
file_mirror_uri_func: FileMirrorUriFunc
7681

7782
def prepare_request(self, request: Search) -> Search:
7883
return request
@@ -93,6 +98,13 @@ def _file_url(self, *, uuid: str, version: str, drs_uri: str | None) -> str | No
9398
file_uuid=uuid,
9499
version=version))
95100

101+
def _file_mirror_uri(self, file: JSON) -> str | None:
    """
    Return the mirror URI for the given file, or None if the file may not
    be mirrored.

    :param file: The file as JSON from the index. It is converted to an
                 instance of the plugin's file class via `from_index`.

    :return: The result of `self.file_mirror_uri_func` for the converted
             file if `BaseMirrorService.may_mirror` accepts the current
             catalog and the file's size, None otherwise.
    """
    # Use a separate name so that the JSON-typed parameter is not rebound to
    # an object of a different type.
    indexed_file = self.plugin.file_class.from_index(file)
    if not BaseMirrorService.may_mirror(self.catalog, indexed_file.size):
        return None
    return self.file_mirror_uri_func(indexed_file)
107+
96108

97109
class SummaryResponseStage(ElasticsearchStage[JSON, MutableJSON],
98110
metaclass=ABCMeta):
@@ -113,6 +125,7 @@ def search(self,
113125
catalog: CatalogName,
114126
entity_type: str,
115127
file_url_func: FileUrlFunc,
128+
file_mirror_uri_func: FileMirrorUriFunc,
116129
item_id: str | None,
117130
filters: Filters,
118131
pagination: Pagination
@@ -127,6 +140,8 @@ def search(self,
127140
:param file_url_func: A function that is used only when getting a *list* of files data.
128141
It creates the files URL based on info from the request. It should have the type
129142
signature `(uuid: str, **params) -> str`
143+
:param file_mirror_uri_func: Like `file_url_func`, but creates URIs to
144+
where the files are mirrored. It should have the type signature `(File) -> str`.
130145
:return: The Elasticsearch JSON response
131146
"""
132147
if item_id is not None:
@@ -138,7 +153,8 @@ def search(self,
138153
pagination=pagination,
139154
aggregate=item_id is None,
140155
entity_type=entity_type,
141-
file_url_func=file_url_func)
156+
file_url_func=file_url_func,
157+
file_mirror_uri_func=file_mirror_uri_func)
142158

143159
special_fields = self.metadata_plugin(catalog).special_fields
144160
for hit in response['hits']:
@@ -158,7 +174,8 @@ def _search(self,
158174
aggregate: bool,
159175
filters: Filters,
160176
pagination: Pagination,
161-
file_url_func: FileUrlFunc
177+
file_url_func: FileUrlFunc,
178+
file_mirror_uri_func: FileMirrorUriFunc,
162179
) -> MutableJSON:
163180
"""
164181
This function does the whole transformation process. It takes the path
@@ -217,7 +234,8 @@ def _search(self,
217234
chain = response_stage_cls(service=self,
218235
catalog=catalog,
219236
entity_type=entity_type,
220-
file_url_func=file_url_func).wrap(chain)
237+
file_url_func=file_url_func,
238+
file_mirror_uri_func=file_mirror_uri_func).wrap(chain)
221239

222240
request = self.create_request(catalog, entity_type)
223241
request = chain.prepare_request(request)

test/service/test_response.py

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@
4242
from azul.collections import (
4343
none_safe_key,
4444
)
45+
from azul.deployment import (
46+
aws,
47+
)
4548
from azul.indexer import (
4649
BundleFQID,
4750
Prefix,
@@ -139,6 +142,11 @@ def tearDownClass(cls):
139142
def file_url_func(self):
140143
return self._app.file_url
141144

145+
@property
def file_mirror_uri_func(self):
    """
    The `mirror_uri` attribute of the mirror service that the app's
    repository controller provides for this test case's catalog.
    """
    controller = self._app.repository_controller
    return controller.mirror_service(self.catalog).mirror_uri
149+
142150
def _get_hits(self, entity_type: str, entity_id: str):
143151
"""
144152
Fetches hits from ES instance searching for a particular entity ID
@@ -167,7 +175,8 @@ def _response_stage(self, entity_type: str) -> HCASearchResponseStage:
167175
return HCASearchResponseStage(service=self.index_service,
168176
entity_type=entity_type,
169177
catalog=self.catalog,
170-
file_url_func=self.file_url_func
178+
file_url_func=self.file_url_func,
179+
file_mirror_uri_func=self.file_mirror_uri_func)
171180

172181
@property
173182
def paginations(self):
@@ -247,6 +256,8 @@ def test_response_stage_files(self):
247256
'azul_url': f'{self.base_url}/repository/files/'
248257
f'7b07f99e-4a8a-4ad0-bd4f-db0d7a00c7bb'
249258
f'?catalog=test&version=2018-11-02T11%3A33%3A44.698028Z',
259+
'azul_mirror_uri': f's3://{aws.mirror_bucket}/file/'
260+
f'77337cb51b2e584b5ae1b99db6c163b988cbc5b894dda2f5d22424978c3bfc7a.sha256',
250261
'drs_uri': f'drs://{self._drs_domain_name}/'
251262
f'7b07f99e-4a8a-4ad0-bd4f-db0d7a00c7bb?version=2018-11-02T11%3A33%3A44.698028Z',
252263
'uuid': '7b07f99e-4a8a-4ad0-bd4f-db0d7a00c7bb',
@@ -976,6 +987,8 @@ def test_response_stage_files_file(self):
976987
'azul_url': f'{self.base_url}/repository/files/'
977988
f'a8b8479d-cfa9-4f74-909f-49552439e698'
978989
f'?catalog=test&version=2019-10-09T17%3A22%3A51.560099Z',
990+
'azul_mirror_uri': f's3://{aws.mirror_bucket}/file/'
991+
f'709fede4736213f0f71ae4d76719fd51fa402a9112582a4c52983973cb7d7e47.sha256',
979992
'drs_uri': f'drs://{self._drs_domain_name}/'
980993
f'a8b8479d-cfa9-4f74-909f-49552439e698?version=2019-10-09T17%3A22%3A51.560099Z',
981994
'uuid': 'a8b8479d-cfa9-4f74-909f-49552439e698',
@@ -2958,7 +2971,10 @@ def test_matrices_tree(self):
29582971
'azul_url': str(self.base_url.set(
29592972
path='/repository/files/bd98f428-881e-501a-ac16-24f27a68ce2f',
29602973
args=dict(catalog='test', version='2021-02-11T23:11:45.000000Z')
2961-
))
2974+
)),
2975+
'azul_mirror_uri': f's3://{aws.mirror_bucket}/file/'
2976+
f'6a6483c2e78da77017e912a4d350f141'
2977+
f'bda1ec7b269f20ca718b55145ee5c83c.sha256'
29622978
}
29632979
]
29642980
}
@@ -2990,7 +3006,10 @@ def test_matrices_tree(self):
29903006
'azul_url': str(self.base_url.set(
29913007
path='/repository/files/538faa28-3235-5e4b-a998-5672e2d964e8',
29923008
args=dict(catalog='test', version='2020-12-03T10:39:17.144517Z')
2993-
))
3009+
)),
3010+
'azul_mirror_uri': f's3://{aws.mirror_bucket}/file/'
3011+
f'edb8e0139fece9702d89ae5fe7f761c4'
3012+
f'1c291ef6a71129c6420857e025228a24.sha256',
29943013
},
29953014
{
29963015
# Supplementary file, source from submitter_id
@@ -3012,7 +3031,10 @@ def test_matrices_tree(self):
30123031
'azul_url': str(self.base_url.set(
30133032
path='/repository/files/6c142250-567c-5b63-bd4f-0d78499863f8',
30143033
args=dict(catalog='test', version='2020-12-03T10:39:17.144517Z')
3015-
))
3034+
)),
3035+
'azul_mirror_uri': f's3://{aws.mirror_bucket}/file/'
3036+
f'cb1467f4d23a2429b4928943b51652b3'
3037+
f'2edb949099250d28cf400d13074f5440.sha256',
30163038
},
30173039
{
30183040
# Supplementary file, source from submitter_id
@@ -3034,7 +3056,10 @@ def test_matrices_tree(self):
30343056
'azul_url': str(self.base_url.set(
30353057
path='/repository/files/8d2ba1c1-bc9f-5c2a-a74d-fe5e09bdfb18',
30363058
args=dict(catalog='test', version='2020-12-03T10:39:17.144517Z')
3037-
))
3059+
)),
3060+
'azul_mirror_uri': f's3://{aws.mirror_bucket}/file/'
3061+
f'724b2c0ddf33c662b362179bc6ca90cd'
3062+
f'866b99b340d061463c35d27cfd5a23c5.sha256',
30383063
}
30393064
]
30403065
}
@@ -3757,6 +3782,8 @@ def test_contributed_analyses_matrix(self):
37573782
version='2022-07-26T00:16:47.748000Z'
37583783
)
37593784
))
3785+
mirror_uri = (f's3://{aws.mirror_bucket}/file/'
3786+
f'649c45bd2f01b028c974c7e2a9604b9cf564d8afcf528eb299eaf3d7fe92bae3.sha256')
37603787
expected_file = {
37613788
'contentDescription': ['Count matrix', 'Feature table'],
37623789
'format': 'csv.gz',
@@ -3772,6 +3799,7 @@ def test_contributed_analyses_matrix(self):
37723799
'_541cc0bb-c54f-4a7e-8cdd-1a70cbd2f20c'
37733800
'_596c26ba-2c35-4396-8c7c-50c825eb4e75',
37743801
'azul_url': file_url,
3802+
'azul_mirror_uri': mirror_uri,
37753803
}
37763804
expected_tree = {
37773805
'genusSpecies': {'Homo sapiens': {

test/service/test_response_anvil.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
import requests
22

3+
from azul.deployment import (
4+
aws,
5+
)
36
from azul.logging import (
47
configure_test_logging,
58
)
@@ -1271,7 +1274,8 @@ def test_entity_indices(self):
12711274
'azul_url': str(self.base_url.set(
12721275
path='/repository/files/15b76f9c-6b46-433f-851d-34e89f1b9ba6',
12731276
args=dict(catalog='test', version=self.version)
1274-
))
1277+
)),
1278+
'azul_mirror_uri': f's3://{aws.mirror_bucket}/file/beec606ee0aa299fdf913f4259316622.md5'
12751279
}
12761280
]
12771281
},
@@ -1349,7 +1353,8 @@ def test_entity_indices(self):
13491353
'azul_url': str(self.base_url.set(
13501354
path='/repository/files/3b17377b-16b1-431c-9967-e5d01fc5923f',
13511355
args=dict(catalog='test', version=self.version)
1352-
))
1356+
)),
1357+
'azul_mirror_uri': f's3://{aws.mirror_bucket}/file/7cd9fd7b54a8bf380e44e93706f1fa2d.md5'
13531358
}
13541359
]
13551360
}

0 commit comments

Comments
 (0)