Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
b4de0da
Remove files.version from HCA response (#6793)
dsotirho-ucsc Jan 5, 2026
d220c03
[A] Remove sort & filter key sampleId from HCA (#6793)
dsotirho-ucsc Jan 5, 2026
4ba906f
Refactor HCA aggregators
dsotirho-ucsc Jan 5, 2026
0306ca3
[r] Remove aggregation of cell_suspensions.document_id in HCA (#6793)
dsotirho-ucsc Jan 6, 2026
4b2588e
[r] Remove aggregation of cell_suspensions.biomaterial_id in HCA (#6793)
dsotirho-ucsc Jan 6, 2026
d40a5d6
[r] Remove aggregation of cell_lines.document_id in HCA (#6793)
dsotirho-ucsc Jan 6, 2026
7499e44
[r] Remove aggregation of cell_lines.biomaterial_id in HCA (#6793)
dsotirho-ucsc Jan 6, 2026
338dd1a
[r] Remove aggregation of donors.biomaterial_id in HCA (#6793)
dsotirho-ucsc Jan 6, 2026
cef1772
[r] Remove aggregation of organoids.document_id in HCA (#6793)
dsotirho-ucsc Jan 6, 2026
941b6f9
[r] Remove aggregation of organoids.biomaterial_id in HCA (#6793)
dsotirho-ucsc Jan 6, 2026
3d9af9b
[r] Remove aggregation of sequencing_inputs.document_id in HCA (#6793)
dsotirho-ucsc Jan 6, 2026
e38d74c
[r] Remove aggregation of sequencing_inputs.biomaterial_id in HCA (#6793)
dsotirho-ucsc Jan 6, 2026
bf94c2a
[r] Remove aggregation of sequencing_processes.document_id in HCA (#6793)
dsotirho-ucsc Jan 6, 2026
f0b15b3
[r] Remove aggregation of specimens.document_id in HCA (#6793)
dsotirho-ucsc Jan 6, 2026
219320b
[r] Remove aggregation of specimens.biomaterial_id in HCA (#6793)
dsotirho-ucsc Jan 6, 2026
6f757e2
[r] Remove aggregation of samples.document_id in HCA (#6793)
dsotirho-ucsc Jan 6, 2026
233aeb4
[r] Remove aggregation of samples.biomaterial_id in HCA (#6793)
dsotirho-ucsc Jan 6, 2026
eb454a3
Refactor AnVIL aggregators
dsotirho-ucsc Jan 7, 2026
f9a494b
[r] Remove aggregation of activities.activity_id in AnVIL (#6793)
dsotirho-ucsc Jan 7, 2026
18a3dc8
[r] Remove aggregation of activities.document_id in AnVIL (#6793)
dsotirho-ucsc Jan 7, 2026
a53f8a3
[r] Remove aggregation of activities.source_datarepo_row_ids in AnVIL (#6793)
dsotirho-ucsc Jan 7, 2026
17e9939
[r] Remove aggregation of biosamples.biosample_id in AnVIL (#6793)
dsotirho-ucsc Jan 7, 2026
a08f757
[r] Remove aggregation of biosamples.document_id in AnVIL (#6793)
dsotirho-ucsc Jan 7, 2026
346dfa7
[r] Remove aggregation of biosamples.source_datarepo_row_ids in AnVIL (#6793)
dsotirho-ucsc Jan 7, 2026
bf334f8
[r] Remove aggregation of datasets.source_datarepo_row_ids in AnVIL (#6793)
dsotirho-ucsc Jan 7, 2026
c13cbab
[r] Remove aggregation of diagnoses.diagnosis_id in AnVIL (#6793)
dsotirho-ucsc Jan 7, 2026
926df11
[r] Remove aggregation of diagnoses.document_id in AnVIL (#6793)
dsotirho-ucsc Jan 7, 2026
acac60e
[r] Remove aggregation of diagnoses.source_datarepo_row_ids in AnVIL (#6793)
dsotirho-ucsc Jan 7, 2026
f7368bd
[r] Remove aggregation of donors.document_id in AnVIL (#6793)
dsotirho-ucsc Jan 7, 2026
4077d82
[r] Remove aggregation of donors.donor_id in AnVIL (#6793)
dsotirho-ucsc Jan 7, 2026
06690ab
[r] Remove aggregation of donors.source_datarepo_row_ids in AnVIL (#6793)
dsotirho-ucsc Jan 7, 2026
4fde1d3
[r] Remove aggregation of files.document_id in AnVIL (#6793)
dsotirho-ucsc Jan 7, 2026
11c58fa
[r] Remove aggregation of files.drs_uri in AnVIL (#6793)
dsotirho-ucsc Jan 7, 2026
38a6058
[r] Remove aggregation of files.file_id in AnVIL (#6793)
dsotirho-ucsc Jan 7, 2026
fb233bf
[r] Remove aggregation of files.file_md5sum in AnVIL (#6793)
dsotirho-ucsc Jan 7, 2026
b1116ea
[r] Remove aggregation of files.file_name in AnVIL (#6793)
dsotirho-ucsc Jan 7, 2026
5d58065
[r] Remove aggregation of files.source_datarepo_row_ids in AnVIL (#6793)
dsotirho-ucsc Jan 7, 2026
6e39e8b
[r] Remove aggregation of files.version in AnVIL (#6793)
dsotirho-ucsc Jan 7, 2026
26bce55
[r] Increase accumulator limit for donors.development_stage in HCA (#6793)
dsotirho-ucsc Jan 7, 2026
a2bac0f
[r] Increase accumulator limit for donors.organism_age_range in HCA (#6793)
dsotirho-ucsc Jan 7, 2026
936def6
[r] Increase accumulator limit for donors.organism_age in HCA (#6793)
dsotirho-ucsc Jan 7, 2026
be80b78
[r] Increase accumulator limit for matrices.file in HCA (#6793)
dsotirho-ucsc Jan 7, 2026
c90ae5c
[r] Increase accumulator limit for diagnoses.disease in AnVIL (#6793)
dsotirho-ucsc Dec 18, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion lambdas/service/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@
# changes and reset the minor version to zero. Otherwise, increment only
# the minor version for backwards compatible changes. A backwards
# compatible change is one that does not require updates to clients.
'version': '15.1',
'version': '16.0',
'description': fd(f'''
# Overview

Expand Down
131 changes: 8 additions & 123 deletions lambdas/service/openapi.json

Large diffs are not rendered by default.

67 changes: 61 additions & 6 deletions src/azul/plugins/metadata/anvil/indexer/aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
Accumulator,
DistinctAccumulator,
GroupingAggregator,
SetAccumulator,
SetOfDictAccumulator,
SimpleAggregator,
SumAccumulator,
Expand All @@ -23,13 +24,28 @@


class ActivityAggregator(SimpleAggregator):
pass

def _accumulator(self, field: str) -> Accumulator | None:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Again, don't see the point of this

if field in ('activity_id', 'document_id', 'source_datarepo_row_ids'):
# Added to the files aggregate in order to be included in manifests
if self.outer_entity_type == 'files':
return super()._accumulator(field)
else:
return None
else:
return super()._accumulator(field)
Comment on lines +29 to +36
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nitpick: each of these could be rewritten as:

Suggested change
if field in ('activity_id', 'document_id', 'source_datarepo_row_ids'):
# Added to the files aggregate in order to be included in manifests
if self.outer_entity_type == 'files':
return super()._accumulator(field)
else:
return None
else:
return super()._accumulator(field)
# Added to the files aggregate in order to be included in manifests
if field in ('activity_id', 'document_id', 'source_datarepo_row_ids') and self.outer_entity_type != 'files':
return None
else:
return super()._accumulator(field)

Which I personally think is cleaner since it reduces the number of return pathways and keeps a consistent, flat level of nesting. But it does cause issues with line length, and there's also an argument to be made that keeping the return statements separate is the more resilient approach.



class BiosampleAggregator(SimpleAggregator):

def _accumulator(self, field: str) -> Accumulator | None:
if field == 'donor_age_at_collection':
if field in ('biosample_id', 'document_id', 'source_datarepo_row_ids'):
# Added to the files aggregate in order to be included in manifests
if self.outer_entity_type == 'files':
return super()._accumulator(field)
else:
return None
elif field == 'donor_age_at_collection':
return SetOfDictAccumulator(max_size=100,
key=compose_keys(none_safe_tuple_key(none_last=True),
itemgetter('lte', 'gte')))
Expand All @@ -38,13 +54,33 @@ def _accumulator(self, field: str) -> Accumulator | None:


class DatasetAggregator(SimpleAggregator):
pass

def _accumulator(self, field: str) -> Accumulator | None:
# dataset.document_id aggregation is required for the creation of manifests
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Either "creating manifests" or "creation of manifests", not "creating of"

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also it would be useful to record why the field is required, i.e., what the symptom of failure would be if it were to be removed.

if field == 'document_id':
return super()._accumulator(field)
elif field == 'source_datarepo_row_ids':
# Added to the files aggregate in order to be included in manifests
if self.outer_entity_type == 'files':
return super()._accumulator(field)
else:
return None
else:
return super()._accumulator(field)


class DiagnosisAggregator(SimpleAggregator):

def _accumulator(self, field: str) -> Accumulator | None:
if field in ('diagnosis_age', 'onset_age'):
if field in ('diagnosis_id', 'document_id', 'source_datarepo_row_ids'):
# Added to the files aggregate in order to be included in manifests
if self.outer_entity_type == 'files':
return super()._accumulator(field)
else:
return None
elif field == 'disease':
return SetAccumulator(max_size=14100)
elif field in ('diagnosis_age', 'onset_age'):
return SetOfDictAccumulator(max_size=100,
key=compose_keys(none_safe_tuple_key(none_last=True),
itemgetter('lte', 'gte')))
Expand All @@ -53,7 +89,16 @@ def _accumulator(self, field: str) -> Accumulator | None:


class DonorAggregator(SimpleAggregator):
pass

def _accumulator(self, field: str) -> Accumulator | None:
if field in ('document_id', 'donor_id', 'source_datarepo_row_ids'):
# Added to the files aggregate in order to be included in manifests
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Grammar nit

Suggested change
# Added to the files aggregate in order to be included in manifests
# Add to the files aggregate so that it can be included in manifests

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Adding fixup commits for this would have nearly doubled the count of commits, so I've squashed this change into the existing commits.

if self.outer_entity_type == 'files':
return super()._accumulator(field)
else:
return None
else:
return super()._accumulator(field)


class FileAggregator(GroupingAggregator):
Expand All @@ -72,7 +117,17 @@ def _group_keys(self, entity) -> tuple[Any, ...]:
return entity['file_format'],

def _accumulator(self, field: str) -> Accumulator | None:
if field in ('count', 'file_size'):
if field in (
'document_id',
'drs_uri',
'file_id',
'file_md5sum',
'file_name',
'source_datarepo_row_ids',
'version',
):
return None
elif field in ('count', 'file_size'):
return DistinctAccumulator(SumAccumulator())
else:
return super()._accumulator(field)
3 changes: 1 addition & 2 deletions src/azul/plugins/metadata/hca/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ def exposed_indices(self) -> dict[EntityType, Sorting]:
files=Sorting(field_name='fileName'),
projects=Sorting(field_name='projectTitle',
max_page_size=75),
samples=Sorting(field_name='sampleId')
samples=Sorting(field_name='entryId')
)

@property
Expand Down Expand Up @@ -276,7 +276,6 @@ def _field_mapping(self) -> InverseFieldMapping:
'donor_count': 'donorCount'
},
'samples': {
'biomaterial_id': 'sampleId',
'entity_type': 'sampleEntityType',
'organ': 'organ',
'organ_part': 'organPart',
Expand Down
117 changes: 99 additions & 18 deletions src/azul/plugins/metadata/hca/indexer/aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,11 +116,39 @@ def _default_accumulator(self) -> Accumulator | None:


class SampleAggregator(SimpleAggregator):
    """
    Aggregator for sample entities.

    The `biomaterial_id` and `document_id` fields are only aggregated when
    the outer entity type is `files`; every other field is handled by the
    superclass.
    """

    def _accumulator(self, field) -> Accumulator | None:
        """
        Return the accumulator for the given field, or None if the field
        should not be aggregated for the current outer entity type.
        """
        # These identifiers are kept in the files aggregate only, so that
        # they can be included in manifests.
        files_only = field in ('biomaterial_id', 'document_id')
        if files_only and self.outer_entity_type != 'files':
            return None
        return super()._accumulator(field)


class SpecimenAggregator(SimpleAggregator):
    """
    Aggregator for specimen entities.

    `biomaterial_id` is only aggregated for the `files` outer entity type,
    `document_id` only for `samples` and `files`. Every other field is
    handled by the superclass.
    """

    def _accumulator(self, field) -> Accumulator | None:
        """
        Return the accumulator for the given field, or None if the field
        should not be aggregated for the current outer entity type.
        """
        if field == 'biomaterial_id':
            # Kept in the files aggregate so that it can be included in
            # manifests.
            aggregate = self.outer_entity_type == 'files'
        elif field == 'document_id':
            # Kept in the files aggregate so that it can be included in
            # manifests. It is also kept in the samples aggregate for the
            # calculation of the summary response field `specimenCount`,
            # which is okay since there should only be one specimen inner
            # entity in any samples outer entity.
            aggregate = self.outer_entity_type in ('samples', 'files')
        else:
            aggregate = True
        return super()._accumulator(field) if aggregate else None


class CellSuspensionAggregator(GroupingAggregator):
Expand All @@ -143,14 +171,29 @@ def _group_keys(self, entity) -> tuple[Any, ...]:
return frozenset(entity['organ']),

def _accumulator(self, field) -> Accumulator | None:
if field in self.cell_count_fields:
if field in ('biomaterial_id', 'document_id'):
# Added to the files aggregate in order to be included in manifests
if self.outer_entity_type == 'files':
return super()._accumulator(field)
else:
return None
elif field in self.cell_count_fields:
return DistinctAccumulator(SumAccumulator())
else:
return super()._accumulator(field)


class CellLineAggregator(SimpleAggregator):
    """
    Aggregator for cell line entities.

    The `biomaterial_id` and `document_id` fields are only aggregated when
    the outer entity type is `files`; every other field is handled by the
    superclass.
    """

    def _accumulator(self, field) -> Accumulator | None:
        """
        Return the accumulator for the given field, or None if the field
        should not be aggregated for the current outer entity type.
        """
        # These identifiers are kept in the files aggregate only, so that
        # they can be included in manifests.
        files_only = field in ('biomaterial_id', 'document_id')
        if files_only and self.outer_entity_type != 'files':
            return None
        return super()._accumulator(field)


class DonorOrganismAggregator(SimpleAggregator):
Expand All @@ -162,14 +205,12 @@ def _transform_entity(self, entity: JSON) -> JSON:
}

def _accumulator(self, field) -> Accumulator | None:
if field == 'organism_age_range':
return SetAccumulator(max_size=100)
elif field == 'organism_age':
return SetOfDictAccumulator(max_size=100,
key=compose_keys(none_safe_tuple_key(none_last=True),
none_safe_itemgetter('value', 'unit')))
elif field == 'donor_count':
return UniqueValueCountAccumulator()
if field == 'biomaterial_id':
# Added to the files aggregate in order to be included in manifests
if self.outer_entity_type == 'files':
return super()._accumulator(field)
else:
return None
elif field == 'document_id':
# If any donor IDs are missing from the aggregate, those donors will
# be omitted during the verbatim handover. Donors are a "hot" entity
Expand All @@ -179,17 +220,38 @@ def _accumulator(self, field) -> Accumulator | None:
# FIXME: Enforce that hot entity types are completely aggregated
# https://github.com/DataBiosphere/azul/issues/6793
return SetAccumulator(max_size=100)
elif field == 'development_stage':
return SetAccumulator(max_size=200)
elif field == 'organism_age_range':
return SetAccumulator(max_size=200)
elif field == 'organism_age':
Comment on lines +224 to +227
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Was it intentional to keep these separate?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No. I've combined them in the fixup commit.

return SetOfDictAccumulator(max_size=200,
key=compose_keys(none_safe_tuple_key(none_last=True),
none_safe_itemgetter('value', 'unit')))
elif field == 'donor_count':
return UniqueValueCountAccumulator()
else:
return super()._accumulator(field)


class OrganoidAggregator(SimpleAggregator):
    """
    Aggregator for organoid entities.

    The `biomaterial_id` and `document_id` fields are only aggregated when
    the outer entity type is `files`; every other field is handled by the
    superclass.
    """

    def _accumulator(self, field) -> Accumulator | None:
        """
        Return the accumulator for the given field, or None if the field
        should not be aggregated for the current outer entity type.
        """
        # These identifiers are kept in the files aggregate only, so that
        # they can be included in manifests.
        files_only = field in ('biomaterial_id', 'document_id')
        if files_only and self.outer_entity_type != 'files':
            return None
        return super()._accumulator(field)


class ProjectAggregator(SimpleAggregator):

def _accumulator(self, field) -> Accumulator | None:
# Aggregation of `document_id` is required to allow filters using
# the `projectId` field on non-project endpoints.
if field == 'document_id':
return SetAccumulator(max_size=100)
elif field in ('project_description',
Expand All @@ -212,9 +274,7 @@ def _accumulator(self, field) -> Accumulator | None:
class ProtocolAggregator(SimpleAggregator):

def _accumulator(self, field) -> Accumulator | None:
if field == 'assay_type':
return FrequencySetAccumulator(max_size=100)
elif field == 'document_id':
if field == 'document_id':
# If any protocol IDs are missing from the aggregate, those
# protocols may be omitted during the verbatim handover. Some
# protocols are "hot" entity types, and we can't track their hubs in
Expand All @@ -223,6 +283,8 @@ def _accumulator(self, field) -> Accumulator | None:
# FIXME: Enforce that hot entity types are completely aggregated
# https://github.com/DataBiosphere/azul/issues/6793
return SetAccumulator(max_size=100)
elif field == 'assay_type':
return FrequencySetAccumulator(max_size=100)
else:
return super()._accumulator(field)

Expand All @@ -231,11 +293,30 @@ def _default_accumulator(self) -> Accumulator | None:


class SequencingInputAggregator(SimpleAggregator):
    """
    Aggregator for sequencing input entities.

    The `biomaterial_id` and `document_id` fields are only aggregated when
    the outer entity type is `files`; every other field is handled by the
    superclass.
    """

    def _accumulator(self, field) -> Accumulator | None:
        """
        Return the accumulator for the given field, or None if the field
        should not be aggregated for the current outer entity type.
        """
        # These identifiers are kept in the files aggregate only, so that
        # they can be included in manifests.
        files_only = field in ('biomaterial_id', 'document_id')
        if files_only and self.outer_entity_type != 'files':
            return None
        return super()._accumulator(field)


class SequencingProcessAggregator(SimpleAggregator):

def _accumulator(self, field) -> Accumulator | None:
if field == 'document_id':
# Added to the files aggregate in order to be included in manifests
if self.outer_entity_type == 'files':
return super()._accumulator(field)
else:
return None
else:
return super()._accumulator(field)

def _default_accumulator(self) -> Accumulator | None:
return SetAccumulator(max_size=10)

Expand All @@ -246,7 +327,7 @@ def _accumulator(self, field) -> Accumulator | None:
if field == 'document_id':
return None
elif field == 'file':
return DictAccumulator(max_size=100, key=itemgetter('uuid'))
return DictAccumulator(max_size=600, key=itemgetter('uuid'))
else:
return SetAccumulator()

Expand Down
10 changes: 3 additions & 7 deletions src/azul/plugins/metadata/hca/service/response.py
Original file line number Diff line number Diff line change
Expand Up @@ -411,7 +411,6 @@ def make_file(self, file: JSON) -> JSON:
'size': file.get('size'),
'fileSource': file.get('file_source'),
self.plugin.special_fields.file_uuid.name_in_hit: file.get('uuid'),
'version': file.get('version'),
'matrixCellCount': file.get('matrix_cell_count'),
'drs_uri': file.get('drs_uri'),
'azul_url': self._file_url(uuid=json_str(file['uuid']),
Expand All @@ -422,7 +421,6 @@ def make_file(self, file: JSON) -> JSON:

def make_specimen(self, specimen) -> MutableJSON:
return {
'id': specimen['biomaterial_id'],
'organ': specimen.get('organ', None),
'organPart': specimen.get('organ_part', None),
'disease': specimen.get('disease', None),
Expand Down Expand Up @@ -452,7 +450,6 @@ def make_cell_suspensions(self, entry) -> MutableJSONs:

def make_cell_line(self, cell_line) -> MutableJSON:
return {
'id': cell_line['biomaterial_id'],
'cellLineType': cell_line.get('cell_line_type', None),
'modelOrgan': cell_line.get('model_organ', None),
}
Expand All @@ -462,7 +459,6 @@ def make_cell_lines(self, entry) -> MutableJSONs:

def make_donor(self, donor) -> MutableJSON:
return {
'id': donor['biomaterial_id'],
'donorCount': donor.get('donor_count', None),
'developmentStage': donor.get('development_stage', None),
'genusSpecies': donor.get('genus_species', None),
Expand All @@ -477,7 +473,6 @@ def make_donors(self, entry) -> MutableJSONs:

def make_organoid(self, organoid) -> MutableJSON:
return {
'id': organoid['biomaterial_id'],
'modelOrgan': organoid.get('model_organ', None),
'modelOrganPart': organoid.get('model_organ_part', None)
}
Expand All @@ -486,11 +481,12 @@ def make_organoids(self, entry) -> MutableJSONs:
return [self.make_organoid(organoid) for organoid in entry['contents']['organoids']]

def make_sample(self, sample, entity_dict, entity_type) -> MutableJSON:
is_aggregate = isinstance(sample['document_id'], list)
organ_prop = 'organ' if entity_type == 'specimens' else 'model_organ'
effective_organ = sample[organ_prop]
is_aggregate = isinstance(effective_organ, list)
return {
'sampleEntityType': [entity_type] if is_aggregate else entity_type,
'effectiveOrgan': sample[organ_prop],
'effectiveOrgan': effective_organ,
**entity_dict
}

Expand Down
Loading
Loading