DataBiosphere
diff --git a/src/azul/plugins/metadata/anvil/__init__.py b/src/azul/plugins/metadata/anvil/__init__.py
Lines changed: 14 additions & 0 deletions
diff --git a/src/azul/plugins/metadata/anvil/indexer/aggregate.py b/src/azul/plugins/metadata/anvil/indexer/aggregate.py
Lines changed: 39 additions & 6 deletions
diff --git a/src/azul/plugins/metadata/hca/__init__.py b/src/azul/plugins/metadata/hca/__init__.py
Lines changed: 1 addition & 10 deletions
diff --git a/src/azul/plugins/metadata/hca/indexer/aggregate.py b/src/azul/plugins/metadata/hca/indexer/aggregate.py
Lines changed: 67 additions & 30 deletions
diff --git a/src/azul/plugins/metadata/hca/service/response.py b/src/azul/plugins/metadata/hca/service/response.py
Lines changed: 6 additions & 5 deletions
@@ -314,12 +314,26 @@ def manifest_config(self) -> ManifestConfig:
         # Note that there is a brittle coupling that must be maintained between
         # the fields listed here and those used in `self._field_mapping`.
         fields_to_omit_from_manifest: list[FieldPath] = [
+            ('contents', 'activities', 'activity_id'),
             ('contents', 'activities', 'activity_table'),
+            ('contents', 'activities', 'document_id'),
+            ('contents', 'activities', 'source_datarepo_row_ids'),
+            ('contents', 'biosamples', 'biosample_id'),
+            ('contents', 'biosamples', 'document_id'),
+            ('contents', 'biosamples', 'source_datarepo_row_ids'),
+            ('contents', 'datasets', 'document_id'),
             # We omit the `duos_id` field from manifests since there is only one
             # DUOS bundle per dataset, and that bundle only contributes to outer
             # entities of the `datasets` type, not to entities of the other
             # types, such as files, which the manifest is generated from.
             ('contents', 'datasets', 'duos_id'),
+            ('contents', 'datasets', 'source_datarepo_row_ids'),
+            ('contents', 'diagnoses', 'diagnosis_id'),
+            ('contents', 'diagnoses', 'document_id'),
+            ('contents', 'diagnoses', 'source_datarepo_row_ids'),
+            ('contents', 'donors', 'document_id'),
+            ('contents', 'donors', 'donor_id'),
+            ('contents', 'donors', 'source_datarepo_row_ids'),
             ('contents', 'files', 'version'),
         ]
 
 
@@ -23,13 +23,20 @@
 
 
 class ActivityAggregator(SimpleAggregator):
-    pass
+    """
+    Aggregator for activities that withholds an accumulator from the
+    `activity_id`, `document_id` and `source_datarepo_row_ids` fields.
+    """
+
+    def _accumulator(self, field: str) -> Accumulator | None:
+        # NOTE(review): presumably a `None` accumulator keeps the field out of
+        # the aggregate; confirm against `SimpleAggregator`.
+        if field in ('activity_id', 'document_id', 'source_datarepo_row_ids'):
+            return None
+        else:
+            # Every other field uses whatever the superclass selects.
+            return super()._accumulator(field)
 
 
 class BiosampleAggregator(SimpleAggregator):
 
     def _accumulator(self, field: str) -> Accumulator | None:
-        if field == 'donor_age_at_collection':
+        if field in ('biosample_id', 'document_id', 'source_datarepo_row_ids'):
+            return None
+        elif field == 'donor_age_at_collection':
             return SetOfDictAccumulator(max_size=100,
                                         key=compose_keys(none_safe_tuple_key(none_last=True),
                                                          itemgetter('lte', 'gte')))
@@ -38,13 +45,24 @@ def _accumulator(self, field: str) -> Accumulator | None:
 
 
 class DatasetAggregator(SimpleAggregator):
-    pass
+    """
+    Aggregator for datasets that withholds an accumulator from the
+    `source_datarepo_row_ids` field only.
+    """
+
+    def _accumulator(self, field: str) -> Accumulator | None:
+        # Aggregation of datasets.document_id is required for the creation of
+        # manifests, so that field is deliberately left to the superclass, like
+        # every field other than the row IDs.
+        if field == 'source_datarepo_row_ids':
+            return None
+        else:
+            return super()._accumulator(field)
 
 
 class DiagnosisAggregator(SimpleAggregator):
 
     def _accumulator(self, field: str) -> Accumulator | None:
-        if field in ('diagnosis_age', 'onset_age'):
+        if field in ('diagnosis_id', 'document_id', 'source_datarepo_row_ids'):
+            return None
+        elif field in ('diagnosis_age', 'onset_age'):
             return SetOfDictAccumulator(max_size=100,
                                         key=compose_keys(none_safe_tuple_key(none_last=True),
                                                          itemgetter('lte', 'gte')))
@@ -53,7 +71,12 @@ def _accumulator(self, field: str) -> Accumulator | None:
 
 
 class DonorAggregator(SimpleAggregator):
-    pass
+    """
+    Aggregator for donors that withholds an accumulator from the
+    `document_id`, `donor_id` and `source_datarepo_row_ids` fields.
+    """
+
+    def _accumulator(self, field: str) -> Accumulator | None:
+        # NOTE(review): presumably a `None` accumulator keeps the field out of
+        # the aggregate; confirm against `SimpleAggregator`.
+        if field in ('document_id', 'donor_id', 'source_datarepo_row_ids'):
+            return None
+        else:
+            # Every other field uses whatever the superclass selects.
+            return super()._accumulator(field)
 
 
 class FileAggregator(GroupingAggregator):
@@ -72,7 +95,17 @@ def _group_keys(self, entity) -> tuple[Any, ...]:
         return entity['file_format'],
 
     def _accumulator(self, field: str) -> Accumulator | None:
-        if field in ('count', 'file_size'):
+        # Fields that identify or locate an individual file get no accumulator.
+        # NOTE(review): presumably because a grouped aggregate spans many files
+        # and these per-file values would not be meaningful; confirm.
+        if field in (
+            'document_id',
+            'drs_uri',
+            'file_id',
+            'file_md5sum',
+            'file_name',
+            'source_datarepo_row_ids',
+            'uuid',
+        ):
+            return None
+        elif field in ('count', 'file_size'):
+            # Counts and sizes are summed through a `DistinctAccumulator`.
             return DistinctAccumulator(SumAccumulator())
         else:
             return super()._accumulator(field)
@@ -387,14 +387,10 @@ def manifest_config(self) -> ManifestConfig:
                 'file_mirror_uri': 'file_mirror_uri',
             },
             ('contents', 'cell_suspensions'): {
-                'document_id': 'cell_suspension.provenance.document_id',
-                'biomaterial_id': 'cell_suspension.biomaterial_core.biomaterial_id',
                 'total_estimated_cells': 'cell_suspension.estimated_cell_count',
                 'selected_cell_type': 'cell_suspension.selected_cell_type'
             },
-            ('contents', 'sequencing_processes'): {
-                'document_id': 'sequencing_process.provenance.document_id'
-            },
+            ('contents', 'sequencing_processes'): {},
             ('contents', 'sequencing_protocols'): {
                 'instrument_manufacturer_model': 'sequencing_protocol.instrument_manufacturer_model',
                 'paired_end': 'sequencing_protocol.paired_end'
@@ -420,7 +416,6 @@ def manifest_config(self) -> ManifestConfig:
             },
             ('contents', 'donors'): {
                 'biological_sex': 'donor_organism.sex',
-                'biomaterial_id': 'donor_organism.biomaterial_core.biomaterial_id',
                 'document_id': 'donor_organism.provenance.document_id',
                 'genus_species': 'donor_organism.genus_species',
                 'development_stage': 'donor_organism.development_stage',
@@ -439,12 +434,8 @@ def manifest_config(self) -> ManifestConfig:
             },
             ('contents', 'samples'): {
                 'entity_type': '_entity_type',
-                'document_id': 'sample.provenance.document_id',
-                'biomaterial_id': 'sample.biomaterial_core.biomaterial_id'
             },
             ('contents', 'sequencing_inputs'): {
-                'document_id': 'sequencing_input.provenance.document_id',
-                'biomaterial_id': 'sequencing_input.biomaterial_core.biomaterial_id',
                 'sequencing_input_type': 'sequencing_input_type'
             }
         }
 
@@ -116,11 +116,32 @@ def _default_accumulator(self) -> Accumulator | None:
 
 
 class SampleAggregator(SimpleAggregator):
-    pass
+    """
+    Aggregator for samples that withholds an accumulator from the
+    `document_id` field only.
+    """
+
+    def _accumulator(self, field) -> Accumulator | None:
+        # TODO: remove sampleId filter
+        # Aggregation of samples.biomaterial_id is required for filters
+        # using the `sampleId` field on non-sample endpoints, so that field is
+        # deliberately left to the superclass, like every field other than
+        # `document_id`.
+        if field == 'document_id':
+            return None
+        else:
+            return super()._accumulator(field)
 
 
 class SpecimenAggregator(SimpleAggregator):
-    pass
+    """
+    Aggregator for specimens that withholds an accumulator from the
+    `biomaterial_id` field only.
+    """
+
+    def _accumulator(self, field) -> Accumulator | None:
+        # TODO: use `if` and comment why (high cardinality, only 1 for samples)
+        # if self.outer_entity_type == samples
+        # Aggregation of `document_id` is required for the summary response
+        # field `specimenCount` as it is calculated from the `samples`
+        # aggregate, so that field is deliberately left to the superclass, like
+        # every field other than `biomaterial_id`.
+        if field == 'biomaterial_id':
+            return None
+        else:
+            return super()._accumulator(field)
 
 
 class CellSuspensionAggregator(GroupingAggregator):
@@ -143,14 +164,21 @@ def _group_keys(self, entity) -> tuple[Any, ...]:
         return frozenset(entity['organ']),
 
     def _accumulator(self, field) -> Accumulator | None:
-        if field in self.cell_count_fields:
+        # The two ID fields get no accumulator.
+        if field in ('document_id', 'biomaterial_id'):
+            return None
+        elif field in self.cell_count_fields:
+            # Cell counts are summed through a `DistinctAccumulator`.
             return DistinctAccumulator(SumAccumulator())
         else:
             return super()._accumulator(field)
 
 
 class CellLineAggregator(SimpleAggregator):
-    pass
+    """
+    Aggregator for cell lines that withholds an accumulator from the
+    `document_id` and `biomaterial_id` fields.
+    """
+
+    def _accumulator(self, field) -> Accumulator | None:
+        # NOTE(review): presumably a `None` accumulator keeps the field out of
+        # the aggregate; confirm against `SimpleAggregator`.
+        if field in ('document_id', 'biomaterial_id'):
+            return None
+        else:
+            # Every other field uses whatever the superclass selects.
+            return super()._accumulator(field)
 
 
 class DonorOrganismAggregator(SimpleAggregator):
@@ -162,34 +190,39 @@ def _transform_entity(self, entity: JSON) -> JSON:
         }
 
     def _accumulator(self, field) -> Accumulator | None:
-        if field == 'organism_age_range':
-            return SetAccumulator(max_size=100)
+        # Aggregation of donors.document_id is required for the summary response
+        # field `donorCount` which is calculated from the `samples` aggregate.
+        # That field therefore needs no branch of its own: it reaches the
+        # superclass through the final `else`, and only `biomaterial_id` is
+        # denied an accumulator.
+        #
+        # NOTE(review): the removed branch below pinned `document_id` to
+        # `SetAccumulator(max_size=100)` for the verbatim handover; confirm that
+        # the superclass's choice is an adequate replacement.
+        if field == 'biomaterial_id':
+            return None
+        elif field in ('development_stage', 'organism_age_range'):
+            # Both fields are accumulated identically.
+            return SetAccumulator(max_size=200)
         elif field == 'organism_age':
-            return SetOfDictAccumulator(max_size=100,
+            return SetOfDictAccumulator(max_size=200,
                                         key=compose_keys(none_safe_tuple_key(none_last=True),
                                                          none_safe_itemgetter('value', 'unit')))
         elif field == 'donor_count':
             return UniqueValueCountAccumulator()
-        elif field == 'document_id':
-            # If any donor IDs are missing from the aggregate, those donors will
-            # be omitted during the verbatim handover. Donors are a "hot" entity
-            # type, and we can't track their hubs in replica documents, so we
-            # rely on the inner entity IDs instead.
-            #
-            # FIXME: Enforce that hot entity types are completely aggregated
-            #        https://github.com/DataBiosphere/azul/issues/6793
-            return SetAccumulator(max_size=100)
         else:
             return super()._accumulator(field)
 
 
 class OrganoidAggregator(SimpleAggregator):
-    pass
 
+    def _accumulator(self, field) -> Accumulator | None:
+        # The two ID fields get no accumulator; every other field uses
+        # whatever the superclass selects.
+        if field in ('document_id', 'biomaterial_id'):
+            return None
+        else:
+            return super()._accumulator(field)
+
 
 class ProjectAggregator(SimpleAggregator):
 
     def _accumulator(self, field) -> Accumulator | None:
+        # Aggregation of projects.document_id is required to allow filters using
+        # the `projectId` field on non-project endpoints.
         if field == 'document_id':
             return SetAccumulator(max_size=100)
         elif field in ('project_description',
@@ -212,17 +245,10 @@ def _accumulator(self, field) -> Accumulator | None:
 class ProtocolAggregator(SimpleAggregator):
 
     def _accumulator(self, field) -> Accumulator | None:
-        if field == 'assay_type':
+        # NOTE(review): `document_id` used to be aggregated here for the verbatim
+        # handover (see the removed branch below); confirm that the handover no
+        # longer relies on protocol IDs being present in the aggregate.
+        if field in ('document_id', 'biomaterial_id'):
+            return None
+        elif field == 'assay_type':
             return FrequencySetAccumulator(max_size=100)
-        elif field == 'document_id':
-            # If any protocol IDs are missing from the aggregate, those
-            # protocols may be omitted during the verbatim handover. Some
-            # protocols are "hot" entity types, and we can't track their hubs in
-            # replicas, so we rely on the inner entity IDs instead.
-            #
-            # FIXME: Enforce that hot entity types are completely aggregated
-            #        https://github.com/DataBiosphere/azul/issues/6793
-            return SetAccumulator(max_size=100)
         else:
             return super()._accumulator(field)
 
@@ -231,11 +257,22 @@ def _default_accumulator(self) -> Accumulator | None:
 
 
 class SequencingInputAggregator(SimpleAggregator):
-    pass
+    """
+    Aggregator for sequencing inputs that withholds an accumulator from the
+    `document_id` and `biomaterial_id` fields.
+    """
+
+    def _accumulator(self, field) -> Accumulator | None:
+        # NOTE(review): presumably a `None` accumulator keeps the field out of
+        # the aggregate; confirm against `SimpleAggregator`.
+        if field in ('document_id', 'biomaterial_id'):
+            return None
+        else:
+            # Every other field uses whatever the superclass selects.
+            return super()._accumulator(field)
 
 
 class SequencingProcessAggregator(SimpleAggregator):
 
+    def _accumulator(self, field) -> Accumulator | None:
+        # The two ID fields get no accumulator; every other field uses
+        # whatever the superclass selects.
+        if field in ('document_id', 'biomaterial_id'):
+            return None
+        else:
+            return super()._accumulator(field)
+
     def _default_accumulator(self) -> Accumulator | None:
         return SetAccumulator(max_size=10)
 
@@ -246,15 +283,15 @@ def _accumulator(self, field) -> Accumulator | None:
         if field == 'document_id':
             return None
         elif field == 'file':
-            return DictAccumulator(max_size=100, key=itemgetter('uuid'))
+            return DictAccumulator(max_size=500, key=itemgetter('uuid'))
         else:
             return SetAccumulator()
 
 
 class DateAggregator(SimpleAggregator):
 
     def _accumulator(self, field) -> Accumulator | None:
-        if field == 'document_id':
+        if field in ('document_id', 'biomaterial_id'):
             return None
         elif field in ('submission_date', 'aggregate_submission_date'):
             return MinAccumulator()
 
@@ -362,6 +362,7 @@ def make_projects(self, entry) -> MutableJSONs:
                 'duosId': project.get('duos_id')
             }
             if self.entity_type == 'projects':
+                # translated_project['projectId'] = project['document_id']
                 translated_project['projectDescription'] = project.get('project_description', [])
                 contributors = project.get('contributors', [])  # list of dict
                 translated_project['contributors'] = contributors
@@ -422,7 +423,7 @@ def make_file(self, file: JSON) -> JSON:
 
     def make_specimen(self, specimen) -> MutableJSON:
         return {
-            'id': specimen['biomaterial_id'],
+            # 'id': specimen['biomaterial_id'],
             'organ': specimen.get('organ', None),
             'organPart': specimen.get('organ_part', None),
             'disease': specimen.get('disease', None),
@@ -452,7 +453,7 @@ def make_cell_suspensions(self, entry) -> MutableJSONs:
 
     def make_cell_line(self, cell_line) -> MutableJSON:
+        # Translate one cell line into its response form. No 'id' is emitted;
+        # each remaining property defaults to None when absent.
         return {
-            'id': cell_line['biomaterial_id'],
             'cellLineType': cell_line.get('cell_line_type', None),
             'modelOrgan': cell_line.get('model_organ', None),
         }
@@ -462,7 +463,7 @@ def make_cell_lines(self, entry) -> MutableJSONs:
 
     def make_donor(self, donor) -> MutableJSON:
         return {
-            'id': donor['biomaterial_id'],
+            # 'id': donor['biomaterial_id'],
             'donorCount': donor.get('donor_count', None),
             'developmentStage': donor.get('development_stage', None),
             'genusSpecies': donor.get('genus_species', None),
@@ -477,7 +478,7 @@ def make_donors(self, entry) -> MutableJSONs:
 
     def make_organoid(self, organoid) -> MutableJSON:
+        # Translate one organoid into its response form. No 'id' is emitted;
+        # each remaining property defaults to None when absent.
         return {
-            'id': organoid['biomaterial_id'],
             'modelOrgan': organoid.get('model_organ', None),
             'modelOrganPart': organoid.get('model_organ_part', None)
         }
@@ -486,7 +487,7 @@ def make_organoids(self, entry) -> MutableJSONs:
         return [self.make_organoid(organoid) for organoid in entry['contents']['organoids']]
 
     def make_sample(self, sample, entity_dict, entity_type) -> MutableJSON:
-        is_aggregate = isinstance(sample['document_id'], list)
+        is_aggregate = 'document_id' not in sample
         organ_prop = 'organ' if entity_type == 'specimens' else 'model_organ'
         return {
             'sampleEntityType': [entity_type] if is_aggregate else entity_type,