DataBiosphere · dsotirho-ucsc · Jan 5, 2026 · Jan 5, 2026 · Jan 5, 2026 · Jan 6, 2026
@@ -121,7 +121,7 @@
         # changes and reset the minor version to zero. Otherwise, increment only
         # the minor version for backwards compatible changes. A backwards
         # compatible change is one that does not require updates to clients.
-        'version': '15.1',
+        'version': '16.0',
         'description': fd(f'''
             # Overview
 

@@ -13,6 +13,7 @@
     Accumulator,
     DistinctAccumulator,
     GroupingAggregator,
+    SetAccumulator,
     SetOfDictAccumulator,
     SimpleAggregator,
     SumAccumulator,
@@ -23,13 +24,28 @@
 
 
 class ActivityAggregator(SimpleAggregator):
-    pass
+
+    def _accumulator(self, field: str) -> Accumulator | None:
+        if field in ('activity_id', 'document_id', 'source_datarepo_row_ids'):
+            # Added to the files aggregate in order to be included in manifests
+            if self.outer_entity_type == 'files':
+                return super()._accumulator(field)
+            else:
+                return None
+        else:
+            return super()._accumulator(field)
-        if field in ('activity_id', 'document_id', 'source_datarepo_row_ids'):
-            # Added to the files aggregate in order to be included in manifests
-            if self.outer_entity_type == 'files':
-                return super()._accumulator(field)
-            else:
-                return None
-        else:
-            return super()._accumulator(field)
+        # Added to the files aggregate in order to be included in manifests
+        if field in ('activity_id', 'document_id', 'source_datarepo_row_ids') and self.outer_entity_type != 'files':
+            return None
+        else:
+            return super()._accumulator(field)
 
 
 class BiosampleAggregator(SimpleAggregator):
 
     def _accumulator(self, field: str) -> Accumulator | None:
-        if field == 'donor_age_at_collection':
+        if field in ('biosample_id', 'document_id', 'source_datarepo_row_ids'):
+            # Added to the files aggregate in order to be included in manifests
+            if self.outer_entity_type == 'files':
+                return super()._accumulator(field)
+            else:
+                return None
+        elif field == 'donor_age_at_collection':
             return SetOfDictAccumulator(max_size=100,
                                         key=compose_keys(none_safe_tuple_key(none_last=True),
                                                          itemgetter('lte', 'gte')))
@@ -38,13 +54,33 @@ def _accumulator(self, field: str) -> Accumulator | None:
 
 
 class DatasetAggregator(SimpleAggregator):
-    pass
+
+    def _accumulator(self, field: str) -> Accumulator | None:
+        # Aggregating dataset.document_id is required for creating manifests
+        if field == 'document_id':
+            return super()._accumulator(field)
+        elif field == 'source_datarepo_row_ids':
+            # Accumulate only in the files aggregate, for use in manifests
+            if self.outer_entity_type == 'files':
+                return super()._accumulator(field)
+            else:
+                return None
+        else:
+            return super()._accumulator(field)
 
 
 class DiagnosisAggregator(SimpleAggregator):
 
     def _accumulator(self, field: str) -> Accumulator | None:
-        if field in ('diagnosis_age', 'onset_age'):
+        if field in ('diagnosis_id', 'document_id', 'source_datarepo_row_ids'):
+            # Added to the files aggregate in order to be included in manifests
+            if self.outer_entity_type == 'files':
+                return super()._accumulator(field)
+            else:
+                return None
+        elif field == 'disease':
+            return SetAccumulator(max_size=14100)
+        elif field in ('diagnosis_age', 'onset_age'):
             return SetOfDictAccumulator(max_size=100,
                                         key=compose_keys(none_safe_tuple_key(none_last=True),
                                                          itemgetter('lte', 'gte')))
@@ -53,7 +89,16 @@ def _accumulator(self, field: str) -> Accumulator | None:
 
 
 class DonorAggregator(SimpleAggregator):
-    pass
+
+    def _accumulator(self, field: str) -> Accumulator | None:
+        if field in ('document_id', 'donor_id', 'source_datarepo_row_ids'):
+            # Added to the files aggregate in order to be included in manifests
-            # Added to the files aggregate in order to be included in manifests
+            # Add to the files aggregate so that it can be included in manifests
+            if self.outer_entity_type == 'files':
+                return super()._accumulator(field)
+            else:
+                return None
+        else:
+            return super()._accumulator(field)
 
 
 class FileAggregator(GroupingAggregator):
@@ -72,7 +117,17 @@ def _group_keys(self, entity) -> tuple[Any, ...]:
         return entity['file_format'],
 
     def _accumulator(self, field: str) -> Accumulator | None:
-        if field in ('count', 'file_size'):
+        if field in (
+                'document_id',
+                'drs_uri',
+                'file_id',
+                'file_md5sum',
+                'file_name',
+                'source_datarepo_row_ids',
+                'version',
+        ):
+            return None
+        elif field in ('count', 'file_size'):
             return DistinctAccumulator(SumAccumulator())
         else:
             return super()._accumulator(field)
@@ -183,7 +183,7 @@ def exposed_indices(self) -> dict[EntityType, Sorting]:
             files=Sorting(field_name='fileName'),
             projects=Sorting(field_name='projectTitle',
                              max_page_size=75),
-            samples=Sorting(field_name='sampleId')
+            samples=Sorting(field_name='entryId')
         )
 
     @property
@@ -276,7 +276,6 @@ def _field_mapping(self) -> InverseFieldMapping:
                     'donor_count': 'donorCount'
                 },
                 'samples': {
-                    'biomaterial_id': 'sampleId',
                     'entity_type': 'sampleEntityType',
                     'organ': 'organ',
                     'organ_part': 'organPart',

@@ -116,11 +116,39 @@ def _default_accumulator(self) -> Accumulator | None:
 
 
 class SampleAggregator(SimpleAggregator):
-    pass
+
+    def _accumulator(self, field) -> Accumulator | None:
+        if field in ('biomaterial_id', 'document_id'):
+            # Accumulate only in the files aggregate, for use in manifests
+            if self.outer_entity_type == 'files':
+                return super()._accumulator(field)
+            else:
+                return None
+        else:
+            return super()._accumulator(field)
 
 
 class SpecimenAggregator(SimpleAggregator):
-    pass
+
+    def _accumulator(self, field) -> Accumulator | None:
+        if field == 'biomaterial_id':
+            # Accumulate only in the files aggregate, for use in manifests
+            if self.outer_entity_type == 'files':
+                return super()._accumulator(field)
+            else:
+                return None
+        elif field == 'document_id':
+            # Accumulate in the files aggregate, for use in manifests, and in
+            # the samples aggregate, for the calculation of the summary
+            # response field `specimenCount`. The latter is okay since there
+            # should only be one specimen inner entity in any samples outer
+            # entity.
+            if self.outer_entity_type in ('samples', 'files'):
+                return super()._accumulator(field)
+            else:
+                return None
+        else:
+            return super()._accumulator(field)
 
 
 class CellSuspensionAggregator(GroupingAggregator):
@@ -143,14 +171,29 @@ def _group_keys(self, entity) -> tuple[Any, ...]:
         return frozenset(entity['organ']),
 
     def _accumulator(self, field) -> Accumulator | None:
-        if field in self.cell_count_fields:
+        if field in ('biomaterial_id', 'document_id'):
+            # Accumulate only in the files aggregate, for use in manifests
+            if self.outer_entity_type == 'files':
+                return super()._accumulator(field)
+            else:
+                return None
+        elif field in self.cell_count_fields:
             return DistinctAccumulator(SumAccumulator())
         else:
             return super()._accumulator(field)
 
 
 class CellLineAggregator(SimpleAggregator):
-    pass
+
+    def _accumulator(self, field) -> Accumulator | None:
+        if field in ('biomaterial_id', 'document_id'):
+            # Accumulate only in the files aggregate, for use in manifests
+            if self.outer_entity_type == 'files':
+                return super()._accumulator(field)
+            else:
+                return None
+        else:
+            return super()._accumulator(field)
 
 
 class DonorOrganismAggregator(SimpleAggregator):
@@ -162,14 +205,12 @@ def _transform_entity(self, entity: JSON) -> JSON:
         }
 
     def _accumulator(self, field) -> Accumulator | None:
-        if field == 'organism_age_range':
-            return SetAccumulator(max_size=100)
-        elif field == 'organism_age':
-            return SetOfDictAccumulator(max_size=100,
-                                        key=compose_keys(none_safe_tuple_key(none_last=True),
-                                                         none_safe_itemgetter('value', 'unit')))
-        elif field == 'donor_count':
-            return UniqueValueCountAccumulator()
+        if field == 'biomaterial_id':
+            # Added to the files aggregate in order to be included in manifests
+            if self.outer_entity_type == 'files':
+                return super()._accumulator(field)
+            else:
+                return None
         elif field == 'document_id':
             # If any donor IDs are missing from the aggregate, those donors will
             # be omitted during the verbatim handover. Donors are a "hot" entity
@@ -179,17 +220,38 @@ def _accumulator(self, field) -> Accumulator | None:
             # FIXME: Enforce that hot entity types are completely aggregated
             #        https://github.com/DataBiosphere/azul/issues/6793
             return SetAccumulator(max_size=100)
+        elif field == 'development_stage':
+            return SetAccumulator(max_size=200)
+        elif field == 'organism_age_range':
+            return SetAccumulator(max_size=200)
+        elif field == 'organism_age':
+            return SetOfDictAccumulator(max_size=200,
+                                        key=compose_keys(none_safe_tuple_key(none_last=True),
+                                                         none_safe_itemgetter('value', 'unit')))
+        elif field == 'donor_count':
+            return UniqueValueCountAccumulator()
         else:
             return super()._accumulator(field)
 
 
 class OrganoidAggregator(SimpleAggregator):
-    pass
+
+    def _accumulator(self, field) -> Accumulator | None:
+        if field in ('biomaterial_id', 'document_id'):
+            # Accumulate only in the files aggregate, for use in manifests
+            if self.outer_entity_type == 'files':
+                return super()._accumulator(field)
+            else:
+                return None
+        else:
+            return super()._accumulator(field)
 
 
 class ProjectAggregator(SimpleAggregator):
 
     def _accumulator(self, field) -> Accumulator | None:
+        # Aggregation of `document_id` is required to allow filters using
+        # the `projectId` field on non-project endpoints.
         if field == 'document_id':
             return SetAccumulator(max_size=100)
         elif field in ('project_description',
@@ -212,9 +274,7 @@ def _accumulator(self, field) -> Accumulator | None:
 class ProtocolAggregator(SimpleAggregator):
 
     def _accumulator(self, field) -> Accumulator | None:
-        if field == 'assay_type':
-            return FrequencySetAccumulator(max_size=100)
-        elif field == 'document_id':
+        if field == 'document_id':
             # If any protocol IDs are missing from the aggregate, those
             # protocols may be omitted during the verbatim handover. Some
             # protocols are "hot" entity types, and we can't track their hubs in
@@ -223,6 +283,8 @@ def _accumulator(self, field) -> Accumulator | None:
             # FIXME: Enforce that hot entity types are completely aggregated
             #        https://github.com/DataBiosphere/azul/issues/6793
             return SetAccumulator(max_size=100)
+        elif field == 'assay_type':
+            return FrequencySetAccumulator(max_size=100)
         else:
             return super()._accumulator(field)
 
@@ -231,11 +293,30 @@ def _default_accumulator(self) -> Accumulator | None:
 
 
 class SequencingInputAggregator(SimpleAggregator):
-    pass
+
+    def _accumulator(self, field) -> Accumulator | None:
+        if field in ('biomaterial_id', 'document_id'):
+            # Accumulate only in the files aggregate, for use in manifests
+            if self.outer_entity_type == 'files':
+                return super()._accumulator(field)
+            else:
+                return None
+        else:
+            return super()._accumulator(field)
 
 
 class SequencingProcessAggregator(SimpleAggregator):
 
+    def _accumulator(self, field) -> Accumulator | None:
+        if field == 'document_id':
+            # Accumulate only in the files aggregate, for use in manifests
+            if self.outer_entity_type == 'files':
+                return super()._accumulator(field)
+            else:
+                return None
+        else:
+            return super()._accumulator(field)
+
     def _default_accumulator(self) -> Accumulator | None:
         return SetAccumulator(max_size=10)
 
@@ -246,7 +327,7 @@ def _accumulator(self, field) -> Accumulator | None:
         if field == 'document_id':
             return None
         elif field == 'file':
-            return DictAccumulator(max_size=100, key=itemgetter('uuid'))
+            return DictAccumulator(max_size=600, key=itemgetter('uuid'))
         else:
             return SetAccumulator()
 

@@ -411,7 +411,6 @@ def make_file(self, file: JSON) -> JSON:
             'size': file.get('size'),
             'fileSource': file.get('file_source'),
             self.plugin.special_fields.file_uuid.name_in_hit: file.get('uuid'),
-            'version': file.get('version'),
             'matrixCellCount': file.get('matrix_cell_count'),
             'drs_uri': file.get('drs_uri'),
             'azul_url': self._file_url(uuid=json_str(file['uuid']),
@@ -422,7 +421,6 @@ def make_file(self, file: JSON) -> JSON:
 
     def make_specimen(self, specimen) -> MutableJSON:
         return {
-            'id': specimen['biomaterial_id'],
             'organ': specimen.get('organ', None),
             'organPart': specimen.get('organ_part', None),
             'disease': specimen.get('disease', None),
@@ -452,7 +450,6 @@ def make_cell_suspensions(self, entry) -> MutableJSONs:
 
     def make_cell_line(self, cell_line) -> MutableJSON:
         return {
-            'id': cell_line['biomaterial_id'],
             'cellLineType': cell_line.get('cell_line_type', None),
             'modelOrgan': cell_line.get('model_organ', None),
         }
@@ -462,7 +459,6 @@ def make_cell_lines(self, entry) -> MutableJSONs:
 
     def make_donor(self, donor) -> MutableJSON:
         return {
-            'id': donor['biomaterial_id'],
             'donorCount': donor.get('donor_count', None),
             'developmentStage': donor.get('development_stage', None),
             'genusSpecies': donor.get('genus_species', None),
@@ -477,7 +473,6 @@ def make_donors(self, entry) -> MutableJSONs:
 
     def make_organoid(self, organoid) -> MutableJSON:
         return {
-            'id': organoid['biomaterial_id'],
             'modelOrgan': organoid.get('model_organ', None),
             'modelOrganPart': organoid.get('model_organ_part', None)
         }
@@ -486,11 +481,12 @@ def make_organoids(self, entry) -> MutableJSONs:
         return [self.make_organoid(organoid) for organoid in entry['contents']['organoids']]
 
     def make_sample(self, sample, entity_dict, entity_type) -> MutableJSON:
-        is_aggregate = isinstance(sample['document_id'], list)
         organ_prop = 'organ' if entity_type == 'specimens' else 'model_organ'
+        effective_organ = sample[organ_prop]
+        is_aggregate = isinstance(effective_organ, list)
         return {
             'sampleEntityType': [entity_type] if is_aggregate else entity_type,
-            'effectiveOrgan': sample[organ_prop],
+            'effectiveOrgan': effective_organ,
             **entity_dict
         }