-
Notifications
You must be signed in to change notification settings - Fork 3
Fix: Accumulator overflow during aggregation (#6793) #7641
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
b4de0da
d220c03
4ba906f
0306ca3
4b2588e
d40a5d6
7499e44
338dd1a
cef1772
941b6f9
3d9af9b
e38d74c
bf94c2a
f0b15b3
219320b
6f757e2
233aeb4
eb454a3
f9a494b
18a3dc8
a53f8a3
17e9939
a08f757
346dfa7
bf334f8
c13cbab
926df11
acac60e
f7368bd
4077d82
06690ab
4fde1d3
11c58fa
38a6058
fb233bf
b1116ea
5d58065
6e39e8b
26bce55
a2bac0f
936def6
be80b78
c90ae5c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -13,6 +13,7 @@ | |||||||||||||||||||||||||||
| Accumulator, | ||||||||||||||||||||||||||||
| DistinctAccumulator, | ||||||||||||||||||||||||||||
| GroupingAggregator, | ||||||||||||||||||||||||||||
| SetAccumulator, | ||||||||||||||||||||||||||||
| SetOfDictAccumulator, | ||||||||||||||||||||||||||||
| SimpleAggregator, | ||||||||||||||||||||||||||||
| SumAccumulator, | ||||||||||||||||||||||||||||
|
|
@@ -23,13 +24,28 @@ | |||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||
| class ActivityAggregator(SimpleAggregator): | ||||||||||||||||||||||||||||
| pass | ||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||
| def _accumulator(self, field: str) -> Accumulator | None: | ||||||||||||||||||||||||||||
| if field in ('activity_id', 'document_id', 'source_datarepo_row_ids'): | ||||||||||||||||||||||||||||
| # Added to the files aggregate in order to be included in manifests | ||||||||||||||||||||||||||||
| if self.outer_entity_type == 'files': | ||||||||||||||||||||||||||||
| return super()._accumulator(field) | ||||||||||||||||||||||||||||
| else: | ||||||||||||||||||||||||||||
| return None | ||||||||||||||||||||||||||||
| else: | ||||||||||||||||||||||||||||
| return super()._accumulator(field) | ||||||||||||||||||||||||||||
|
Comment on lines
+29
to
+36
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nitpick: each of these could be rewritten as:
Suggested change
Which I personally think is cleaner since it reduces the number of return pathways and keeps a consistent, flat level of nesting. But it does cause issues with line length, and there's also an argument to be made that keeping the return statements separate is the more resilient approach. |
||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||
| class BiosampleAggregator(SimpleAggregator): | ||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||
| def _accumulator(self, field: str) -> Accumulator | None: | ||||||||||||||||||||||||||||
| if field == 'donor_age_at_collection': | ||||||||||||||||||||||||||||
| if field in ('biosample_id', 'document_id', 'source_datarepo_row_ids'): | ||||||||||||||||||||||||||||
| # Added to the files aggregate in order to be included in manifests | ||||||||||||||||||||||||||||
| if self.outer_entity_type == 'files': | ||||||||||||||||||||||||||||
| return super()._accumulator(field) | ||||||||||||||||||||||||||||
| else: | ||||||||||||||||||||||||||||
| return None | ||||||||||||||||||||||||||||
| elif field == 'donor_age_at_collection': | ||||||||||||||||||||||||||||
| return SetOfDictAccumulator(max_size=100, | ||||||||||||||||||||||||||||
| key=compose_keys(none_safe_tuple_key(none_last=True), | ||||||||||||||||||||||||||||
| itemgetter('lte', 'gte'))) | ||||||||||||||||||||||||||||
|
|
@@ -38,13 +54,33 @@ def _accumulator(self, field: str) -> Accumulator | None: | |||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||
| class DatasetAggregator(SimpleAggregator): | ||||||||||||||||||||||||||||
| pass | ||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||
| def _accumulator(self, field: str) -> Accumulator | None: | ||||||||||||||||||||||||||||
| # dataset.document_id aggregation is required for creating of manifests | ||||||||||||||||||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Either "creating manifests" or "creation of manifests", not "creating of".
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Also, it would be useful to record why the field is required, i.e., what the symptom of failure would be if it were to be removed. |
||||||||||||||||||||||||||||
| if field == 'document_id': | ||||||||||||||||||||||||||||
| return super()._accumulator(field) | ||||||||||||||||||||||||||||
| elif field == 'source_datarepo_row_ids': | ||||||||||||||||||||||||||||
| # Added to the files aggregate in order to be included in manifests | ||||||||||||||||||||||||||||
| if self.outer_entity_type == 'files': | ||||||||||||||||||||||||||||
| return super()._accumulator(field) | ||||||||||||||||||||||||||||
| else: | ||||||||||||||||||||||||||||
| return None | ||||||||||||||||||||||||||||
| else: | ||||||||||||||||||||||||||||
| return super()._accumulator(field) | ||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||
| class DiagnosisAggregator(SimpleAggregator): | ||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||
| def _accumulator(self, field: str) -> Accumulator | None: | ||||||||||||||||||||||||||||
| if field in ('diagnosis_age', 'onset_age'): | ||||||||||||||||||||||||||||
| if field in ('diagnosis_id', 'document_id', 'source_datarepo_row_ids'): | ||||||||||||||||||||||||||||
| # Added to the files aggregate in order to be included in manifests | ||||||||||||||||||||||||||||
| if self.outer_entity_type == 'files': | ||||||||||||||||||||||||||||
| return super()._accumulator(field) | ||||||||||||||||||||||||||||
| else: | ||||||||||||||||||||||||||||
| return None | ||||||||||||||||||||||||||||
| elif field == 'disease': | ||||||||||||||||||||||||||||
| return SetAccumulator(max_size=14100) | ||||||||||||||||||||||||||||
| elif field in ('diagnosis_age', 'onset_age'): | ||||||||||||||||||||||||||||
| return SetOfDictAccumulator(max_size=100, | ||||||||||||||||||||||||||||
| key=compose_keys(none_safe_tuple_key(none_last=True), | ||||||||||||||||||||||||||||
| itemgetter('lte', 'gte'))) | ||||||||||||||||||||||||||||
|
|
@@ -53,7 +89,16 @@ def _accumulator(self, field: str) -> Accumulator | None: | |||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||
| class DonorAggregator(SimpleAggregator): | ||||||||||||||||||||||||||||
| pass | ||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||
| def _accumulator(self, field: str) -> Accumulator | None: | ||||||||||||||||||||||||||||
| if field in ('document_id', 'donor_id', 'source_datarepo_row_ids'): | ||||||||||||||||||||||||||||
| # Added to the files aggregate in order to be included in manifests | ||||||||||||||||||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Grammar nit
Suggested change
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Adding fixup commits for this would have nearly doubled the count of commits, so I've squashed this change into the existing commits. |
||||||||||||||||||||||||||||
| if self.outer_entity_type == 'files': | ||||||||||||||||||||||||||||
| return super()._accumulator(field) | ||||||||||||||||||||||||||||
| else: | ||||||||||||||||||||||||||||
| return None | ||||||||||||||||||||||||||||
| else: | ||||||||||||||||||||||||||||
| return super()._accumulator(field) | ||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||
| class FileAggregator(GroupingAggregator): | ||||||||||||||||||||||||||||
|
|
@@ -72,7 +117,17 @@ def _group_keys(self, entity) -> tuple[Any, ...]: | |||||||||||||||||||||||||||
| return entity['file_format'], | ||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||
| def _accumulator(self, field: str) -> Accumulator | None: | ||||||||||||||||||||||||||||
| if field in ('count', 'file_size'): | ||||||||||||||||||||||||||||
| if field in ( | ||||||||||||||||||||||||||||
| 'document_id', | ||||||||||||||||||||||||||||
| 'drs_uri', | ||||||||||||||||||||||||||||
| 'file_id', | ||||||||||||||||||||||||||||
| 'file_md5sum', | ||||||||||||||||||||||||||||
| 'file_name', | ||||||||||||||||||||||||||||
| 'source_datarepo_row_ids', | ||||||||||||||||||||||||||||
| 'version', | ||||||||||||||||||||||||||||
| ): | ||||||||||||||||||||||||||||
| return None | ||||||||||||||||||||||||||||
| elif field in ('count', 'file_size'): | ||||||||||||||||||||||||||||
| return DistinctAccumulator(SumAccumulator()) | ||||||||||||||||||||||||||||
| else: | ||||||||||||||||||||||||||||
| return super()._accumulator(field) | ||||||||||||||||||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -116,11 +116,39 @@ def _default_accumulator(self) -> Accumulator | None: | |
|
|
||
|
|
||
| class SampleAggregator(SimpleAggregator): | ||
| pass | ||
|
|
||
| def _accumulator(self, field) -> Accumulator | None: | ||
| if field in ('biomaterial_id', 'document_id'): | ||
| # Added to the files aggregate in order to be included in manifests | ||
| if self.outer_entity_type == 'files': | ||
| return super()._accumulator(field) | ||
| else: | ||
| return None | ||
| else: | ||
| return super()._accumulator(field) | ||
|
|
||
|
|
||
| class SpecimenAggregator(SimpleAggregator): | ||
| pass | ||
|
|
||
| def _accumulator(self, field) -> Accumulator | None: | ||
| if field == 'biomaterial_id': | ||
| # Added to the files aggregate in order to be included in manifests | ||
| if self.outer_entity_type == 'files': | ||
| return super()._accumulator(field) | ||
| else: | ||
| return None | ||
| elif field == 'document_id': | ||
| # Added to the files aggregate in order to be included in manifests. | ||
| # It is also added to the samples aggregate for the calculation of | ||
| # the summary response field `specimenCount`, which is okay since | ||
| # there should only be one specimen inner entity in any samples | ||
| # outer entity. | ||
| if self.outer_entity_type in ('samples', 'files'): | ||
| return super()._accumulator(field) | ||
| else: | ||
| return None | ||
| else: | ||
| return super()._accumulator(field) | ||
|
|
||
|
|
||
| class CellSuspensionAggregator(GroupingAggregator): | ||
|
|
@@ -143,14 +171,29 @@ def _group_keys(self, entity) -> tuple[Any, ...]: | |
| return frozenset(entity['organ']), | ||
|
|
||
| def _accumulator(self, field) -> Accumulator | None: | ||
| if field in self.cell_count_fields: | ||
| if field in ('biomaterial_id', 'document_id'): | ||
| # Added to the files aggregate in order to be included in manifests | ||
| if self.outer_entity_type == 'files': | ||
| return super()._accumulator(field) | ||
| else: | ||
| return None | ||
| elif field in self.cell_count_fields: | ||
| return DistinctAccumulator(SumAccumulator()) | ||
| else: | ||
| return super()._accumulator(field) | ||
|
|
||
|
|
||
| class CellLineAggregator(SimpleAggregator): | ||
| pass | ||
|
|
||
| def _accumulator(self, field) -> Accumulator | None: | ||
| if field in ('biomaterial_id', 'document_id'): | ||
| # Added to the files aggregate in order to be included in manifests | ||
| if self.outer_entity_type == 'files': | ||
| return super()._accumulator(field) | ||
| else: | ||
| return None | ||
| else: | ||
| return super()._accumulator(field) | ||
|
|
||
|
|
||
| class DonorOrganismAggregator(SimpleAggregator): | ||
|
|
@@ -162,14 +205,12 @@ def _transform_entity(self, entity: JSON) -> JSON: | |
| } | ||
|
|
||
| def _accumulator(self, field) -> Accumulator | None: | ||
| if field == 'organism_age_range': | ||
| return SetAccumulator(max_size=100) | ||
| elif field == 'organism_age': | ||
| return SetOfDictAccumulator(max_size=100, | ||
| key=compose_keys(none_safe_tuple_key(none_last=True), | ||
| none_safe_itemgetter('value', 'unit'))) | ||
| elif field == 'donor_count': | ||
| return UniqueValueCountAccumulator() | ||
| if field == 'biomaterial_id': | ||
| # Added to the files aggregate in order to be included in manifests | ||
| if self.outer_entity_type == 'files': | ||
| return super()._accumulator(field) | ||
| else: | ||
| return None | ||
| elif field == 'document_id': | ||
| # If any donor IDs are missing from the aggregate, those donors will | ||
| # be omitted during the verbatim handover. Donors are a "hot" entity | ||
|
|
@@ -179,17 +220,38 @@ def _accumulator(self, field) -> Accumulator | None: | |
| # FIXME: Enforce that hot entity types are completely aggregated | ||
| # https://github.com/DataBiosphere/azul/issues/6793 | ||
| return SetAccumulator(max_size=100) | ||
| elif field == 'development_stage': | ||
| return SetAccumulator(max_size=200) | ||
| elif field == 'organism_age_range': | ||
| return SetAccumulator(max_size=200) | ||
| elif field == 'organism_age': | ||
|
Comment on lines
+224
to
+227
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Was it intentional to keep these separate?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No. I've combined them in the fixup commit. |
||
| return SetOfDictAccumulator(max_size=200, | ||
| key=compose_keys(none_safe_tuple_key(none_last=True), | ||
| none_safe_itemgetter('value', 'unit'))) | ||
| elif field == 'donor_count': | ||
| return UniqueValueCountAccumulator() | ||
| else: | ||
| return super()._accumulator(field) | ||
|
|
||
|
|
||
| class OrganoidAggregator(SimpleAggregator): | ||
| pass | ||
|
|
||
| def _accumulator(self, field) -> Accumulator | None: | ||
| if field in ('biomaterial_id', 'document_id'): | ||
| # Added to the files aggregate in order to be included in manifests | ||
| if self.outer_entity_type == 'files': | ||
| return super()._accumulator(field) | ||
| else: | ||
| return None | ||
| else: | ||
| return super()._accumulator(field) | ||
|
|
||
|
|
||
| class ProjectAggregator(SimpleAggregator): | ||
|
|
||
| def _accumulator(self, field) -> Accumulator | None: | ||
| # Aggregation of `document_id` is required to allow filters using | ||
| # the `projectId` field on non-project endpoints. | ||
| if field == 'document_id': | ||
| return SetAccumulator(max_size=100) | ||
| elif field in ('project_description', | ||
|
|
@@ -212,9 +274,7 @@ def _accumulator(self, field) -> Accumulator | None: | |
| class ProtocolAggregator(SimpleAggregator): | ||
|
|
||
| def _accumulator(self, field) -> Accumulator | None: | ||
| if field == 'assay_type': | ||
| return FrequencySetAccumulator(max_size=100) | ||
| elif field == 'document_id': | ||
| if field == 'document_id': | ||
| # If any protocol IDs are missing from the aggregate, those | ||
| # protocols may be omitted during the verbatim handover. Some | ||
| # protocols are "hot" entity types, and we can't track their hubs in | ||
|
|
@@ -223,6 +283,8 @@ def _accumulator(self, field) -> Accumulator | None: | |
| # FIXME: Enforce that hot entity types are completely aggregated | ||
| # https://github.com/DataBiosphere/azul/issues/6793 | ||
| return SetAccumulator(max_size=100) | ||
| elif field == 'assay_type': | ||
| return FrequencySetAccumulator(max_size=100) | ||
| else: | ||
| return super()._accumulator(field) | ||
|
|
||
|
|
@@ -231,11 +293,30 @@ def _default_accumulator(self) -> Accumulator | None: | |
|
|
||
|
|
||
| class SequencingInputAggregator(SimpleAggregator): | ||
| pass | ||
|
|
||
| def _accumulator(self, field) -> Accumulator | None: | ||
| if field in ('biomaterial_id', 'document_id'): | ||
| # Added to the files aggregate in order to be included in manifests | ||
| if self.outer_entity_type == 'files': | ||
| return super()._accumulator(field) | ||
| else: | ||
| return None | ||
| else: | ||
| return super()._accumulator(field) | ||
|
|
||
|
|
||
| class SequencingProcessAggregator(SimpleAggregator): | ||
|
|
||
| def _accumulator(self, field) -> Accumulator | None: | ||
| if field == 'document_id': | ||
| # Added to the files aggregate in order to be included in manifests | ||
| if self.outer_entity_type == 'files': | ||
| return super()._accumulator(field) | ||
| else: | ||
| return None | ||
| else: | ||
| return super()._accumulator(field) | ||
|
|
||
| def _default_accumulator(self) -> Accumulator | None: | ||
| return SetAccumulator(max_size=10) | ||
|
|
||
|
|
@@ -246,7 +327,7 @@ def _accumulator(self, field) -> Accumulator | None: | |
| if field == 'document_id': | ||
| return None | ||
| elif field == 'file': | ||
| return DictAccumulator(max_size=100, key=itemgetter('uuid')) | ||
| return DictAccumulator(max_size=600, key=itemgetter('uuid')) | ||
| else: | ||
| return SetAccumulator() | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Again, I don't see the point of this.