Skip to content

Commit 39bd2c4

Browse files
committed
[r] Remove AnVIL fields from aggregation (#6793)
1 parent 6f6a802 commit 39bd2c4

File tree

4 files changed: 53 additions and 602 deletions.

src/azul/plugins/metadata/anvil/__init__.py

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -314,12 +314,26 @@ def manifest_config(self) -> ManifestConfig:
314314
# Note that there is a brittle coupling that must be maintained between
315315
# the fields listed here and those used in `self._field_mapping`.
316316
fields_to_omit_from_manifest: list[FieldPath] = [
317+
('contents', 'activities', 'activity_id'),
317318
('contents', 'activities', 'activity_table'),
319+
('contents', 'activities', 'document_id'),
320+
('contents', 'activities', 'source_datarepo_row_ids'),
321+
('contents', 'biosamples', 'biosample_id'),
322+
('contents', 'biosamples', 'document_id'),
323+
('contents', 'biosamples', 'source_datarepo_row_ids'),
324+
('contents', 'datasets', 'document_id'),
318325
# We omit the `duos_id` field from manifests since there is only one
319326
# DUOS bundle per dataset, and that bundle only contributes to outer
320327
# entities of the `datasets` type, not to entities of the other
321328
# types, such as files, which the manifest is generated from.
322329
('contents', 'datasets', 'duos_id'),
330+
('contents', 'datasets', 'source_datarepo_row_ids'),
331+
('contents', 'diagnoses', 'diagnosis_id'),
332+
('contents', 'diagnoses', 'document_id'),
333+
('contents', 'diagnoses', 'source_datarepo_row_ids'),
334+
('contents', 'donors', 'document_id'),
335+
('contents', 'donors', 'donor_id'),
336+
('contents', 'donors', 'source_datarepo_row_ids'),
323337
('contents', 'files', 'version'),
324338
]
325339

src/azul/plugins/metadata/anvil/indexer/aggregate.py

Lines changed: 38 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -23,13 +23,20 @@
2323

2424

2525
class ActivityAggregator(SimpleAggregator):
26-
pass
26+
27+
def _accumulator(self, field: str) -> Accumulator | None:
28+
if field in ('activity_id', 'document_id', 'source_datarepo_row_ids'):
29+
return None
30+
else:
31+
return super()._accumulator(field)
2732

2833

2934
class BiosampleAggregator(SimpleAggregator):
3035

3136
def _accumulator(self, field: str) -> Accumulator | None:
32-
if field == 'donor_age_at_collection':
37+
if field in ('biosample_id', 'document_id', 'source_datarepo_row_ids'):
38+
return None
39+
elif field == 'donor_age_at_collection':
3340
return SetOfDictAccumulator(max_size=100,
3441
key=compose_keys(none_safe_tuple_key(none_last=True),
3542
itemgetter('lte', 'gte')))
@@ -38,13 +45,23 @@ def _accumulator(self, field: str) -> Accumulator | None:
3845

3946

4047
class DatasetAggregator(SimpleAggregator):
41-
pass
48+
49+
def _accumulator(self, field: str) -> Accumulator | None:
50+
if field == 'source_datarepo_row_ids':
51+
return None
52+
# Aggregation of `document_id` is required for the creation of manifests
53+
elif field == 'document_id':
54+
return super()._accumulator(field)
55+
else:
56+
return super()._accumulator(field)
4257

4358

4459
class DiagnosisAggregator(SimpleAggregator):
4560

4661
def _accumulator(self, field: str) -> Accumulator | None:
47-
if field in ('diagnosis_age', 'onset_age'):
62+
if field in ('diagnosis_id', 'document_id', 'source_datarepo_row_ids'):
63+
return None
64+
elif field in ('diagnosis_age', 'onset_age'):
4865
return SetOfDictAccumulator(max_size=100,
4966
key=compose_keys(none_safe_tuple_key(none_last=True),
5067
itemgetter('lte', 'gte')))
@@ -53,7 +70,12 @@ def _accumulator(self, field: str) -> Accumulator | None:
5370

5471

5572
class DonorAggregator(SimpleAggregator):
56-
pass
73+
74+
def _accumulator(self, field: str) -> Accumulator | None:
75+
if field in ('document_id', 'donor_id', 'source_datarepo_row_ids'):
76+
return None
77+
else:
78+
return super()._accumulator(field)
5779

5880

5981
class FileAggregator(GroupingAggregator):
@@ -72,7 +94,17 @@ def _group_keys(self, entity) -> tuple[Any, ...]:
7294
return entity['file_format'],
7395

7496
def _accumulator(self, field: str) -> Accumulator | None:
75-
if field in ('count', 'file_size'):
97+
if field in (
98+
'document_id',
99+
'drs_uri',
100+
'file_id',
101+
'file_md5sum',
102+
'file_name',
103+
'source_datarepo_row_ids',
104+
'version',
105+
):
106+
return None
107+
elif field in ('count', 'file_size'):
76108
return DistinctAccumulator(SumAccumulator())
77109
else:
78110
return super()._accumulator(field)

0 commit comments

Comments
 (0)