@@ -116,11 +116,32 @@ def _default_accumulator(self) -> Accumulator | None:
116116
117117
class SampleAggregator(SimpleAggregator):
    """
    Accumulator selection for the `samples` inner entities: `document_id` gets
    no accumulator, every other field uses the inherited choice.
    """

    def _accumulator(self, field) -> Accumulator | None:
        """
        Return the accumulator for the given field name, or None if the field
        gets no accumulator.
        """
        if field == 'document_id':
            return None
        else:
            # TODO: remove sampleId filter
            # Aggregation of samples.biomaterial_id is required for filters
            # using the `sampleId` field on non-sample endpoints. For that
            # reason `biomaterial_id` is deliberately not excluded here and
            # falls through to the inherited accumulator, just like every
            # remaining field.
            return super()._accumulator(field)
120130
121131
class SpecimenAggregator(SimpleAggregator):
    """
    Accumulator selection for specimen inner entities: `biomaterial_id` gets
    no accumulator, every other field uses the inherited choice.
    """

    def _accumulator(self, field) -> Accumulator | None:
        """
        Return the accumulator for the given field name, or None if the field
        gets no accumulator.
        """
        if field == 'biomaterial_id':
            return None
        else:
            # TODO: use `if` and comment why (high cardinality, only 1 for
            #       samples), e.g. `if self.outer_entity_type == 'samples'`
            # Aggregation of `document_id` is required for the summary response
            # field `specimenCount`, which is calculated from the `samples`
            # aggregate. For that reason `document_id` is deliberately not
            # excluded here and falls through to the inherited accumulator,
            # just like every remaining field.
            return super()._accumulator(field)
124145
125146
126147class CellSuspensionAggregator (GroupingAggregator ):
@@ -143,14 +164,21 @@ def _group_keys(self, entity) -> tuple[Any, ...]:
143164 return frozenset (entity ['organ' ]),
144165
def _accumulator(self, field) -> Accumulator | None:
    """
    Return the accumulator for the given field name, or None if the field
    gets no accumulator.
    """
    # Neither of the two ID fields gets an accumulator.
    id_fields = ('document_id', 'biomaterial_id')
    if field in id_fields:
        return None
    # Cell count fields are summed, with the sum wrapped in a
    # DistinctAccumulator.
    if field in self.cell_count_fields:
        return DistinctAccumulator(SumAccumulator())
    # Anything else is left to the parent class.
    return super()._accumulator(field)
150173
151174
class CellLineAggregator(SimpleAggregator):
    """
    Accumulator selection for cell line inner entities.
    """

    def _accumulator(self, field) -> Accumulator | None:
        """
        Return None for the two ID fields and the inherited accumulator for
        any other field.
        """
        id_fields = ('document_id', 'biomaterial_id')
        return None if field in id_fields else super()._accumulator(field)
154182
155183
156184class DonorOrganismAggregator (SimpleAggregator ):
@@ -162,34 +190,39 @@ def _transform_entity(self, entity: JSON) -> JSON:
162190 }
163191
164192 def _accumulator (self , field ) -> Accumulator | None :
165- if field == 'organism_age_range' :
166- return SetAccumulator (max_size = 100 )
193+ if field == 'biomaterial_id' :
194+ return None
195+ # Aggregation of donors.document_id is required for the summary response
196+ # field `donorCount` which is calculated from the `samples` aggregate.
197+ elif field == 'document_id' :
198+ return super ()._accumulator (field )
199+ elif field == 'development_stage' :
200+ return SetAccumulator (max_size = 200 )
201+ elif field == 'organism_age_range' :
202+ return SetAccumulator (max_size = 200 )
167203 elif field == 'organism_age' :
168- return SetOfDictAccumulator (max_size = 100 ,
204+ return SetOfDictAccumulator (max_size = 200 ,
169205 key = compose_keys (none_safe_tuple_key (none_last = True ),
170206 none_safe_itemgetter ('value' , 'unit' )))
171207 elif field == 'donor_count' :
172208 return UniqueValueCountAccumulator ()
173- elif field == 'document_id' :
174- # If any donor IDs are missing from the aggregate, those donors will
175- # be omitted during the verbatim handover. Donors are a "hot" entity
176- # type, and we can't track their hubs in replica documents, so we
177- # rely on the inner entity IDs instead.
178- #
179- # FIXME: Enforce that hot entity types are completely aggregated
180- # https://github.com/DataBiosphere/azul/issues/6793
181- return SetAccumulator (max_size = 100 )
182209 else :
183210 return super ()._accumulator (field )
184211
185212
class OrganoidAggregator(SimpleAggregator):
    """
    Accumulator selection for organoid inner entities.
    """

    def _accumulator(self, field) -> Accumulator | None:
        """
        Return the inherited accumulator for every field except the two ID
        fields, which get None.
        """
        if field not in ('document_id', 'biomaterial_id'):
            return super()._accumulator(field)
        return None
189220
190221class ProjectAggregator (SimpleAggregator ):
191222
192223 def _accumulator (self , field ) -> Accumulator | None :
224+ # Aggregation of projects.document_id is required to allow filters using
225+ # the `projectId` field on non-project endpoints.
193226 if field == 'document_id' :
194227 return SetAccumulator (max_size = 100 )
195228 elif field in ('project_description' ,
@@ -212,17 +245,10 @@ def _accumulator(self, field) -> Accumulator | None:
212245class ProtocolAggregator (SimpleAggregator ):
213246
def _accumulator(self, field) -> Accumulator | None:
    """
    Return the accumulator for the given field name, or None if the field
    gets no accumulator.
    """
    if field in ('document_id', 'biomaterial_id'):
        # The ID fields get no accumulator.
        chosen = None
    elif field == 'assay_type':
        chosen = FrequencySetAccumulator(max_size=100)
    else:
        # Everything else is decided by the parent class.
        chosen = super()._accumulator(field)
    return chosen
228254
@@ -231,11 +257,22 @@ def _default_accumulator(self) -> Accumulator | None:
231257
232258
class SequencingInputAggregator(SimpleAggregator):
    """
    Accumulator selection for sequencing input inner entities.
    """

    def _accumulator(self, field) -> Accumulator | None:
        """
        Return None for the two ID fields and the inherited accumulator for
        any other field.
        """
        is_id_field = field in ('document_id', 'biomaterial_id')
        if is_id_field:
            return None
        return super()._accumulator(field)
235266
236267
class SequencingProcessAggregator(SimpleAggregator):
    """
    Accumulator selection for sequencing process inner entities.
    """

    def _accumulator(self, field) -> Accumulator | None:
        """
        Return None for the two ID fields and the inherited accumulator for
        any other field.
        """
        return (
            None
            if field in ('document_id', 'biomaterial_id') else
            super()._accumulator(field)
        )

    def _default_accumulator(self) -> Accumulator | None:
        """
        Return a new SetAccumulator limited to 10 entries.
        """
        fallback = SetAccumulator(max_size=10)
        return fallback
241278
@@ -246,15 +283,15 @@ def _accumulator(self, field) -> Accumulator | None:
246283 if field == 'document_id' :
247284 return None
248285 elif field == 'file' :
249- return DictAccumulator (max_size = 100 , key = itemgetter ('uuid' ))
286+ return DictAccumulator (max_size = 500 , key = itemgetter ('uuid' ))
250287 else :
251288 return SetAccumulator ()
252289
253290
254291class DateAggregator (SimpleAggregator ):
255292
256293 def _accumulator (self , field ) -> Accumulator | None :
257- if field == 'document_id' :
294+ if field in ( 'document_id' , 'biomaterial_id' ) :
258295 return None
259296 elif field in ('submission_date' , 'aggregate_submission_date' ):
260297 return MinAccumulator ()
0 commit comments