Skip to content

Commit c6a2014

Browse files
authored
Merge pull request #525 from VariantEffect/jstone-dev/score-set-search-result-optimization
Score set search result optimization
2 parents a150ebd + 7f0688b commit c6a2014

File tree

4 files changed

+328
-60
lines changed

4 files changed

+328
-60
lines changed

src/mavedb/lib/score_sets.py

Lines changed: 108 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,17 @@
1+
from collections import Counter
12
import csv
23
import io
34
import logging
4-
import re
55
from operator import attrgetter
6+
import re
67
from typing import Any, BinaryIO, Iterable, Optional, TYPE_CHECKING, Sequence, Literal
78

89
from mavedb.models.mapped_variant import MappedVariant
910
import numpy as np
1011
import pandas as pd
1112
from pandas.testing import assert_index_equal
1213
from sqlalchemy import Integer, and_, cast, func, or_, select
13-
from sqlalchemy.orm import Session, aliased, contains_eager, joinedload, selectinload
14+
from sqlalchemy.orm import Session, aliased, contains_eager, joinedload, Query, selectinload
1415

1516
from mavedb.lib.exceptions import ValidationError
1617
from mavedb.lib.logging.context import logging_context, save_to_logging_context
@@ -71,11 +72,15 @@ def options(cls) -> list[str]:
7172
return [cls.NUCLEOTIDE, cls.TRANSCRIPT, cls.PROTEIN]
7273

7374

74-
def search_score_sets(db: Session, owner_or_contributor: Optional[User], search: ScoreSetsSearch) -> list[ScoreSet]:
75-
save_to_logging_context({"score_set_search_criteria": search.model_dump()})
75+
def build_search_score_sets_query_filter(
76+
db: Session, query: Query[ScoreSet], owner_or_contributor: Optional[User], search: ScoreSetsSearch
77+
):
78+
superseding_score_set = aliased(ScoreSet)
7679

77-
query = db.query(ScoreSet) # \
78-
# .filter(ScoreSet.private.is_(False))
80+
# Limit to unsuperseded score sets.
81+
# TODO#??? Prevent unpublished superseding score sets from hiding their published precursors in search results.
82+
query = query.join(superseding_score_set, ScoreSet.superseding_score_set, isouter=True)
83+
query = query.filter(superseding_score_set.id.is_(None))
7984

8085
if owner_or_contributor is not None:
8186
query = query.filter(
@@ -213,6 +218,14 @@ def search_score_sets(db: Session, owner_or_contributor: Optional[User], search:
213218
)
214219
)
215220
)
221+
return query
222+
223+
224+
def search_score_sets(db: Session, owner_or_contributor: Optional[User], search: ScoreSetsSearch):
225+
save_to_logging_context({"score_set_search_criteria": search.model_dump()})
226+
227+
query = db.query(ScoreSet)
228+
query = build_search_score_sets_query_filter(db, query, owner_or_contributor, search)
216229

217230
score_sets: list[ScoreSet] = (
218231
query.join(ScoreSet.experiment)
@@ -257,15 +270,102 @@ def search_score_sets(db: Session, owner_or_contributor: Optional[User], search:
257270
),
258271
)
259272
.order_by(Experiment.title)
273+
.offset(search.offset if search.offset is not None else None)
274+
.limit(search.limit + 1 if search.limit is not None else None)
260275
.all()
261276
)
262277
if not score_sets:
263278
score_sets = []
264279

265-
save_to_logging_context({"matching_resources": len(score_sets)})
280+
offset = search.offset if search.offset is not None else 0
281+
num_score_sets = offset + len(score_sets)
282+
if search.limit is not None and num_score_sets > offset + search.limit:
283+
# In the main query, we have allowed limit + 1 results. The extra record tells us whether we need to run a count
284+
# query.
285+
score_sets = score_sets[: search.limit]
286+
count_query = db.query(ScoreSet)
287+
build_search_score_sets_query_filter(db, count_query, owner_or_contributor, search)
288+
num_score_sets = count_query.order_by(None).limit(None).count()
289+
290+
save_to_logging_context({"matching_resources": num_score_sets})
266291
logger.debug(msg=f"Score set search yielded {len(score_sets)} matching resources.", extra=logging_context())
267292

268-
return score_sets # filter_visible_score_sets(score_sets)
293+
return {"score_sets": score_sets, "num_score_sets": num_score_sets}
294+
295+
296+
def score_set_search_filter_options_from_counter(counter: Counter):
297+
return [{"value": value, "count": count} for value, count in counter.items()]
298+
299+
300+
def fetch_score_set_search_filter_options(db: Session, owner_or_contributor: Optional[User], search: ScoreSetsSearch):
301+
save_to_logging_context({"score_set_search_criteria": search.model_dump()})
302+
303+
query = db.query(ScoreSet)
304+
query = build_search_score_sets_query_filter(db, query, owner_or_contributor, search)
305+
306+
score_sets: list[ScoreSet] = query.all()
307+
if not score_sets:
308+
score_sets = []
309+
310+
target_category_counter: Counter[str] = Counter()
311+
target_name_counter: Counter[str] = Counter()
312+
target_organism_name_counter: Counter[str] = Counter()
313+
target_accession_counter: Counter[str] = Counter()
314+
for score_set in score_sets:
315+
for target in getattr(score_set, "target_genes", []):
316+
category = getattr(target, "category", None)
317+
if category:
318+
target_category_counter[category] += 1
319+
320+
name = getattr(target, "name", None)
321+
if name:
322+
target_name_counter[name] += 1
323+
324+
target_sequence = getattr(target, "target_sequence", None)
325+
taxonomy = getattr(target_sequence, "taxonomy", None)
326+
organism_name = getattr(taxonomy, "organism_name", None)
327+
328+
if organism_name:
329+
target_organism_name_counter[organism_name] += 1
330+
331+
target_accession = getattr(target, "target_accession", None)
332+
accession = getattr(target_accession, "accession", None)
333+
334+
if accession:
335+
target_accession_counter[accession] += 1
336+
337+
publication_author_name_counter: Counter[str] = Counter()
338+
publication_db_name_counter: Counter[str] = Counter()
339+
publication_journal_counter: Counter[str] = Counter()
340+
for score_set in score_sets:
341+
for publication_association in getattr(score_set, "publication_identifier_associations", []):
342+
publication = getattr(publication_association, "publication", None)
343+
344+
authors = getattr(publication, "authors", [])
345+
for author in authors:
346+
name = author.get("name")
347+
if name:
348+
publication_author_name_counter[name] += 1
349+
350+
db_name = getattr(publication, "db_name", None)
351+
if db_name:
352+
publication_db_name_counter[db_name] += 1
353+
354+
journal = getattr(publication, "publication_journal", None)
355+
if journal:
356+
publication_journal_counter[journal] += 1
357+
358+
logger.debug(msg="Score set search filter options were fetched.", extra=logging_context())
359+
360+
return {
361+
"target_gene_categories": score_set_search_filter_options_from_counter(target_category_counter),
362+
"target_gene_names": score_set_search_filter_options_from_counter(target_name_counter),
363+
"target_organism_names": score_set_search_filter_options_from_counter(target_organism_name_counter),
364+
"target_accessions": score_set_search_filter_options_from_counter(target_accession_counter),
365+
"publication_author_names": score_set_search_filter_options_from_counter(publication_author_name_counter),
366+
"publication_db_names": score_set_search_filter_options_from_counter(publication_db_name_counter),
367+
"publication_journals": score_set_search_filter_options_from_counter(publication_journal_counter),
368+
}
269369

270370

271371
def fetch_superseding_score_set_in_search_result(

src/mavedb/routers/score_sets.py

Lines changed: 60 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -45,12 +45,12 @@
4545
from mavedb.lib.permissions import Action, assert_permission, has_permission
4646
from mavedb.lib.score_sets import (
4747
csv_data_to_df,
48+
fetch_score_set_search_filter_options,
4849
find_meta_analyses_for_experiment_sets,
4950
get_score_set_variants_as_csv,
5051
variants_to_csv_rows,
5152
)
5253
from mavedb.lib.score_sets import (
53-
fetch_superseding_score_set_in_search_result,
5454
search_score_sets as _search_score_sets,
5555
refresh_variant_urns,
5656
)
@@ -74,10 +74,13 @@
7474
from mavedb.models.target_sequence import TargetSequence
7575
from mavedb.models.variant import Variant
7676
from mavedb.view_models import mapped_variant, score_set, clinical_control, score_range, gnomad_variant
77-
from mavedb.view_models.search import ScoreSetsSearch
77+
from mavedb.view_models.search import ScoreSetsSearch, ScoreSetsSearchFilterOptionsResponse, ScoreSetsSearchResponse
7878

7979
logger = logging.getLogger(__name__)
8080

81+
SCORE_SET_SEARCH_MAX_LIMIT = 100
82+
SCORE_SET_SEARCH_MAX_PUBLICATION_IDENTIFIERS = 40
83+
8184

8285
async def fetch_score_set_by_urn(
8386
db, urn: str, user: Optional[UserData], owner_or_contributor: Optional[UserData], only_published: bool
@@ -134,26 +137,64 @@ async def fetch_score_set_by_urn(
134137
)
135138

136139

137-
@router.post("/score-sets/search", status_code=200, response_model=list[score_set.ShortScoreSet])
140+
@router.post("/score-sets/search", status_code=200, response_model=ScoreSetsSearchResponse)
138141
def search_score_sets(
139142
search: ScoreSetsSearch,
140143
db: Session = Depends(deps.get_db),
141144
user_data: Optional[UserData] = Depends(get_current_user),
142-
) -> Any: # = Body(..., embed=True),
145+
) -> Any:
143146
"""
144147
Search score sets.
145148
"""
146-
score_sets = _search_score_sets(db, None, search)
147-
updated_score_sets = fetch_superseding_score_set_in_search_result(score_sets, user_data, search)
148149

150+
# Disallow searches for unpublished score sets via this endpoint.
151+
if search.published is False:
152+
raise HTTPException(
153+
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
154+
detail="Cannot search for private score sets except in the context of the current user's data.",
155+
)
156+
search.published = True
157+
158+
# Require a limit of at most SCORE_SET_SEARCH_MAX_LIMIT when the search query does not include publication
159+
# identifiers. We allow unlimited searches with publication identifiers, presuming that such a search will not have
160+
# excessive results.
161+
if search.publication_identifiers is None and search.limit is None:
162+
search.limit = SCORE_SET_SEARCH_MAX_LIMIT
163+
elif search.publication_identifiers is None and (search.limit is None or search.limit > SCORE_SET_SEARCH_MAX_LIMIT):
164+
raise HTTPException(
165+
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
166+
detail=f"Cannot search for more than {SCORE_SET_SEARCH_MAX_LIMIT} score sets at a time. Please use the offset and limit parameters to run a paginated search.",
167+
)
168+
169+
# Also limit the search to at most SCORE_SET_SEARCH_MAX_PUBLICATION_IDENTIFIERS publication identifiers, to prevent
170+
# artificially constructed searches that return very large result sets.
171+
if (
172+
search.publication_identifiers is not None
173+
and len(search.publication_identifiers) > SCORE_SET_SEARCH_MAX_PUBLICATION_IDENTIFIERS
174+
):
175+
raise HTTPException(
176+
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
177+
detail=f"Cannot search for score sets belonging to more than {SCORE_SET_SEARCH_MAX_PUBLICATION_IDENTIFIERS} publication identifiers at once.",
178+
)
179+
180+
score_sets, num_score_sets = _search_score_sets(db, None, search).values()
149181
enriched_score_sets = []
150-
if search.include_experiment_score_set_urns_and_count and updated_score_sets:
151-
for u in updated_score_sets:
152-
enriched_experiment = enrich_experiment_with_num_score_sets(u.experiment, user_data)
153-
response_item = score_set.ScoreSet.model_validate(u).copy(update={"experiment": enriched_experiment})
182+
if search.include_experiment_score_set_urns_and_count:
183+
for ss in score_sets:
184+
enriched_experiment = enrich_experiment_with_num_score_sets(ss.experiment, user_data)
185+
response_item = score_set.ScoreSet.model_validate(ss).copy(update={"experiment": enriched_experiment})
154186
enriched_score_sets.append(response_item)
187+
score_sets = enriched_score_sets
188+
189+
return {"score_sets": score_sets, "num_score_sets": num_score_sets}
190+
155191

156-
return enriched_score_sets if search.include_experiment_score_set_urns_and_count else updated_score_sets
192+
@router.post("/score-sets/search/filter-options", status_code=200, response_model=ScoreSetsSearchFilterOptionsResponse)
193+
def get_filter_options_for_search(
194+
search: ScoreSetsSearch,
195+
db: Session = Depends(deps.get_db),
196+
) -> Any:
197+
return fetch_score_set_search_filter_options(db, None, search)
157198

158199

159200
@router.get("/score-sets/mapped-genes", status_code=200, response_model=dict[str, list[str]])
@@ -190,26 +231,24 @@ def score_set_mapped_gene_mapping(
190231
@router.post(
191232
"/me/score-sets/search",
192233
status_code=200,
193-
response_model=list[score_set.ShortScoreSet],
234+
response_model=ScoreSetsSearchResponse,
194235
)
195236
def search_my_score_sets(
196-
search: ScoreSetsSearch, # = Body(..., embed=True),
237+
search: ScoreSetsSearch,
197238
db: Session = Depends(deps.get_db),
198239
user_data: UserData = Depends(require_current_user),
199240
) -> Any:
200241
"""
201242
Search score sets created by the current user..
202243
"""
203-
score_sets = _search_score_sets(db, user_data.user, search)
204-
updated_score_sets = fetch_superseding_score_set_in_search_result(score_sets, user_data, search)
244+
score_sets, num_score_sets = _search_score_sets(db, user_data.user, search).values()
205245
enriched_score_sets = []
206-
if updated_score_sets:
207-
for u in updated_score_sets:
208-
enriched_experiment = enrich_experiment_with_num_score_sets(u.experiment, user_data)
209-
response_item = score_set.ScoreSet.model_validate(u).copy(update={"experiment": enriched_experiment})
210-
enriched_score_sets.append(response_item)
246+
for ss in score_sets:
247+
enriched_experiment = enrich_experiment_with_num_score_sets(ss.experiment, user_data)
248+
response_item = score_set.ScoreSet.model_validate(ss).copy(update={"experiment": enriched_experiment})
249+
enriched_score_sets.append(response_item)
211250

212-
return enriched_score_sets
251+
return {"score_sets": enriched_score_sets, "num_score_sets": num_score_sets}
213252

214253

215254
@router.get(

src/mavedb/view_models/search.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from typing import Optional
22

33
from mavedb.view_models.base.base import BaseModel
4+
from mavedb.view_models.score_set import ShortScoreSet
45

56

67
class ExperimentsSearch(BaseModel):
@@ -27,6 +28,37 @@ class ScoreSetsSearch(BaseModel):
2728
keywords: Optional[list[str]] = None
2829
text: Optional[str] = None
2930
include_experiment_score_set_urns_and_count: Optional[bool] = True
31+
offset: Optional[int] = None
32+
limit: Optional[int] = None
33+
34+
35+
class ScoreSetsSearchResponse(BaseModel):
36+
score_sets: list[ShortScoreSet]
37+
num_score_sets: int
38+
39+
class Config:
40+
from_attributes = True
41+
42+
43+
class ScoreSetsSearchFilterOption(BaseModel):
44+
value: str
45+
count: int
46+
47+
class Config:
48+
from_attributes = True
49+
50+
51+
class ScoreSetsSearchFilterOptionsResponse(BaseModel):
52+
target_gene_categories: list[ScoreSetsSearchFilterOption]
53+
target_gene_names: list[ScoreSetsSearchFilterOption]
54+
target_organism_names: list[ScoreSetsSearchFilterOption]
55+
target_accessions: list[ScoreSetsSearchFilterOption]
56+
publication_author_names: list[ScoreSetsSearchFilterOption]
57+
publication_db_names: list[ScoreSetsSearchFilterOption]
58+
publication_journals: list[ScoreSetsSearchFilterOption]
59+
60+
class Config:
61+
from_attributes = True
3062

3163

3264
class TextSearch(BaseModel):

0 commit comments

Comments (0)