Score set search result optimization #525
mavedb/lib/score_sets.py

```diff
@@ -1,16 +1,17 @@
+from collections import Counter
 import csv
 import io
 import logging
-import re
 from operator import attrgetter
+import re
 from typing import Any, BinaryIO, Iterable, Optional, TYPE_CHECKING, Sequence, Literal
 
 from mavedb.models.mapped_variant import MappedVariant
 import numpy as np
 import pandas as pd
 from pandas.testing import assert_index_equal
 from sqlalchemy import Integer, and_, cast, func, or_, select
-from sqlalchemy.orm import Session, aliased, contains_eager, joinedload, selectinload
+from sqlalchemy.orm import Session, aliased, contains_eager, joinedload, Query, selectinload
 
 from mavedb.lib.exceptions import ValidationError
 from mavedb.lib.logging.context import logging_context, save_to_logging_context
```
```diff
@@ -71,11 +72,15 @@ def options(cls) -> list[str]:
         return [cls.NUCLEOTIDE, cls.TRANSCRIPT, cls.PROTEIN]
 
 
-def search_score_sets(db: Session, owner_or_contributor: Optional[User], search: ScoreSetsSearch) -> list[ScoreSet]:
-    save_to_logging_context({"score_set_search_criteria": search.model_dump()})
+def build_search_score_sets_query_filter(
+    db: Session, query: Query[ScoreSet], owner_or_contributor: Optional[User], search: ScoreSetsSearch
+):
     superseding_score_set = aliased(ScoreSet)
 
-    query = db.query(ScoreSet)  # \
-    # .filter(ScoreSet.private.is_(False))
     # Limit to unsuperseded score sets.
+    # TODO#??? Prevent unpublished superseding score sets from hiding their published precursors in search results.
     query = query.join(superseding_score_set, ScoreSet.superseding_score_set, isouter=True)
     query = query.filter(superseding_score_set.id.is_(None))
 
     if owner_or_contributor is not None:
         query = query.filter(
```
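For readers unfamiliar with the pattern, the superseded-row exclusion above is an anti-join: left-outer-join an alias for the superseding row, then keep only rows where the alias is NULL. A minimal, self-contained sketch of the same shape; the `Doc` model and its relationships are hypothetical stand-ins, not MaveDB's real schema:

```python
# Anti-join sketch: exclude rows that have a superseding row.
# The Doc model is hypothetical; only the join/filter shape mirrors the PR.
from sqlalchemy import Column, ForeignKey, Integer, String, create_engine
from sqlalchemy.orm import Session, aliased, backref, declarative_base, relationship

Base = declarative_base()


class Doc(Base):
    __tablename__ = "docs"
    id = Column(Integer, primary_key=True)
    title = Column(String)
    superseded_doc_id = Column(Integer, ForeignKey("docs.id"))
    # Many-to-one to the older doc; the backref exposes the (at most one) newer doc.
    superseded_doc = relationship(
        "Doc", remote_side=[id], backref=backref("superseding_doc", uselist=False)
    )


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)

with Session(engine) as session:
    v1 = Doc(title="v1")
    session.add_all([v1, Doc(title="v2", superseded_doc=v1), Doc(title="standalone")])
    session.flush()

    newer = aliased(Doc)
    query = (
        session.query(Doc)
        .join(newer, Doc.superseding_doc, isouter=True)  # LEFT OUTER JOIN docs AS newer
        .filter(newer.id.is_(None))  # keep docs that nothing supersedes
        .order_by(Doc.id)
    )
    print([d.title for d in query])  # ['v2', 'standalone']
```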
```diff
@@ -213,6 +218,14 @@ def search_score_sets(db: Session, owner_or_contributor: Optional[User], search:
                 )
             )
         )
+    return query
+
+
+def search_score_sets(db: Session, owner_or_contributor: Optional[User], search: ScoreSetsSearch):
+    save_to_logging_context({"score_set_search_criteria": search.model_dump()})
+
+    query = db.query(ScoreSet)
+    query = build_search_score_sets_query_filter(db, query, owner_or_contributor, search)
 
     score_sets: list[ScoreSet] = (
         query.join(ScoreSet.experiment)
```
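One detail worth flagging in this refactor: SQLAlchemy's `Query` is generative, so `filter()` and `join()` return a new `Query` and leave the receiver untouched. That is why `build_search_score_sets_query_filter` returns the query and every caller must capture the result. A toy illustration, with a hypothetical model name:

```python
# Query objects are generative: each filter() call returns a new Query.
from sqlalchemy import Column, Integer, create_engine
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()


class Row(Base):
    __tablename__ = "rows"
    id = Column(Integer, primary_key=True)


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)

with Session(engine) as session:
    session.add_all([Row(), Row()])
    session.flush()

    q = session.query(Row)
    q.filter(Row.id == 1)      # returned Query discarded; q is unchanged
    print(q.count())           # 2
    q = q.filter(Row.id == 1)  # reassignment is required for the filter to stick
    print(q.count())           # 1
```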
```diff
@@ -257,15 +270,102 @@ def search_score_sets(db: Session, owner_or_contributor: Optional[User], search:
             ),
         )
         .order_by(Experiment.title)
+        .offset(search.offset if search.offset is not None else None)
+        .limit(search.limit + 1 if search.limit is not None else None)
         .all()
     )
     if not score_sets:
         score_sets = []
 
-    save_to_logging_context({"matching_resources": len(score_sets)})
+    offset = search.offset if search.offset is not None else 0
+    num_score_sets = offset + len(score_sets)
+    if search.limit is not None and num_score_sets > offset + search.limit:
+        # In the main query, we have allowed limit + 1 results. The extra record tells us whether we need to run a
+        # count query.
+        score_sets = score_sets[: search.limit]
+        count_query = db.query(ScoreSet)
+        count_query = build_search_score_sets_query_filter(db, count_query, owner_or_contributor, search)
+        num_score_sets = count_query.order_by(None).limit(None).count()
+
+    save_to_logging_context({"matching_resources": num_score_sets})
     logger.debug(msg=f"Score set search yielded {len(score_sets)} matching resources.", extra=logging_context())
 
-    return score_sets  # filter_visible_score_sets(score_sets)
+    return {"score_sets": score_sets, "num_score_sets": num_score_sets}
```
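The `limit + 1` trick above trades an unconditional count query for a sentinel row: fetch one row more than the page size, and only run the expensive count when that extra row actually shows up; otherwise `offset + len(rows)` is already the exact total. A database-free sketch of the same logic, with illustrative function names:

```python
# Pagination with a sentinel row: count only when the page overflows.
def paginate_with_total(fetch_page, count_all, offset, limit):
    """fetch_page(offset, n) returns up to n rows; count_all() is the expensive total query."""
    rows = fetch_page(offset, limit + 1)  # ask for one extra sentinel row
    if len(rows) > limit:
        # The sentinel arrived: more rows exist beyond this page, so pay for the count.
        return rows[:limit], count_all()
    # The page came back short: the total is known without counting.
    return rows, offset + len(rows)


data = list(range(7))  # stand-in for the filtered score set query
fetch = lambda off, n: data[off : off + n]
print(paginate_with_total(fetch, lambda: len(data), offset=0, limit=3))  # ([0, 1, 2], 7)
print(paginate_with_total(fetch, lambda: len(data), offset=6, limit=3))  # ([6], 7)
```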
```diff
+def score_set_search_filter_options_from_counter(counter: Counter):
+    return [{"value": value, "count": count} for value, count in counter.items()]
+
+
+def fetch_score_set_search_filter_options(db: Session, owner_or_contributor: Optional[User], search: ScoreSetsSearch):
+    save_to_logging_context({"score_set_search_criteria": search.model_dump()})
+
+    query = db.query(ScoreSet)
+    query = build_search_score_sets_query_filter(db, query, owner_or_contributor, search)
+
+    score_sets: list[ScoreSet] = query.all()
+    if not score_sets:
+        score_sets = []
+
+    target_category_counter: Counter[str] = Counter()
+    target_name_counter: Counter[str] = Counter()
+    target_organism_name_counter: Counter[str] = Counter()
+    target_accession_counter: Counter[str] = Counter()
+    for score_set in score_sets:
+        for target in getattr(score_set, "target_genes", []):
+            category = getattr(target, "category", None)
+            if category:
+                target_category_counter[category] += 1
+
+            name = getattr(target, "name", None)
+            if name:
+                target_name_counter[name] += 1
+
+            target_sequence = getattr(target, "target_sequence", None)
+            taxonomy = getattr(target_sequence, "taxonomy", None)
+            organism_name = getattr(taxonomy, "organism_name", None)
+            if organism_name:
+                target_organism_name_counter[organism_name] += 1
+
+            target_accession = getattr(target, "target_accession", None)
+            accession = getattr(target_accession, "accession", None)
+            if accession:
+                target_accession_counter[accession] += 1
+
+    publication_author_name_counter: Counter[str] = Counter()
+    publication_db_name_counter: Counter[str] = Counter()
+    publication_journal_counter: Counter[str] = Counter()
+    for score_set in score_sets:
+        for publication_association in getattr(score_set, "publication_identifier_associations", []):
+            publication = getattr(publication_association, "publication", None)
+
+            authors = getattr(publication, "authors", [])
+            for author in authors:
+                name = author.get("name")
+                if name:
+                    publication_author_name_counter[name] += 1
+
+            db_name = getattr(publication, "db_name", None)
+            if db_name:
+                publication_db_name_counter[db_name] += 1
+
+            journal = getattr(publication, "publication_journal", None)
+            if journal:
+                publication_journal_counter[journal] += 1
+
+    logger.debug(msg="Score set search filter options were fetched.", extra=logging_context())
+
+    return {
+        "target_gene_categories": score_set_search_filter_options_from_counter(target_category_counter),
+        "target_gene_names": score_set_search_filter_options_from_counter(target_name_counter),
+        "target_organism_names": score_set_search_filter_options_from_counter(target_organism_name_counter),
+        "target_accessions": score_set_search_filter_options_from_counter(target_accession_counter),
+        "publication_author_names": score_set_search_filter_options_from_counter(publication_author_name_counter),
+        "publication_db_names": score_set_search_filter_options_from_counter(publication_db_name_counter),
+        "publication_journals": score_set_search_filter_options_from_counter(publication_journal_counter),
+    }
+
+
 def fetch_superseding_score_set_in_search_result(
```

Collaborator (on fetch_score_set_search_filter_options):
I'm struggling a little with the duplication in this function. I'm not sure there's an easy way to abstract the counters in a way that's readable, but we could add a helper that reduces the duplication in the counter-to-value-dictionary generation. That would at least get rid of a bunch of the duplicated list comprehensions, which are a little verbose.

```python
def _counterHelper(counter: Counter):
    return [{"value": value, "count": count} for value, count in counter.items()]

...

return {
    "target_gene_categories": _counterHelper(target_category_counter),
    ...
    "publication_journals": _counterHelper(publication_journals),
}
```
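Each counter above is serialized into a list of value/count facets by `score_set_search_filter_options_from_counter`. For a sense of the shape, here is a hypothetical example of the dictionary `fetch_score_set_search_filter_options` returns; the keys match the function, but every value and count is made up:

```python
# Illustrative return value only; all values and counts are invented.
{
    "target_gene_categories": [{"value": "protein_coding", "count": 12}],
    "target_gene_names": [{"value": "BRCA1", "count": 4}, {"value": "TP53", "count": 2}],
    "target_organism_names": [{"value": "Homo sapiens", "count": 6}],
    "target_accessions": [{"value": "NM_007294.4", "count": 3}],
    "publication_author_names": [{"value": "Jane Doe", "count": 5}],
    "publication_db_names": [{"value": "PubMed", "count": 9}],
    "publication_journals": [{"value": "Genome Biology", "count": 2}],
}
```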
mavedb/routers/score_sets.py

```diff
@@ -45,12 +45,12 @@
 from mavedb.lib.permissions import Action, assert_permission, has_permission
 from mavedb.lib.score_sets import (
     csv_data_to_df,
+    fetch_score_set_search_filter_options,
     find_meta_analyses_for_experiment_sets,
     get_score_set_variants_as_csv,
     variants_to_csv_rows,
 )
 from mavedb.lib.score_sets import (
-    fetch_superseding_score_set_in_search_result,
     search_score_sets as _search_score_sets,
     refresh_variant_urns,
 )
@@ -74,10 +74,13 @@
 from mavedb.models.target_sequence import TargetSequence
 from mavedb.models.variant import Variant
 from mavedb.view_models import mapped_variant, score_set, clinical_control, score_range, gnomad_variant
-from mavedb.view_models.search import ScoreSetsSearch
+from mavedb.view_models.search import ScoreSetsSearch, ScoreSetsSearchFilterOptionsResponse, ScoreSetsSearchResponse
 
 logger = logging.getLogger(__name__)
 
+SCORE_SET_SEARCH_MAX_LIMIT = 100
+SCORE_SET_SEARCH_MAX_PUBLICATION_IDENTIFIERS = 40
+
 
 async def fetch_score_set_by_urn(
     db, urn: str, user: Optional[UserData], owner_or_contributor: Optional[UserData], only_published: bool
```
```diff
@@ -134,26 +137,64 @@ async def fetch_score_set_by_urn(
     )
 
 
-@router.post("/score-sets/search", status_code=200, response_model=list[score_set.ShortScoreSet])
+@router.post("/score-sets/search", status_code=200, response_model=ScoreSetsSearchResponse)
 def search_score_sets(
     search: ScoreSetsSearch,
     db: Session = Depends(deps.get_db),
     user_data: Optional[UserData] = Depends(get_current_user),
-) -> Any:  # = Body(..., embed=True),
+) -> Any:
     """
     Search score sets.
     """
-    score_sets = _search_score_sets(db, None, search)
-    updated_score_sets = fetch_superseding_score_set_in_search_result(score_sets, user_data, search)
+    # Disallow searches for unpublished score sets via this endpoint.
+    if search.published is False:
+        raise HTTPException(
+            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+            detail="Cannot search for private score sets except in the context of the current user's data.",
+        )
+    search.published = True
+
+    # Require a limit of at most SCORE_SET_SEARCH_MAX_LIMIT when the search query does not include publication
+    # identifiers. We allow unlimited searches with publication identifiers, presuming that such a search will not have
+    # excessive results.
+    if search.publication_identifiers is None and search.limit is None:
+        search.limit = SCORE_SET_SEARCH_MAX_LIMIT
+    elif search.publication_identifiers is None and (search.limit is None or search.limit > SCORE_SET_SEARCH_MAX_LIMIT):
+        raise HTTPException(
+            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+            detail=f"Cannot search for more than {SCORE_SET_SEARCH_MAX_LIMIT} score sets at a time. Please use the offset and limit parameters to run a paginated search.",
+        )
+
+    # Also limit the search to at most SCORE_SET_SEARCH_MAX_PUBLICATION_IDENTIFIERS publication identifiers, to prevent
+    # artificially constructed searches that return very large result sets.
+    if (
+        search.publication_identifiers is not None
+        and len(search.publication_identifiers) > SCORE_SET_SEARCH_MAX_PUBLICATION_IDENTIFIERS
+    ):
+        raise HTTPException(
+            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+            detail=f"Cannot search for score sets belonging to more than {SCORE_SET_SEARCH_MAX_PUBLICATION_IDENTIFIERS} publication identifiers at once.",
+        )
+
+    score_sets, num_score_sets = _search_score_sets(db, None, search).values()
     enriched_score_sets = []
-    if search.include_experiment_score_set_urns_and_count and updated_score_sets:
-        for u in updated_score_sets:
-            enriched_experiment = enrich_experiment_with_num_score_sets(u.experiment, user_data)
-            response_item = score_set.ScoreSet.model_validate(u).copy(update={"experiment": enriched_experiment})
+    if search.include_experiment_score_set_urns_and_count:
+        for ss in score_sets:
+            enriched_experiment = enrich_experiment_with_num_score_sets(ss.experiment, user_data)
+            response_item = score_set.ScoreSet.model_validate(ss).copy(update={"experiment": enriched_experiment})
             enriched_score_sets.append(response_item)
+        score_sets = enriched_score_sets
+
+    return {"score_sets": score_sets, "num_score_sets": num_score_sets}
 
-    return enriched_score_sets if search.include_experiment_score_set_urns_and_count else updated_score_sets
 
+@router.post("/score-sets/search/filter-options", status_code=200, response_model=ScoreSetsSearchFilterOptionsResponse)
+def get_filter_options_for_search(
+    search: ScoreSetsSearch,
+    db: Session = Depends(deps.get_db),
+) -> Any:
+    return fetch_score_set_search_filter_options(db, None, search)
 
 
 @router.get("/score-sets/mapped-genes", status_code=200, response_model=dict[str, list[str]])
```

Collaborator (on lines +151 to +155):
It'd be nice for this block to have an associated test.
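Putting the new response shape and limits together, a client pages through results by walking `offset` until it reaches the reported total. A hypothetical client sketch follows; the endpoint path, limit/offset semantics, and the two response fields come from this PR, while the host, the `text` search criterion, and the snake_case response keys are assumptions:

```python
# Hypothetical paginated client for POST /score-sets/search.
import requests

BASE = "https://mavedb.example.org/api/v1"  # placeholder host, not the real deployment
PAGE = 100  # keep at or below SCORE_SET_SEARCH_MAX_LIMIT for non-publication searches

results, offset = [], 0
while True:
    resp = requests.post(
        f"{BASE}/score-sets/search",
        json={"text": "kinase", "offset": offset, "limit": PAGE},  # "text" field assumed
        timeout=30,
    )
    resp.raise_for_status()
    body = resp.json()
    results.extend(body["score_sets"])
    offset += PAGE
    # num_score_sets is the exact total, so stop once the pages cover it.
    if offset >= body["num_score_sets"] or not body["score_sets"]:
        break

print(f"fetched {len(results)} of {body['num_score_sets']} matching score sets")
```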
```diff
@@ -190,26 +231,24 @@ def score_set_mapped_gene_mapping(
 @router.post(
     "/me/score-sets/search",
     status_code=200,
-    response_model=list[score_set.ShortScoreSet],
+    response_model=ScoreSetsSearchResponse,
 )
 def search_my_score_sets(
-    search: ScoreSetsSearch,  # = Body(..., embed=True),
+    search: ScoreSetsSearch,
     db: Session = Depends(deps.get_db),
     user_data: UserData = Depends(require_current_user),
 ) -> Any:
     """
     Search score sets created by the current user.
     """
-    score_sets = _search_score_sets(db, user_data.user, search)
-    updated_score_sets = fetch_superseding_score_set_in_search_result(score_sets, user_data, search)
+    score_sets, num_score_sets = _search_score_sets(db, user_data.user, search).values()
     enriched_score_sets = []
-    if updated_score_sets:
-        for u in updated_score_sets:
-            enriched_experiment = enrich_experiment_with_num_score_sets(u.experiment, user_data)
-            response_item = score_set.ScoreSet.model_validate(u).copy(update={"experiment": enriched_experiment})
-            enriched_score_sets.append(response_item)
+    for ss in score_sets:
+        enriched_experiment = enrich_experiment_with_num_score_sets(ss.experiment, user_data)
+        response_item = score_set.ScoreSet.model_validate(ss).copy(update={"experiment": enriched_experiment})
+        enriched_score_sets.append(response_item)
 
-    return enriched_score_sets
+    return {"score_sets": enriched_score_sets, "num_score_sets": num_score_sets}
 
 
 @router.get(
```
Review comment:
Did we end up opening an issue for this we can add the number for?