Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
eb591af
Apply limit to score set searches.
jstone-dev Sep 30, 2025
0c1544f
Add an endpoint to obtain search filter options based on a given scor…
jstone-dev Sep 30, 2025
041bfb9
Allow score set search without a row limit when publication IDs are s…
jstone-dev Sep 30, 2025
1e5e6ed
MyPy: typing for counters
jstone-dev Sep 30, 2025
a3a1372
Update unit tests to reflect score set search endpoint change.
jstone-dev Sep 30, 2025
b6b3720
Code formatting
jstone-dev Sep 30, 2025
6331d55
Unit test fixes
jstone-dev Sep 30, 2025
fd6e701
Format & test fixes
jstone-dev Sep 30, 2025
9bcb0cf
Test bug fixes
jstone-dev Sep 30, 2025
ea62984
Unit test fixes
jstone-dev Oct 1, 2025
765f02f
Test bug fix
jstone-dev Oct 1, 2025
0ef53bf
Refactor counter usage for score set search filters.
jstone-dev Oct 24, 2025
6a67c84
Log the total number of matching score sets rather than the number re…
jstone-dev Oct 24, 2025
52cb863
Add an offset parameter to support full pagination of score set searc…
jstone-dev Oct 24, 2025
b9ff02b
Move score set search limits into constants.
jstone-dev Oct 24, 2025
db53a0e
Supply a default search limit.
jstone-dev Oct 24, 2025
3be7945
Unit tests for new score set search errors.
jstone-dev Oct 24, 2025
0508a4f
Don't import from router in test_score_set.py.
jstone-dev Oct 24, 2025
be3d522
Linting fix
jstone-dev Oct 24, 2025
ebde590
Return correct result count in paginated results with offset.
jstone-dev Oct 24, 2025
38d974e
Merge branch 'release-2025.5.0' into jstone-dev/score-set-search-resu…
jstone-dev Oct 29, 2025
e0abfe0
Fix after merge
jstone-dev Oct 29, 2025
7f0688b
Return unenriched score sets when include_experiment_score_set_urns_a…
jstone-dev Nov 5, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 108 additions & 8 deletions src/mavedb/lib/score_sets.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
from collections import Counter
import csv
import io
import logging
import re
from operator import attrgetter
import re
from typing import Any, BinaryIO, Iterable, Optional, TYPE_CHECKING, Sequence, Literal

from mavedb.models.mapped_variant import MappedVariant
import numpy as np
import pandas as pd
from pandas.testing import assert_index_equal
from sqlalchemy import Integer, and_, cast, func, or_, select
from sqlalchemy.orm import Session, aliased, contains_eager, joinedload, selectinload
from sqlalchemy.orm import Session, aliased, contains_eager, joinedload, Query, selectinload

from mavedb.lib.exceptions import ValidationError
from mavedb.lib.logging.context import logging_context, save_to_logging_context
Expand Down Expand Up @@ -71,11 +72,15 @@ def options(cls) -> list[str]:
return [cls.NUCLEOTIDE, cls.TRANSCRIPT, cls.PROTEIN]


def search_score_sets(db: Session, owner_or_contributor: Optional[User], search: ScoreSetsSearch) -> list[ScoreSet]:
save_to_logging_context({"score_set_search_criteria": search.model_dump()})
def build_search_score_sets_query_filter(
db: Session, query: Query[ScoreSet], owner_or_contributor: Optional[User], search: ScoreSetsSearch
):
superseding_score_set = aliased(ScoreSet)

query = db.query(ScoreSet) # \
# .filter(ScoreSet.private.is_(False))
# Limit to unsuperseded score sets.
# TODO#??? Prevent unpublished superseding score sets from hiding their published precursors in search results.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did we end up opening an issue for this we can add the number for?

query = query.join(superseding_score_set, ScoreSet.superseding_score_set, isouter=True)
query = query.filter(superseding_score_set.id.is_(None))

if owner_or_contributor is not None:
query = query.filter(
Expand Down Expand Up @@ -213,6 +218,14 @@ def search_score_sets(db: Session, owner_or_contributor: Optional[User], search:
)
)
)
return query


def search_score_sets(db: Session, owner_or_contributor: Optional[User], search: ScoreSetsSearch):
save_to_logging_context({"score_set_search_criteria": search.model_dump()})

query = db.query(ScoreSet)
query = build_search_score_sets_query_filter(db, query, owner_or_contributor, search)

score_sets: list[ScoreSet] = (
query.join(ScoreSet.experiment)
Expand Down Expand Up @@ -257,15 +270,102 @@ def search_score_sets(db: Session, owner_or_contributor: Optional[User], search:
),
)
.order_by(Experiment.title)
.offset(search.offset if search.offset is not None else None)
.limit(search.limit + 1 if search.limit is not None else None)
.all()
)
if not score_sets:
score_sets = []

save_to_logging_context({"matching_resources": len(score_sets)})
offset = search.offset if search.offset is not None else 0
num_score_sets = offset + len(score_sets)
if search.limit is not None and num_score_sets > offset + search.limit:
# In the main query, we have allowed limit + 1 results. The extra record tells us whether we need to run a count
# query.
score_sets = score_sets[: search.limit]
count_query = db.query(ScoreSet)
build_search_score_sets_query_filter(db, count_query, owner_or_contributor, search)
num_score_sets = count_query.order_by(None).limit(None).count()

save_to_logging_context({"matching_resources": num_score_sets})
logger.debug(msg=f"Score set search yielded {len(score_sets)} matching resources.", extra=logging_context())

return score_sets # filter_visible_score_sets(score_sets)
return {"score_sets": score_sets, "num_score_sets": num_score_sets}


def score_set_search_filter_options_from_counter(counter: Counter):
    """Convert a Counter into a list of search filter option dicts.

    Each distinct counted value becomes ``{"value": <value>, "count": <count>}``,
    preserving the counter's insertion order.
    """
    options = []
    for entry, occurrences in counter.items():
        options.append({"value": entry, "count": occurrences})
    return options


def fetch_score_set_search_filter_options(db: Session, owner_or_contributor: Optional[User], search: ScoreSetsSearch):
    """Compute the available search filter options for a score set search.

    Runs the same filtered query as the score set search (without pagination) and
    tallies, across all matching score sets, the distinct target-gene and
    publication attributes that can be offered as search facets.

    :param db: An active database session.
    :param owner_or_contributor: If not None, restrict results to score sets owned
        or contributed to by this user.
    :param search: The score set search criteria.
    :return: A dict mapping facet names to lists of {"value": ..., "count": ...}
        filter options.
    """
    save_to_logging_context({"score_set_search_criteria": search.model_dump()})

    query = db.query(ScoreSet)
    query = build_search_score_sets_query_filter(db, query, owner_or_contributor, search)

    # Query.all() always returns a list, so no extra empty-result normalization is needed.
    score_sets: list[ScoreSet] = query.all()

    target_category_counter: Counter[str] = Counter()
    target_name_counter: Counter[str] = Counter()
    target_organism_name_counter: Counter[str] = Counter()
    target_accession_counter: Counter[str] = Counter()
    publication_author_name_counter: Counter[str] = Counter()
    publication_db_name_counter: Counter[str] = Counter()
    publication_journal_counter: Counter[str] = Counter()

    # Populate every counter in a single pass over the results. getattr with a
    # default is used defensively in case related records are absent.
    for score_set in score_sets:
        for target in getattr(score_set, "target_genes", []):
            category = getattr(target, "category", None)
            if category:
                target_category_counter[category] += 1

            name = getattr(target, "name", None)
            if name:
                target_name_counter[name] += 1

            target_sequence = getattr(target, "target_sequence", None)
            taxonomy = getattr(target_sequence, "taxonomy", None)
            organism_name = getattr(taxonomy, "organism_name", None)
            if organism_name:
                target_organism_name_counter[organism_name] += 1

            target_accession = getattr(target, "target_accession", None)
            accession = getattr(target_accession, "accession", None)
            if accession:
                target_accession_counter[accession] += 1

        for publication_association in getattr(score_set, "publication_identifier_associations", []):
            publication = getattr(publication_association, "publication", None)

            # Authors are stored as dicts; count each named author once per publication.
            for author in getattr(publication, "authors", []):
                author_name = author.get("name")
                if author_name:
                    publication_author_name_counter[author_name] += 1

            db_name = getattr(publication, "db_name", None)
            if db_name:
                publication_db_name_counter[db_name] += 1

            journal = getattr(publication, "publication_journal", None)
            if journal:
                publication_journal_counter[journal] += 1

    logger.debug(msg="Score set search filter options were fetched.", extra=logging_context())

    return {
        "target_gene_categories": score_set_search_filter_options_from_counter(target_category_counter),
        "target_gene_names": score_set_search_filter_options_from_counter(target_name_counter),
        "target_organism_names": score_set_search_filter_options_from_counter(target_organism_name_counter),
        "target_accessions": score_set_search_filter_options_from_counter(target_accession_counter),
        "publication_author_names": score_set_search_filter_options_from_counter(publication_author_name_counter),
        "publication_db_names": score_set_search_filter_options_from_counter(publication_db_name_counter),
        "publication_journals": score_set_search_filter_options_from_counter(publication_journal_counter),
    }


def fetch_superseding_score_set_in_search_result(
Expand Down
81 changes: 60 additions & 21 deletions src/mavedb/routers/score_sets.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,12 @@
from mavedb.lib.permissions import Action, assert_permission, has_permission
from mavedb.lib.score_sets import (
csv_data_to_df,
fetch_score_set_search_filter_options,
find_meta_analyses_for_experiment_sets,
get_score_set_variants_as_csv,
variants_to_csv_rows,
)
from mavedb.lib.score_sets import (
fetch_superseding_score_set_in_search_result,
search_score_sets as _search_score_sets,
refresh_variant_urns,
)
Expand All @@ -74,10 +74,13 @@
from mavedb.models.target_sequence import TargetSequence
from mavedb.models.variant import Variant
from mavedb.view_models import mapped_variant, score_set, clinical_control, score_range, gnomad_variant
from mavedb.view_models.search import ScoreSetsSearch
from mavedb.view_models.search import ScoreSetsSearch, ScoreSetsSearchFilterOptionsResponse, ScoreSetsSearchResponse

logger = logging.getLogger(__name__)

SCORE_SET_SEARCH_MAX_LIMIT = 100
SCORE_SET_SEARCH_MAX_PUBLICATION_IDENTIFIERS = 40


async def fetch_score_set_by_urn(
db, urn: str, user: Optional[UserData], owner_or_contributor: Optional[UserData], only_published: bool
Expand Down Expand Up @@ -134,26 +137,64 @@ async def fetch_score_set_by_urn(
)


@router.post("/score-sets/search", status_code=200, response_model=list[score_set.ShortScoreSet])
@router.post("/score-sets/search", status_code=200, response_model=ScoreSetsSearchResponse)
def search_score_sets(
    search: ScoreSetsSearch,
    db: Session = Depends(deps.get_db),
    user_data: Optional[UserData] = Depends(get_current_user),
) -> Any:
    """
    Search published score sets.

    :raises HTTPException: 422 when the search requests private score sets, exceeds
        the row limit without publication identifiers, or supplies too many
        publication identifiers.
    """
    # Disallow searches for unpublished score sets via this endpoint.
    if search.published is False:
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail="Cannot search for private score sets except in the context of the current user's data.",
        )
    search.published = True

    # Require a limit of at most SCORE_SET_SEARCH_MAX_LIMIT when the search query does not include publication
    # identifiers. We allow unlimited searches with publication identifiers, presuming that such a search will not
    # have excessive results.
    if search.publication_identifiers is None:
        if search.limit is None:
            search.limit = SCORE_SET_SEARCH_MAX_LIMIT
        elif search.limit > SCORE_SET_SEARCH_MAX_LIMIT:
            # NOTE: the previous `search.limit is None or ...` check here was unreachable dead code,
            # since a None limit is handled by the branch above.
            raise HTTPException(
                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                detail=f"Cannot search for more than {SCORE_SET_SEARCH_MAX_LIMIT} score sets at a time. Please use the offset and limit parameters to run a paginated search.",
            )
    # Also limit the search to at most SCORE_SET_SEARCH_MAX_PUBLICATION_IDENTIFIERS publication identifiers, to
    # prevent artificially constructed searches that return very large result sets.
    elif len(search.publication_identifiers) > SCORE_SET_SEARCH_MAX_PUBLICATION_IDENTIFIERS:
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail=f"Cannot search for score sets belonging to more than {SCORE_SET_SEARCH_MAX_PUBLICATION_IDENTIFIERS} publication identifiers at once.",
        )

    # Access the result dict by key rather than relying on dict value ordering
    # via `.values()` unpacking, which silently breaks if the helper's return
    # shape changes.
    search_result = _search_score_sets(db, None, search)
    score_sets = search_result["score_sets"]
    num_score_sets = search_result["num_score_sets"]

    if search.include_experiment_score_set_urns_and_count:
        enriched_score_sets = []
        for ss in score_sets:
            enriched_experiment = enrich_experiment_with_num_score_sets(ss.experiment, user_data)
            response_item = score_set.ScoreSet.model_validate(ss).copy(update={"experiment": enriched_experiment})
            enriched_score_sets.append(response_item)
        score_sets = enriched_score_sets

    return {"score_sets": score_sets, "num_score_sets": num_score_sets}
@router.post("/score-sets/search/filter-options", status_code=200, response_model=ScoreSetsSearchFilterOptionsResponse)
def get_filter_options_for_search(
    search: ScoreSetsSearch,
    db: Session = Depends(deps.get_db),
) -> Any:
    """
    Return the filter options (target gene and publication facets, with counts)
    available for the given score set search criteria.
    """
    return fetch_score_set_search_filter_options(db, None, search)


@router.get("/score-sets/mapped-genes", status_code=200, response_model=dict[str, list[str]])
Expand Down Expand Up @@ -190,26 +231,24 @@ def score_set_mapped_gene_mapping(
@router.post(
    "/me/score-sets/search",
    status_code=200,
    response_model=ScoreSetsSearchResponse,
)
def search_my_score_sets(
    search: ScoreSetsSearch,
    db: Session = Depends(deps.get_db),
    user_data: UserData = Depends(require_current_user),
) -> Any:
    """
    Search score sets created by the current user.
    """
    # Access the result dict by key rather than relying on dict value ordering
    # via `.values()` unpacking, which silently breaks if the helper's return
    # shape changes.
    search_result = _search_score_sets(db, user_data.user, search)
    score_sets = search_result["score_sets"]
    num_score_sets = search_result["num_score_sets"]

    # Unlike the public endpoint, results here are always enriched with
    # experiment score set URNs and counts.
    enriched_score_sets = []
    for ss in score_sets:
        enriched_experiment = enrich_experiment_with_num_score_sets(ss.experiment, user_data)
        response_item = score_set.ScoreSet.model_validate(ss).copy(update={"experiment": enriched_experiment})
        enriched_score_sets.append(response_item)

    return {"score_sets": enriched_score_sets, "num_score_sets": num_score_sets}


@router.get(
Expand Down
32 changes: 32 additions & 0 deletions src/mavedb/view_models/search.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import Optional

from mavedb.view_models.base.base import BaseModel
from mavedb.view_models.score_set import ShortScoreSet


class ExperimentsSearch(BaseModel):
Expand All @@ -27,6 +28,37 @@ class ScoreSetsSearch(BaseModel):
keywords: Optional[list[str]] = None
text: Optional[str] = None
include_experiment_score_set_urns_and_count: Optional[bool] = True
offset: Optional[int] = None
limit: Optional[int] = None


class ScoreSetsSearchResponse(BaseModel):
    """Paginated score set search results."""

    # The page of matching score sets (at most the requested limit).
    score_sets: list[ShortScoreSet]
    # The total number of matches, which may exceed len(score_sets) when results are paginated.
    num_score_sets: int

    class Config:
        from_attributes = True


class ScoreSetsSearchFilterOption(BaseModel):
    """A single selectable search filter value and its occurrence count."""

    # The facet value (e.g. a target gene name or publication journal).
    value: str
    # How many matching score sets carry this value.
    count: int

    class Config:
        from_attributes = True


class ScoreSetsSearchFilterOptionsResponse(BaseModel):
    """The filter options available for a score set search, one option list per facet."""

    target_gene_categories: list[ScoreSetsSearchFilterOption]
    target_gene_names: list[ScoreSetsSearchFilterOption]
    target_organism_names: list[ScoreSetsSearchFilterOption]
    target_accessions: list[ScoreSetsSearchFilterOption]
    publication_author_names: list[ScoreSetsSearchFilterOption]
    publication_db_names: list[ScoreSetsSearchFilterOption]
    publication_journals: list[ScoreSetsSearchFilterOption]

    class Config:
        from_attributes = True


class TextSearch(BaseModel):
Expand Down
Loading