| 1 | +from collections import Counter |
1 | 2 | import csv |
2 | 3 | import io |
3 | 4 | import logging |
4 | | -import re |
5 | 5 | from operator import attrgetter |
| 6 | +import re |
6 | 7 | from typing import Any, BinaryIO, Iterable, Optional, TYPE_CHECKING, Sequence, Literal |
7 | 8 |
8 | 9 | from mavedb.models.mapped_variant import MappedVariant |
9 | 10 | import numpy as np |
10 | 11 | import pandas as pd |
11 | 12 | from pandas.testing import assert_index_equal |
12 | 13 | from sqlalchemy import Integer, and_, cast, func, or_, select |
13 | | -from sqlalchemy.orm import Session, aliased, contains_eager, joinedload, selectinload |
| 14 | +from sqlalchemy.orm import Session, aliased, contains_eager, joinedload, Query, selectinload |
14 | 15 |
15 | 16 | from mavedb.lib.exceptions import ValidationError |
16 | 17 | from mavedb.lib.logging.context import logging_context, save_to_logging_context |
@@ -71,11 +72,15 @@ def options(cls) -> list[str]: |
71 | 72 | return [cls.NUCLEOTIDE, cls.TRANSCRIPT, cls.PROTEIN] |
72 | 73 |
73 | 74 |
74 | | -def search_score_sets(db: Session, owner_or_contributor: Optional[User], search: ScoreSetsSearch) -> list[ScoreSet]: |
75 | | - save_to_logging_context({"score_set_search_criteria": search.model_dump()}) |
| 75 | +def build_search_score_sets_query_filter( |
| 76 | + db: Session, query: Query[ScoreSet], owner_or_contributor: Optional[User], search: ScoreSetsSearch |
| 77 | +) -> Query[ScoreSet]: |
| 78 | + superseding_score_set = aliased(ScoreSet) |
76 | 79 |
77 | | - query = db.query(ScoreSet) # \ |
78 | | - # .filter(ScoreSet.private.is_(False)) |
| 80 | + # Limit to unsuperseded score sets. |
| 81 | + # TODO#??? Prevent unpublished superseding score sets from hiding their published precursors in search results. |
| 82 | + query = query.join(superseding_score_set, ScoreSet.superseding_score_set, isouter=True) |
| 83 | + query = query.filter(superseding_score_set.id.is_(None)) |
79 | 84 |
80 | 85 | if owner_or_contributor is not None: |
81 | 86 | query = query.filter( |
@@ -213,6 +218,14 @@ def search_score_sets(db: Session, owner_or_contributor: Optional[User], search: |
213 | 218 | ) |
214 | 219 | ) |
215 | 220 | ) |
| 221 | + return query |
| 222 | + |
| 223 | + |
| 224 | +def search_score_sets(db: Session, owner_or_contributor: Optional[User], search: ScoreSetsSearch) -> dict[str, Any]: |
| 225 | + save_to_logging_context({"score_set_search_criteria": search.model_dump()}) |
| 226 | + |
| 227 | + query = db.query(ScoreSet) |
| 228 | + query = build_search_score_sets_query_filter(db, query, owner_or_contributor, search) |
216 | 229 |
217 | 230 | score_sets: list[ScoreSet] = ( |
218 | 231 | query.join(ScoreSet.experiment) |
@@ -257,15 +270,102 @@ def search_score_sets(db: Session, owner_or_contributor: Optional[User], search: |
257 | 270 | ), |
258 | 271 | ) |
259 | 272 | .order_by(Experiment.title) |
| 273 | + .offset(search.offset if search.offset is not None else None) |
| 274 | + .limit(search.limit + 1 if search.limit is not None else None) |
260 | 275 | .all() |
261 | 276 | ) |
262 | 277 | if not score_sets: |
263 | 278 | score_sets = [] |
264 | 279 |
265 | | - save_to_logging_context({"matching_resources": len(score_sets)}) |
| 280 | + offset = search.offset if search.offset is not None else 0 |
| 281 | + num_score_sets = offset + len(score_sets) |
| 282 | + if search.limit is not None and num_score_sets > offset + search.limit: |
| 283 | + # The main query allowed limit + 1 results; the extra record tells us whether we need a separate |
| 284 | + # count query to get the true total. |
| 285 | + score_sets = score_sets[: search.limit] |
| 286 | + count_query = db.query(ScoreSet) |
| 287 | + count_query = build_search_score_sets_query_filter(db, count_query, owner_or_contributor, search) |
| 288 | + num_score_sets = count_query.order_by(None).limit(None).count() |
| 289 | + |
| 290 | + save_to_logging_context({"matching_resources": num_score_sets}) |
266 | 291 | logger.debug(msg=f"Score set search yielded {len(score_sets)} matching resources.", extra=logging_context()) |
267 | 292 |
268 | | - return score_sets # filter_visible_score_sets(score_sets) |
| 293 | + return {"score_sets": score_sets, "num_score_sets": num_score_sets} |
| 294 | + |
| 295 | + |
| 296 | +def score_set_search_filter_options_from_counter(counter: Counter): |
| 297 | + return [{"value": value, "count": count} for value, count in counter.items()] |
| 298 | + |
| 299 | + |
| 300 | +def fetch_score_set_search_filter_options(db: Session, owner_or_contributor: Optional[User], search: ScoreSetsSearch): |
| 301 | + save_to_logging_context({"score_set_search_criteria": search.model_dump()}) |
| 302 | + |
| 303 | + query = db.query(ScoreSet) |
| 304 | + query = build_search_score_sets_query_filter(db, query, owner_or_contributor, search) |
| 305 | + |
| 306 | + score_sets: list[ScoreSet] = query.all() |
| 307 | + if not score_sets: |
| 308 | + score_sets = [] |
| 309 | + |
| 310 | + target_category_counter: Counter[str] = Counter() |
| 311 | + target_name_counter: Counter[str] = Counter() |
| 312 | + target_organism_name_counter: Counter[str] = Counter() |
| 313 | + target_accession_counter: Counter[str] = Counter() |
| 314 | + for score_set in score_sets: |
| 315 | + for target in getattr(score_set, "target_genes", []): |
| 316 | + category = getattr(target, "category", None) |
| 317 | + if category: |
| 318 | + target_category_counter[category] += 1 |
| 319 | + |
| 320 | + name = getattr(target, "name", None) |
| 321 | + if name: |
| 322 | + target_name_counter[name] += 1 |
| 323 | + |
| 324 | + target_sequence = getattr(target, "target_sequence", None) |
| 325 | + taxonomy = getattr(target_sequence, "taxonomy", None) |
| 326 | + organism_name = getattr(taxonomy, "organism_name", None) |
| 327 | + |
| 328 | + if organism_name: |
| 329 | + target_organism_name_counter[organism_name] += 1 |
| 330 | + |
| 331 | + target_accession = getattr(target, "target_accession", None) |
| 332 | + accession = getattr(target_accession, "accession", None) |
| 333 | + |
| 334 | + if accession: |
| 335 | + target_accession_counter[accession] += 1 |
| 336 | + |
| 337 | + publication_author_name_counter: Counter[str] = Counter() |
| 338 | + publication_db_name_counter: Counter[str] = Counter() |
| 339 | + publication_journal_counter: Counter[str] = Counter() |
| 340 | + for score_set in score_sets: |
| 341 | + for publication_association in getattr(score_set, "publication_identifier_associations", []): |
| 342 | + publication = getattr(publication_association, "publication", None) |
| 343 | + |
| 344 | + authors = getattr(publication, "authors", None) or [] |
| 345 | + for author in authors: |
| 346 | + name = author.get("name") |
| 347 | + if name: |
| 348 | + publication_author_name_counter[name] += 1 |
| 349 | + |
| 350 | + db_name = getattr(publication, "db_name", None) |
| 351 | + if db_name: |
| 352 | + publication_db_name_counter[db_name] += 1 |
| 353 | + |
| 354 | + journal = getattr(publication, "publication_journal", None) |
| 355 | + if journal: |
| 356 | + publication_journal_counter[journal] += 1 |
| 357 | + |
| 358 | + logger.debug(msg="Score set search filter options were fetched.", extra=logging_context()) |
| 359 | + |
| 360 | + return { |
| 361 | + "target_gene_categories": score_set_search_filter_options_from_counter(target_category_counter), |
| 362 | + "target_gene_names": score_set_search_filter_options_from_counter(target_name_counter), |
| 363 | + "target_organism_names": score_set_search_filter_options_from_counter(target_organism_name_counter), |
| 364 | + "target_accessions": score_set_search_filter_options_from_counter(target_accession_counter), |
| 365 | + "publication_author_names": score_set_search_filter_options_from_counter(publication_author_name_counter), |
| 366 | + "publication_db_names": score_set_search_filter_options_from_counter(publication_db_name_counter), |
| 367 | + "publication_journals": score_set_search_filter_options_from_counter(publication_journal_counter), |
| 368 | + } |
269 | 369 |
270 | 370 |
271 | 371 | def fetch_superseding_score_set_in_search_result( |
|
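For orientation, here is a minimal sketch (not part of this commit) of how a caller might consume the new paginated search result and the filter-option payload. The helper name `paginated_score_set_search` and the `mavedb.lib.score_sets` / `mavedb.view_models.search` import paths are assumptions for illustration; only the `{"score_sets": ..., "num_score_sets": ...}` and `{"value": ..., "count": ...}` shapes come from this diff.

```python
from typing import Any

from sqlalchemy.orm import Session

# Assumed import paths; adjust to wherever these live in the MaveDB code base.
from mavedb.lib.score_sets import fetch_score_set_search_filter_options, search_score_sets
from mavedb.view_models.search import ScoreSetsSearch


def paginated_score_set_search(db: Session, search: ScoreSetsSearch) -> dict[str, Any]:
    """Hypothetical caller combining the paginated results with the facet counts."""
    # search_score_sets now returns the current page plus the total match count,
    # so no separate count round-trip is needed on the common path.
    result = search_score_sets(db, None, search)

    # Facet-style options, e.g. {"target_gene_names": [{"value": "...", "count": 3}, ...]}.
    filter_options = fetch_score_set_search_filter_options(db, None, search)

    return {
        "score_sets": result["score_sets"],   # at most `search.limit` records for this page
        "total": result["num_score_sets"],    # total matches across all pages
        "filter_options": filter_options,
    }
```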