
Commit 2f37d6b

Merge branch 'release-2025.5.0' into davereinhart/scoreset-column-metadata
2 parents b3a5a88 + a0cec30

File tree: 9 files changed (+511, −67 lines)

Dockerfile

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 # python-base
 # Set up shared environment variables
 ################################
-FROM --platform=amd64 python:3.11 AS python-base
+FROM python:3.11 AS python-base
 
 # Poetry
 # https://python-poetry.org/docs/configuration/#using-environment-variables

src/mavedb/db/view.py

Lines changed: 123 additions & 5 deletions
@@ -6,8 +6,8 @@
 
 import sqlalchemy as sa
 from sqlalchemy.ext import compiler
-from sqlalchemy.schema import DDLElement, MetaData
 from sqlalchemy.orm import Session
+from sqlalchemy.schema import DDLElement, MetaData
 
 from mavedb.db.base import Base
 
@@ -32,7 +32,53 @@ class MaterializedView(Base):
 
     @classmethod
     def refresh(cls, connection, concurrently=True):
-        """Refresh this materialized view."""
+        """
+        Refresh the underlying materialized view for this ORM-mapped class.
+
+        This class method delegates to `refresh_mat_view` to issue a database
+        REFRESH MATERIALIZED VIEW (optionally CONCURRENTLY) statement for the
+        materialized view backing the current model (`cls.__table__.fullname`).
+
+        Parameters
+        ----------
+        connection : sqlalchemy.engine.Connection | sqlalchemy.orm.Session
+            An active SQLAlchemy connection or session bound to the target database.
+        concurrently : bool, default True
+            If True, performs a concurrent refresh (REFRESH MATERIALIZED VIEW
+            CONCURRENTLY), allowing reads during the refresh when the database
+            backend supports it. If False, performs a blocking refresh.
+
+        Returns
+        -------
+        None
+
+        Raises
+        ------
+        sqlalchemy.exc.DBAPIError
+            If the database reports an error while refreshing the materialized view.
+        sqlalchemy.exc.OperationalError
+            For operational issues such as locks or insufficient privileges.
+        ValueError
+            If the connection provided is not a valid SQLAlchemy connection/session.
+
+        Notes
+        -----
+        - A concurrent refresh typically requires the materialized view to have a
+          unique index covering all rows; otherwise the database may reject the
+          operation.
+        - This operation does not return a value; it is executed for its side effect.
+        - Ensure the connection/session is in a clean transactional state if you
+          rely on consistent snapshot semantics.
+        - This function commits no changes; it is the caller's responsibility to
+          commit the session if needed.
+
+        Examples
+        --------
+        # Refresh with concurrent mode (default)
+        MyMaterializedView.refresh(connection)
+
+        # Perform a blocking refresh
+        MyMaterializedView.refresh(connection, concurrently=False)
+        """
         refresh_mat_view(connection, cls.__table__.fullname, concurrently)
 
 
@@ -123,19 +169,91 @@ class MyView(Base):
 
 def refresh_mat_view(session: Session, name: str, concurrently=True):
     """
-    Refreshes a single materialized view, given by `name`.
+    Refresh a PostgreSQL materialized view within the current SQLAlchemy session.
+
+    This helper issues a REFRESH MATERIALIZED VIEW statement for the specified
+    materialized view. It first explicitly flushes the session because
+    session.execute() bypasses SQLAlchemy's autoflush mechanism; without the flush,
+    pending changes (e.g., newly inserted/updated rows that the view depends on)
+    might not be reflected in the refreshed view.
+
+    Parameters
+    ----------
+    session : sqlalchemy.orm.Session
+        An active SQLAlchemy session bound to a PostgreSQL database.
+    name : str
+        The exact name (optionally schema-qualified) of the materialized view to
+        refresh.
+    concurrently : bool, default True
+        If True, uses REFRESH MATERIALIZED VIEW CONCURRENTLY, allowing reads during
+        the refresh and requiring a unique index on the materialized view. If False,
+        performs a blocking refresh.
+
+    Raises
+    ------
+    sqlalchemy.exc.SQLAlchemyError
+        Propagates any database errors encountered during execution (e.g.,
+        insufficient privileges, a missing view, or lack of the unique index
+        required for CONCURRENTLY).
+
+    Notes
+    -----
+    - Using CONCURRENTLY requires the materialized view to have at least one
+      unique index; otherwise PostgreSQL will raise an error.
+    - The operation does not return a value; it is executed for its side effect.
+    - Ensure the session is in a clean transactional state if you rely on
+      consistent snapshot semantics.
+    - This function commits no changes; it is the caller's responsibility to
+      commit the session if needed.
+
+    Examples
+    --------
+    refresh_mat_view(session, "public.my_materialized_view")
+    refresh_mat_view(session, "reports.daily_stats", concurrently=False)
     """
     # since session.execute() bypasses autoflush, must manually flush in order
     # to include newly-created/modified objects in the refresh
     session.flush()
+
     _con = "CONCURRENTLY " if concurrently else ""
     session.execute(sa.text("REFRESH MATERIALIZED VIEW " + _con + name))
 
 
 def refresh_all_mat_views(session: Session, concurrently=True):
     """
-    Refreshes all materialized views. Views are refreshed in non-deterministic order,
-    so view definitions can't depend on each other.
+    Refresh all PostgreSQL materialized views visible to the given SQLAlchemy session.
+
+    The function inspects the current database connection for registered
+    materialized views and issues a REFRESH MATERIALIZED VIEW statement for each
+    one using the helper function `refresh_mat_view`.
+
+    Parameters
+    ----------
+    session : sqlalchemy.orm.Session
+        An active SQLAlchemy session bound to a PostgreSQL connection.
+    concurrently : bool, default True
+        If True, each materialized view is refreshed using the CONCURRENTLY option
+        (only supported when the view has a unique index that satisfies PostgreSQL's
+        requirements). If False, a standard blocking refresh is performed.
+
+    Behavior
+    --------
+    - If inspection of the connection fails or returns no inspector, the function
+      exits without performing any work.
+    - Each materialized view name returned by the inspector is passed to
+      `refresh_mat_view(session, name, concurrently)`.
+    - Views are refreshed in non-deterministic order, so view definitions must not
+      depend on one another.
+
+    Notes
+    -----
+    - Using CONCURRENTLY allows reads during refresh at the cost of requiring an
+      appropriate unique index and potentially being slower.
+    - Exceptions raised during an individual refresh propagate to the caller;
+      views not yet refreshed at that point are left stale.
+    - Ensure the session is in a clean transactional state if you rely on
+      consistent snapshot semantics.
+    - This function commits no changes; it is the caller's responsibility to
+      commit the session if needed.
     """
     inspector = sa.inspect(session.connection())
 
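For orientation, here is a minimal usage sketch of the refresh helpers above. The engine URL, the sessionmaker, and the view name "mapped_variants_mv" are illustrative assumptions, not part of this commit:

    # Hypothetical driver for the helpers in src/mavedb/db/view.py; the DSN and
    # the view name below are placeholders, not values from MaveDB.
    from sqlalchemy import create_engine
    from sqlalchemy.orm import sessionmaker

    from mavedb.db.view import refresh_all_mat_views, refresh_mat_view

    engine = create_engine("postgresql+psycopg2://user:pass@localhost/mavedb")
    Session = sessionmaker(bind=engine)

    with Session() as session:
        # Blocking refresh of a single view (no unique index required).
        refresh_mat_view(session, "mapped_variants_mv", concurrently=False)

        # Concurrent refresh of every materialized view the inspector reports.
        refresh_all_mat_views(session)

        # The helpers do not commit; the caller owns the transaction.
        session.commit()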

src/mavedb/lib/score_sets.py

Lines changed: 108 additions & 8 deletions
@@ -1,16 +1,17 @@
+from collections import Counter
 import csv
 import io
 import logging
-import re
 from operator import attrgetter
+import re
 from typing import Any, BinaryIO, Iterable, Optional, TYPE_CHECKING, Sequence, Literal
 
 from mavedb.models.mapped_variant import MappedVariant
 import numpy as np
 import pandas as pd
 from pandas.testing import assert_index_equal
 from sqlalchemy import Integer, and_, cast, func, or_, select
-from sqlalchemy.orm import Session, aliased, contains_eager, joinedload, selectinload
+from sqlalchemy.orm import Session, aliased, contains_eager, joinedload, Query, selectinload
 
 from mavedb.lib.exceptions import ValidationError
 from mavedb.lib.logging.context import logging_context, save_to_logging_context
@@ -71,11 +72,15 @@ def options(cls) -> list[str]:
         return [cls.NUCLEOTIDE, cls.TRANSCRIPT, cls.PROTEIN]
 
 
-def search_score_sets(db: Session, owner_or_contributor: Optional[User], search: ScoreSetsSearch) -> list[ScoreSet]:
-    save_to_logging_context({"score_set_search_criteria": search.model_dump()})
+def build_search_score_sets_query_filter(
+    db: Session, query: Query[ScoreSet], owner_or_contributor: Optional[User], search: ScoreSetsSearch
+):
+    superseding_score_set = aliased(ScoreSet)
 
-    query = db.query(ScoreSet)  # \
-    # .filter(ScoreSet.private.is_(False))
+    # Limit to unsuperseded score sets.
+    # TODO#??? Prevent unpublished superseding score sets from hiding their published precursors in search results.
+    query = query.join(superseding_score_set, ScoreSet.superseding_score_set, isouter=True)
+    query = query.filter(superseding_score_set.id.is_(None))
 
     if owner_or_contributor is not None:
         query = query.filter(
@@ -213,6 +218,14 @@ def search_score_sets(db: Session, owner_or_contributor: Optional[User], search:
             )
         )
     )
+    return query
+
+
+def search_score_sets(db: Session, owner_or_contributor: Optional[User], search: ScoreSetsSearch):
+    save_to_logging_context({"score_set_search_criteria": search.model_dump()})
+
+    query = db.query(ScoreSet)
+    query = build_search_score_sets_query_filter(db, query, owner_or_contributor, search)
 
     score_sets: list[ScoreSet] = (
         query.join(ScoreSet.experiment)
@@ -257,15 +270,102 @@ def search_score_sets(db: Session, owner_or_contributor: Optional[User], search:
            ),
        )
        .order_by(Experiment.title)
+        .offset(search.offset if search.offset is not None else None)
+        .limit(search.limit + 1 if search.limit is not None else None)
        .all()
    )
    if not score_sets:
        score_sets = []
 
-    save_to_logging_context({"matching_resources": len(score_sets)})
+    offset = search.offset if search.offset is not None else 0
+    num_score_sets = offset + len(score_sets)
+    if search.limit is not None and num_score_sets > offset + search.limit:
+        # In the main query, we allowed limit + 1 results. The extra record tells us
+        # that a count query is needed to determine the true total.
+        score_sets = score_sets[: search.limit]
+        count_query = db.query(ScoreSet)
+        count_query = build_search_score_sets_query_filter(db, count_query, owner_or_contributor, search)
+        num_score_sets = count_query.order_by(None).limit(None).count()
+
+    save_to_logging_context({"matching_resources": num_score_sets})
     logger.debug(msg=f"Score set search yielded {len(score_sets)} matching resources.", extra=logging_context())
 
-    return score_sets  # filter_visible_score_sets(score_sets)
+    return {"score_sets": score_sets, "num_score_sets": num_score_sets}
+
+
+def score_set_search_filter_options_from_counter(counter: Counter):
+    return [{"value": value, "count": count} for value, count in counter.items()]
+
+
+def fetch_score_set_search_filter_options(db: Session, owner_or_contributor: Optional[User], search: ScoreSetsSearch):
+    save_to_logging_context({"score_set_search_criteria": search.model_dump()})
+
+    query = db.query(ScoreSet)
+    query = build_search_score_sets_query_filter(db, query, owner_or_contributor, search)
+
+    score_sets: list[ScoreSet] = query.all()
+    if not score_sets:
+        score_sets = []
+
+    target_category_counter: Counter[str] = Counter()
+    target_name_counter: Counter[str] = Counter()
+    target_organism_name_counter: Counter[str] = Counter()
+    target_accession_counter: Counter[str] = Counter()
+    for score_set in score_sets:
+        for target in getattr(score_set, "target_genes", []):
+            category = getattr(target, "category", None)
+            if category:
+                target_category_counter[category] += 1
+
+            name = getattr(target, "name", None)
+            if name:
+                target_name_counter[name] += 1
+
+            target_sequence = getattr(target, "target_sequence", None)
+            taxonomy = getattr(target_sequence, "taxonomy", None)
+            organism_name = getattr(taxonomy, "organism_name", None)
+            if organism_name:
+                target_organism_name_counter[organism_name] += 1
+
+            target_accession = getattr(target, "target_accession", None)
+            accession = getattr(target_accession, "accession", None)
+            if accession:
+                target_accession_counter[accession] += 1
+
+    publication_author_name_counter: Counter[str] = Counter()
+    publication_db_name_counter: Counter[str] = Counter()
+    publication_journal_counter: Counter[str] = Counter()
+    for score_set in score_sets:
+        for publication_association in getattr(score_set, "publication_identifier_associations", []):
+            publication = getattr(publication_association, "publication", None)
+
+            authors = getattr(publication, "authors", [])
+            for author in authors:
+                name = author.get("name")
+                if name:
+                    publication_author_name_counter[name] += 1
+
+            db_name = getattr(publication, "db_name", None)
+            if db_name:
+                publication_db_name_counter[db_name] += 1
+
+            journal = getattr(publication, "publication_journal", None)
+            if journal:
+                publication_journal_counter[journal] += 1
+
+    logger.debug(msg="Score set search filter options were fetched.", extra=logging_context())
+
+    return {
+        "target_gene_categories": score_set_search_filter_options_from_counter(target_category_counter),
+        "target_gene_names": score_set_search_filter_options_from_counter(target_name_counter),
+        "target_organism_names": score_set_search_filter_options_from_counter(target_organism_name_counter),
+        "target_accessions": score_set_search_filter_options_from_counter(target_accession_counter),
+        "publication_author_names": score_set_search_filter_options_from_counter(publication_author_name_counter),
+        "publication_db_names": score_set_search_filter_options_from_counter(publication_db_name_counter),
+        "publication_journals": score_set_search_filter_options_from_counter(publication_journal_counter),
+    }
 
 
 def fetch_superseding_score_set_in_search_result(
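Because search_score_sets now returns a dict instead of a bare list, callers page through results roughly as below. This is a sketch only: the ScoreSetsSearch import path and its text/offset/limit field names are inferred from context, not confirmed by this diff:

    # Hypothetical caller; `db` is an open SQLAlchemy Session.
    from mavedb.lib.score_sets import search_score_sets
    from mavedb.view_models.search import ScoreSetsSearch  # import path assumed

    search = ScoreSetsSearch(text="BRCA1", offset=0, limit=50)  # field names assumed
    result = search_score_sets(db, owner_or_contributor=None, search=search)

    page = result["score_sets"]        # at most `limit` records
    total = result["num_score_sets"]   # exact total, thanks to the limit + 1 probe
    has_more = (search.offset or 0) + len(page) < total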

src/mavedb/routers/alphafold.py

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
+from fastapi import APIRouter, HTTPException
+import httpx
+import xml.etree.ElementTree as ET
+import re
+
+from mavedb.lib.logging.logged_route import LoggedRoute
+
+ALPHAFOLD_BASE = "https://alphafold.ebi.ac.uk/files/"
+
+router = APIRouter(
+    prefix="/api/v1",
+    tags=["alphafold files"],
+    responses={404: {"description": "Not found"}},
+    route_class=LoggedRoute,
+)
+
+@router.get("/alphafold-files/version")
+async def proxy_alphafold_index():
+    """
+    Proxy the AlphaFold files index (an XML document) and return the model
+    version parsed from its NextMarker entry.
+    """
+    async with httpx.AsyncClient(follow_redirects=True, timeout=30) as client:
+        resp = await client.get(ALPHAFOLD_BASE, headers={"Accept": "application/xml"})
+        if resp.status_code != 200:
+            raise HTTPException(status_code=resp.status_code, detail="Upstream error fetching AlphaFold files index")
+
+        # Parse the XML response.
+        try:
+            root = ET.fromstring(resp.content)
+
+            # Detect the default namespace, if any.
+            if root.tag.startswith("{"):
+                ns_uri = root.tag.split("}", 1)[0][1:]
+                ns = {"x": ns_uri}
+                next_marker_tag = "x:NextMarker"
+            else:
+                ns = {}
+                next_marker_tag = "NextMarker"
+
+            next_marker_el = root.find(next_marker_tag, ns)
+            next_marker = next_marker_el.text if next_marker_el is not None else None
+            if next_marker is None:
+                raise HTTPException(status_code=502, detail="No NextMarker element in AlphaFold files index")
+
+            match = re.search(r"model_(v\d+)\.pdb$", next_marker, re.IGNORECASE)
+            if not match:
+                raise HTTPException(status_code=500, detail="Malformed AlphaFold PDB ID in XML")
+            version = match.group(1)
+            return {"version": version.lower()}
+
+        except ET.ParseError as e:
+            raise HTTPException(status_code=502, detail=f"Failed to parse upstream XML: {e}")
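A quick smoke test for the new endpoint, sketched with FastAPI's TestClient. Mounting the router on a bare app is an assumption about the wider application, and the request goes to the live AlphaFold host:

    # Illustrative smoke test; performs a real request to alphafold.ebi.ac.uk.
    from fastapi import FastAPI
    from fastapi.testclient import TestClient

    from mavedb.routers.alphafold import router

    app = FastAPI()
    app.include_router(router)

    client = TestClient(app)
    response = client.get("/api/v1/alphafold-files/version")
    assert response.status_code == 200
    print(response.json())  # e.g. {"version": "v4"} when the index's NextMarker ends in model_v4.pdb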
