Skip to content

Commit fe8af27

Browse files
committed
Merge branch 'release-2025.0.1' into release-2025.1.0
2 parents 778c517 + c95060d commit fe8af27

File tree

8 files changed

+702
-34
lines changed

8 files changed

+702
-34
lines changed

src/mavedb/lib/score_sets.py

Lines changed: 126 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
import io
33
import logging
44
import re
5-
from typing import Any, BinaryIO, Iterable, Optional, Sequence
5+
from operator import attrgetter
6+
from typing import Any, BinaryIO, Iterable, Optional, TYPE_CHECKING, Sequence
67

78
import numpy as np
89
import pandas as pd
@@ -21,6 +22,7 @@
2122
)
2223
from mavedb.lib.mave.utils import is_csv_null
2324
from mavedb.lib.validation.constants.general import null_values_list
25+
from mavedb.lib.validation.utilities import is_null as validate_is_null
2426
from mavedb.models.contributor import Contributor
2527
from mavedb.models.controlled_keyword import ControlledKeyword
2628
from mavedb.models.doi_identifier import DoiIdentifier
@@ -47,6 +49,10 @@
4749
from mavedb.models.variant import Variant
4850
from mavedb.view_models.search import ScoreSetsSearch
4951

52+
if TYPE_CHECKING:
53+
from mavedb.lib.authentication import UserData
54+
from mavedb.lib.permissions import Action
55+
5056
VariantData = dict[str, Optional[dict[str, dict]]]
5157

5258
logger = logging.getLogger(__name__)
@@ -68,9 +74,6 @@ def search_score_sets(db: Session, owner_or_contributor: Optional[User], search:
6874
query = db.query(ScoreSet) # \
6975
# .filter(ScoreSet.private.is_(False))
7076

71-
# filter out the score sets that are replaced by other score sets
72-
query = query.filter(~ScoreSet.superseding_score_set.has())
73-
7477
if owner_or_contributor is not None:
7578
query = query.filter(
7679
or_(
@@ -262,6 +265,41 @@ def search_score_sets(db: Session, owner_or_contributor: Optional[User], search:
262265
return score_sets # filter_visible_score_sets(score_sets)
263266

264267

268+
def fetch_superseding_score_set_in_search_result(
    score_sets: list[ScoreSet],
    requesting_user: Optional["UserData"],
    search: ScoreSetsSearch,
) -> list[ScoreSet]:
    """
    Collapse superseded score sets in a search result onto their newest visible version.

    Each score set is replaced by the tail of its superseding chain that the requesting
    user is permitted to read (published-only tails when ``search.published`` is set).
    Chains with no readable tail are removed, duplicates are collapsed, and the result
    is returned sorted by URN for a deterministic ordering.

    :param score_sets: Raw search results, possibly containing superseded score sets.
    :param requesting_user: User whose READ permission gates chain traversal; may be None.
    :param search: The search criteria; only ``search.published`` is consulted here.
    :return: De-duplicated, URN-sorted list of the visible chain tails.
    """
    # Imported locally to avoid a circular import with mavedb.lib.permissions.
    from mavedb.lib.permissions import Action

    if search.published:
        tails = [
            find_publish_or_private_superseded_score_set_tail(
                score_set, Action.READ, requesting_user, search.published
            )
            for score_set in score_sets
        ]
    else:
        tails = [
            find_superseded_score_set_tail(score_set, Action.READ, requesting_user)
            for score_set in score_sets
        ]

    # Drop chains with no visible tail (None), de-duplicate shared tails, and sort.
    # sorted() of an empty set is already [], so no special-casing is needed.
    visible_tails = {score_set for score_set in tails if score_set is not None}
    return sorted(visible_tails, key=attrgetter("urn"))
301+
302+
265303
def find_meta_analyses_for_experiment_sets(db: Session, urns: list[str]) -> list[ScoreSet]:
266304
"""
267305
Find all score sets that are meta-analyses for score sets from a specified collection of experiment sets.
@@ -306,11 +344,66 @@ def find_meta_analyses_for_experiment_sets(db: Session, urns: list[str]) -> list
306344
)
307345

308346

347+
def find_superseded_score_set_tail(
    score_set: ScoreSet,
    action: Optional["Action"] = None,
    user_data: Optional["UserData"] = None) -> Optional[ScoreSet]:
    """
    Walk a score set's superseding chain and return the newest version visible to the user.

    Starting from ``score_set``, follow ``superseding_score_set`` links forward. When
    ``action`` is provided, stop at the last score set in the chain for which the user
    holds that permission. If even the starting score set is not permitted, walk
    backwards through ``superseded_score_set`` links looking for a permitted ancestor;
    return None when the entire chain is invisible to the user.

    :param score_set: Chain entry point (any link in the chain).
    :param action: Optional permission to check at each step (e.g. Action.READ); when
        None, no permission filtering is applied and the absolute tail is returned.
    :param user_data: The requesting user's data, passed to the permission check.
    :return: The visible tail of the chain, or None if no link is visible.
    """
    # Imported locally to avoid a circular import with mavedb.lib.permissions.
    from mavedb.lib.permissions import has_permission
    while score_set.superseding_score_set is not None:
        next_score_set_in_chain = score_set.superseding_score_set

        # If we were given a permission to check and the next score set in the chain does not have that permission,
        # pretend like we have reached the end of the chain. Otherwise, continue to the next score set.
        if action is not None and not has_permission(user_data, next_score_set_in_chain, action).permitted:
            return score_set

        score_set = next_score_set_in_chain

    # Handle the case where the starting score set itself is not visible (the forward
    # loop above only ever advances onto permitted score sets, so reaching here with a
    # non-permitted score_set means the caller's entry point was not permitted).
    # Walk back down the superseded chain to find the newest permitted ancestor.
    if action is not None and not has_permission(user_data, score_set, action).permitted:
        while score_set.superseded_score_set is not None:
            next_score_set_in_chain = score_set.superseded_score_set
            if has_permission(user_data, next_score_set_in_chain, action).permitted:
                return next_score_set_in_chain
            else:
                score_set = next_score_set_in_chain
        # No link in the chain is visible to this user.
        return None

    return score_set
374+
375+
376+
def find_publish_or_private_superseded_score_set_tail(
    score_set: ScoreSet,
    action: Optional["Action"] = None,
    user_data: Optional["UserData"] = None,
    publish: bool = True) -> Optional[ScoreSet]:
    """
    Return the chain tail appropriate for a published-only or private-only search.

    When ``publish`` is True, follow ``superseding_score_set`` links and return the last
    *published* score set: the walk stops just before a link whose ``published_date`` is
    None. When ``publish`` is False, the score set is only returned if it is itself
    unpublished and visible to the user (an unpublished score set is not expected to be
    superseded, so no forward traversal is performed).

    :param score_set: Chain entry point.
    :param action: Optional permission to check (e.g. Action.READ).
    :param user_data: The requesting user's data, passed to the permission check.
    :param publish: True to search for the published tail, False for the private case.
    :return: The matching score set, or None when nothing qualifies.
    """
    # Imported locally to avoid a circular import with mavedb.lib.permissions.
    from mavedb.lib.permissions import has_permission
    if publish:
        while score_set.superseding_score_set is not None:
            next_score_set_in_chain = score_set.superseding_score_set
            # Stop on the last published link: permission is checked on the current
            # score set while published_date is checked on the next one.
            # NOTE(review): this asymmetry (permission on current, publication on next)
            # looks intentional but is worth confirming; the final fall-through return
            # below does not re-check permission on the absolute tail.
            if action is not None and has_permission(user_data, score_set, action).permitted \
                    and next_score_set_in_chain.published_date is None:
                return score_set
            score_set = next_score_set_in_chain
    else:
        # An unpublished score set should not be superseded: it should have no
        # superseding score set, though it may itself supersede a published one.
        if action is not None and score_set.published_date is None \
                and has_permission(user_data, score_set, action).permitted:
            return score_set
        else:
            return None
    return score_set
399+
400+
309401
def get_score_set_counts_as_csv(
310402
db: Session,
311403
score_set: ScoreSet,
312404
start: Optional[int] = None,
313405
limit: Optional[int] = None,
406+
drop_na_columns: Optional[bool] = None,
314407
) -> str:
315408
assert type(score_set.dataset_columns) is dict
316409
count_columns = [str(x) for x in list(score_set.dataset_columns.get("count_columns", []))]
@@ -329,6 +422,9 @@ def get_score_set_counts_as_csv(
329422
variants = db.scalars(variants_query).all()
330423

331424
rows_data = variants_to_csv_rows(variants, columns=columns, dtype=type_column)
425+
if drop_na_columns:
426+
rows_data, columns = drop_na_columns_from_csv_file_rows(rows_data, columns)
427+
332428
stream = io.StringIO()
333429
writer = csv.DictWriter(stream, fieldnames=columns, quoting=csv.QUOTE_MINIMAL)
334430
writer.writeheader()
@@ -341,6 +437,7 @@ def get_score_set_scores_as_csv(
341437
score_set: ScoreSet,
342438
start: Optional[int] = None,
343439
limit: Optional[int] = None,
440+
drop_na_columns: Optional[bool] = None,
344441
) -> str:
345442
assert type(score_set.dataset_columns) is dict
346443
score_columns = [str(x) for x in list(score_set.dataset_columns.get("score_columns", []))]
@@ -359,13 +456,38 @@ def get_score_set_scores_as_csv(
359456
variants = db.scalars(variants_query).all()
360457

361458
rows_data = variants_to_csv_rows(variants, columns=columns, dtype=type_column)
459+
if drop_na_columns:
460+
rows_data, columns = drop_na_columns_from_csv_file_rows(rows_data, columns)
461+
362462
stream = io.StringIO()
363463
writer = csv.DictWriter(stream, fieldnames=columns, quoting=csv.QUOTE_MINIMAL)
364464
writer.writeheader()
365465
writer.writerows(rows_data)
366466
return stream.getvalue()
367467

368468

469+
def drop_na_columns_from_csv_file_rows(
    rows_data: Iterable[dict[str, Any]],
    columns: list[str]
) -> tuple[list[dict[str, Any]], list[str]]:
    """
    Prepare rows for a downloadable CSV by removing HGVS columns that contain no data.

    Only the three HGVS columns (``hgvs_nt``, ``hgvs_splice``, ``hgvs_pro``) are
    candidates for removal; a candidate is dropped when every row's value for it is
    null (as judged by ``validate_is_null``). Removed columns are popped from each
    row and filtered out of the returned header list.

    :param rows_data: Row dicts (may be a lazy iterable such as a map object).
    :param columns: Ordered CSV header names; not mutated in place.
    :return: Tuple of (materialized row list, filtered header list).
    """
    # Materialize so the rows can be scanned once per candidate column and mutated.
    rows_data = list(rows_data)
    candidate_columns = ["hgvs_nt", "hgvs_splice", "hgvs_pro"]
    columns_to_remove = []

    for col in candidate_columns:
        # Skip candidates that are not part of this score set's header at all;
        # row.get() additionally guards rows missing the key (original row[col]
        # would raise KeyError there).
        if col not in columns:
            continue
        # Require at least one row: all() over an empty list is vacuously True and
        # would strip the headers from an empty CSV even though no data was checked.
        if rows_data and all(validate_is_null(row.get(col)) for row in rows_data):
            columns_to_remove.append(col)
            for row in rows_data:
                row.pop(col, None)  # Remove the empty column from each row.

    # Preserve the original header order while dropping the empty columns.
    columns = [col for col in columns if col not in columns_to_remove]
    return rows_data, columns
489+
490+
369491
null_values_re = re.compile(r"\s+|none|nan|na|undefined|n/a|null|nil", flags=re.IGNORECASE)
370492

371493

src/mavedb/lib/validation/urn_re.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,10 @@
88
MAVEDB_TMP_URN_PATTERN = r"tmp:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}"
99
MAVEDB_TMP_URN_RE = re.compile(MAVEDB_TMP_URN_PATTERN)
1010

11+
# Old temp URN
12+
MAVEDB_OLD_TMP_URN_PATTERN = r"^tmp:[A-Za-z0-9]{16}$"
13+
MAVEDB_OLD_TMP_URN_RE = re.compile(MAVEDB_OLD_TMP_URN_PATTERN)
14+
1115
# Experiment set URN
1216
MAVEDB_EXPERIMENT_SET_URN_PATTERN = rf"urn:{MAVEDB_URN_NAMESPACE}:\d{{{MAVEDB_EXPERIMENT_SET_URN_DIGITS}}}"
1317
MAVEDB_EXPERIMENT_SET_URN_RE = re.compile(MAVEDB_EXPERIMENT_SET_URN_PATTERN)

src/mavedb/routers/experiments.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@
2323
from mavedb.lib.keywords import search_keyword
2424
from mavedb.lib.logging import LoggedRoute
2525
from mavedb.lib.logging.context import logging_context, save_to_logging_context
26-
from mavedb.lib.permissions import Action, assert_permission, has_permission
26+
from mavedb.lib.permissions import Action, assert_permission
27+
from mavedb.lib.score_sets import find_superseded_score_set_tail
2728
from mavedb.lib.validation.exceptions import ValidationError
2829
from mavedb.lib.validation.keywords import validate_keyword_list
2930
from mavedb.models.contributor import Contributor
@@ -166,20 +167,25 @@ def get_experiment_score_sets(
166167
.filter(~ScoreSet.superseding_score_set.has())
167168
.all()
168169
)
169-
score_set_result[:] = [
170-
score_set for score_set in score_set_result if has_permission(user_data, score_set, Action.READ).permitted
171-
]
172170

173-
if not score_set_result:
171+
filter_superseded_score_set_tails = [
172+
find_superseded_score_set_tail(
173+
score_set,
174+
Action.READ,
175+
user_data
176+
) for score_set in score_set_result
177+
]
178+
filtered_score_sets = [score_set for score_set in filter_superseded_score_set_tails if score_set is not None]
179+
if not filtered_score_sets:
174180
save_to_logging_context({"associated_resources": []})
175181
logger.info(msg="No score sets are associated with the requested experiment.", extra=logging_context())
176182

177183
raise HTTPException(status_code=404, detail="no associated score sets")
178184
else:
179-
score_set_result.sort(key=attrgetter("urn"))
185+
filtered_score_sets.sort(key=attrgetter("urn"))
180186
save_to_logging_context({"associated_resources": [item.urn for item in score_set_result]})
181187

182-
return score_set_result
188+
return filtered_score_sets
183189

184190

185191
@router.post(

src/mavedb/routers/score_sets.py

Lines changed: 27 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
logging_context,
3535
save_to_logging_context,
3636
)
37-
from mavedb.lib.permissions import Action, assert_permission
37+
from mavedb.lib.permissions import Action, assert_permission, has_permission
3838
from mavedb.lib.score_sets import (
3939
csv_data_to_df,
4040
find_meta_analyses_for_experiment_sets,
@@ -43,6 +43,7 @@
4343
variants_to_csv_rows,
4444
)
4545
from mavedb.lib.score_sets import (
46+
fetch_superseding_score_set_in_search_result,
4647
search_score_sets as _search_score_sets,
4748
refresh_variant_urns,
4849
)
@@ -109,6 +110,10 @@ async def fetch_score_set_by_urn(
109110
raise HTTPException(status_code=404, detail=f"score set with URN '{urn}' not found")
110111

111112
assert_permission(user, item, Action.READ)
113+
114+
if item.superseding_score_set and not has_permission(user, item.superseding_score_set, Action.READ).permitted:
115+
item.superseding_score_set = None
116+
112117
return item
113118

114119

@@ -121,11 +126,16 @@ async def fetch_score_set_by_urn(
121126

122127

123128
@router.post("/score-sets/search", status_code=200, response_model=list[score_set.ShortScoreSet])
124-
def search_score_sets(search: ScoreSetsSearch, db: Session = Depends(deps.get_db)) -> Any: # = Body(..., embed=True),
129+
def search_score_sets(
130+
search: ScoreSetsSearch,
131+
db: Session = Depends(deps.get_db),
132+
user_data: Optional[UserData] = Depends(get_current_user),
133+
) -> Any: # = Body(..., embed=True),
125134
"""
126135
Search score sets.
127136
"""
128-
return _search_score_sets(db, None, search)
137+
score_sets = _search_score_sets(db, None, search)
138+
return fetch_superseding_score_set_in_search_result(score_sets, user_data, search)
129139

130140

131141
@router.post(
@@ -141,7 +151,8 @@ def search_my_score_sets(
141151
"""
142152
Search score sets created by the current user..
143153
"""
144-
return _search_score_sets(db, user_data.user, search)
154+
score_sets = _search_score_sets(db, user_data.user, search)
155+
return fetch_superseding_score_set_in_search_result(score_sets, user_data, search)
145156

146157

147158
@router.get(
@@ -180,6 +191,7 @@ def get_score_set_scores_csv(
180191
urn: str,
181192
start: int = Query(default=None, description="Start index for pagination"),
182193
limit: int = Query(default=None, description="Number of variants to return"),
194+
drop_na_columns: Optional[bool] = None,
183195
db: Session = Depends(deps.get_db),
184196
user_data: Optional[UserData] = Depends(get_current_user),
185197
) -> Any:
@@ -214,7 +226,7 @@ def get_score_set_scores_csv(
214226

215227
assert_permission(user_data, score_set, Action.READ)
216228

217-
csv_str = get_score_set_scores_as_csv(db, score_set, start, limit)
229+
csv_str = get_score_set_scores_as_csv(db, score_set, start, limit, drop_na_columns)
218230
return StreamingResponse(iter([csv_str]), media_type="text/csv")
219231

220232

@@ -234,6 +246,7 @@ async def get_score_set_counts_csv(
234246
urn: str,
235247
start: int = Query(default=None, description="Start index for pagination"),
236248
limit: int = Query(default=None, description="Number of variants to return"),
249+
drop_na_columns: Optional[bool] = None,
237250
db: Session = Depends(deps.get_db),
238251
user_data: Optional[UserData] = Depends(get_current_user),
239252
) -> Any:
@@ -268,7 +281,7 @@ async def get_score_set_counts_csv(
268281

269282
assert_permission(user_data, score_set, Action.READ)
270283

271-
csv_str = get_score_set_counts_as_csv(db, score_set, start, limit)
284+
csv_str = get_score_set_counts_as_csv(db, score_set, start, limit, drop_na_columns)
272285
return StreamingResponse(iter([csv_str]), media_type="text/csv")
273286

274287

@@ -299,10 +312,10 @@ def get_score_set_mapped_variants(
299312

300313
mapped_variants = (
301314
db.query(MappedVariant)
302-
.filter(ScoreSet.urn == urn)
303-
.filter(ScoreSet.id == Variant.score_set_id)
304-
.filter(Variant.id == MappedVariant.variant_id)
305-
.all()
315+
.filter(ScoreSet.urn == urn)
316+
.filter(ScoreSet.id == Variant.score_set_id)
317+
.filter(Variant.id == MappedVariant.variant_id)
318+
.all()
306319
)
307320

308321
if not mapped_variants:
@@ -469,9 +482,10 @@ async def create_score_set(
469482
for identifier in item_create.primary_publication_identifiers or []
470483
]
471484
publication_identifiers = [
472-
await find_or_create_publication_identifier(db, identifier.identifier, identifier.db_name)
473-
for identifier in item_create.secondary_publication_identifiers or []
474-
] + primary_publication_identifiers
485+
await find_or_create_publication_identifier(db, identifier.identifier,
486+
identifier.db_name)
487+
for identifier in item_create.secondary_publication_identifiers or []
488+
] + primary_publication_identifiers
475489

476490
# create a temporary `primary` attribute on each of our publications that indicates
477491
# to our association proxy whether it is a primary publication or not

tests/helpers/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from mavedb.models.enums.processing_state import ProcessingState
66

77
TEST_PUBMED_IDENTIFIER = "20711194"
8+
TEST_PUBMED_URL_IDENTIFIER = "https://pubmed.ncbi.nlm.nih.gov/37162834/"
89
TEST_BIORXIV_IDENTIFIER = "2021.06.21.212592"
910
TEST_MEDRXIV_IDENTIFIER = "2021.06.22.21259265"
1011
TEST_CROSSREF_IDENTIFIER = "10.1371/2021.06.22.21259265"

tests/helpers/util.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ def mock_worker_variant_insertion(client, db, data_provider, score_set, scores_c
159159
score_df = csv_data_to_df(score_file)
160160

161161
if counts_csv_path is not None:
162-
with open(scores_csv_path, "rb") as counts_file:
162+
with open(counts_csv_path, "rb") as counts_file:
163163
counts_df = csv_data_to_df(counts_file)
164164
else:
165165
counts_df = None

0 commit comments

Comments
 (0)