
Commit 5749652

Merge pull request #385 from VariantEffect/estelle/dropDownloadNullHgvsColumns
Remove NA columns when downloading scores and counts files.
2 parents 5ce410d + 51d41b0 commit 5749652

File tree

8 files changed: +309 −8 lines changed
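This change adds a drop_na_columns query parameter to the scores and counts CSV download endpoints. A rough sketch of how a client might use it (the base URL and exact route below are illustrative assumptions; only the drop_na_columns parameter comes from this PR):

import requests  # illustrative client, not part of this PR

# Hypothetical base URL and route; the real scores CSV route is defined by
# get_score_set_scores_csv in src/mavedb/routers/score_sets.py.
url = "https://api.mavedb.example.org/score-sets/urn:mavedb:00000001-a-1/scores"

# drop_na_columns=True asks the server to omit hgvs_nt, hgvs_splice, and
# hgvs_pro columns whose values are entirely null/NA.
response = requests.get(url, params={"drop_na_columns": True})
response.raise_for_status()
print(response.text.splitlines()[0])  # header row, minus any all-NA HGVS columns
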
Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+"""score set level score thresholds
+
+Revision ID: aa73d39b3705
+Revises: 68a0ec57694e
+Create Date: 2024-11-13 11:23:57.917725
+
+"""
+
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic.
+revision = "aa73d39b3705"
+down_revision = "68a0ec57694e"
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column("scoresets", sa.Column("score_calibrations", postgresql.JSONB(astext_type=sa.Text()), nullable=True))
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_column("scoresets", "score_calibrations")
+    # ### end Alembic commands ###

src/mavedb/lib/score_sets.py

Lines changed: 31 additions & 0 deletions
@@ -21,6 +21,7 @@
 )
 from mavedb.lib.mave.utils import is_csv_null
 from mavedb.lib.validation.constants.general import null_values_list
+from mavedb.lib.validation.utilities import is_null as validate_is_null
 from mavedb.models.contributor import Contributor
 from mavedb.models.controlled_keyword import ControlledKeyword
 from mavedb.models.doi_identifier import DoiIdentifier
@@ -311,6 +312,7 @@ def get_score_set_counts_as_csv(
     score_set: ScoreSet,
     start: Optional[int] = None,
     limit: Optional[int] = None,
+    drop_na_columns: Optional[bool] = None,
 ) -> str:
     assert type(score_set.dataset_columns) is dict
     count_columns = [str(x) for x in list(score_set.dataset_columns.get("count_columns", []))]
@@ -329,6 +331,9 @@
     variants = db.scalars(variants_query).all()

     rows_data = variants_to_csv_rows(variants, columns=columns, dtype=type_column)
+    if drop_na_columns:
+        rows_data, columns = drop_na_columns_from_csv_file_rows(rows_data, columns)
+
     stream = io.StringIO()
     writer = csv.DictWriter(stream, fieldnames=columns, quoting=csv.QUOTE_MINIMAL)
     writer.writeheader()
@@ -341,6 +346,7 @@ def get_score_set_scores_as_csv(
     score_set: ScoreSet,
     start: Optional[int] = None,
     limit: Optional[int] = None,
+    drop_na_columns: Optional[bool] = None,
 ) -> str:
     assert type(score_set.dataset_columns) is dict
     score_columns = [str(x) for x in list(score_set.dataset_columns.get("score_columns", []))]
@@ -359,13 +365,38 @@
     variants = db.scalars(variants_query).all()

     rows_data = variants_to_csv_rows(variants, columns=columns, dtype=type_column)
+    if drop_na_columns:
+        rows_data, columns = drop_na_columns_from_csv_file_rows(rows_data, columns)
+
     stream = io.StringIO()
     writer = csv.DictWriter(stream, fieldnames=columns, quoting=csv.QUOTE_MINIMAL)
     writer.writeheader()
     writer.writerows(rows_data)
     return stream.getvalue()


+def drop_na_columns_from_csv_file_rows(
+    rows_data: Iterable[dict[str, Any]],
+    columns: list[str]
+) -> tuple[list[dict[str, Any]], list[str]]:
+    """Process rows_data for downloadable CSV by removing empty columns."""
+    # Convert map to list.
+    rows_data = list(rows_data)
+    columns_to_check = ["hgvs_nt", "hgvs_splice", "hgvs_pro"]
+    columns_to_remove = []
+
+    # Check if all values in a column are None or "NA"
+    for col in columns_to_check:
+        if all(validate_is_null(row[col]) for row in rows_data):
+            columns_to_remove.append(col)
+            for row in rows_data:
+                row.pop(col, None)  # Remove column from each row
+
+    # Remove these columns from the header list
+    columns = [col for col in columns if col not in columns_to_remove]
+    return rows_data, columns
+
+
 null_values_re = re.compile(r"\s+|none|nan|na|undefined|n/a|null|nil", flags=re.IGNORECASE)
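
A minimal sketch of what the new helper does to the rows of a downloadable CSV, assuming it is imported from mavedb.lib.score_sets as added above and that values such as "NA" are treated as null by is_null:

from mavedb.lib.score_sets import drop_na_columns_from_csv_file_rows

rows = [
    {"hgvs_nt": "c.1A>G", "hgvs_splice": "NA", "hgvs_pro": "NA", "score": "0.5"},
    {"hgvs_nt": "c.2T>C", "hgvs_splice": "NA", "hgvs_pro": "NA", "score": "1.2"},
]
columns = ["hgvs_nt", "hgvs_splice", "hgvs_pro", "score"]

# hgvs_splice and hgvs_pro are entirely NA, so both are removed from every row
# and from the header list; hgvs_nt has real values and is kept.
rows, columns = drop_na_columns_from_csv_file_rows(rows, columns)
print(columns)  # expected: ["hgvs_nt", "score"]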

src/mavedb/models/score_set.py

Lines changed: 1 addition & 0 deletions
@@ -157,6 +157,7 @@ class ScoreSet(Base):

     target_genes: Mapped[List["TargetGene"]] = relationship(back_populates="score_set", cascade="all, delete-orphan")
     score_ranges = Column(JSONB, nullable=True)
+    score_calibrations = Column(JSONB, nullable=True)

     # Unfortunately, we can't use association_proxy here, because in spite of what the documentation seems to imply, it
     # doesn't check for a pre-existing keyword with the same text.

src/mavedb/routers/score_sets.py

Lines changed: 55 additions & 8 deletions
@@ -9,13 +9,18 @@
 from fastapi.encoders import jsonable_encoder
 from fastapi.exceptions import HTTPException
 from fastapi.responses import StreamingResponse
-from sqlalchemy import or_
-from sqlalchemy.exc import MultipleResultsFound
+from sqlalchemy import or_, select
+from sqlalchemy.exc import MultipleResultsFound, NoResultFound
 from sqlalchemy.orm import Session

 from mavedb import deps
 from mavedb.lib.authentication import UserData
-from mavedb.lib.authorization import get_current_user, require_current_user, require_current_user_with_email
+from mavedb.lib.authorization import (
+    get_current_user,
+    require_current_user,
+    require_current_user_with_email,
+    RoleRequirer,
+)
 from mavedb.lib.contributors import find_or_create_contributor
 from mavedb.lib.exceptions import MixedTargetError, NonexistentOrcidUserError, ValidationError
 from mavedb.lib.identifiers import (
@@ -49,6 +54,7 @@
 )
 from mavedb.models.contributor import Contributor
 from mavedb.models.enums.processing_state import ProcessingState
+from mavedb.models.enums.user_role import UserRole
 from mavedb.models.experiment import Experiment
 from mavedb.models.license import License
 from mavedb.models.mapped_variant import MappedVariant
@@ -57,7 +63,7 @@
 from mavedb.models.target_gene import TargetGene
 from mavedb.models.target_sequence import TargetSequence
 from mavedb.models.variant import Variant
-from mavedb.view_models import mapped_variant, score_set
+from mavedb.view_models import mapped_variant, score_set, calibration
 from mavedb.view_models.search import ScoreSetsSearch

 logger = logging.getLogger(__name__)
@@ -174,6 +180,7 @@ def get_score_set_scores_csv(
     urn: str,
     start: int = Query(default=None, description="Start index for pagination"),
     limit: int = Query(default=None, description="Number of variants to return"),
+    drop_na_columns: Optional[bool] = None,
     db: Session = Depends(deps.get_db),
     user_data: Optional[UserData] = Depends(get_current_user),
 ) -> Any:
@@ -208,7 +215,7 @@ def get_score_set_scores_csv(

     assert_permission(user_data, score_set, Action.READ)

-    csv_str = get_score_set_scores_as_csv(db, score_set, start, limit)
+    csv_str = get_score_set_scores_as_csv(db, score_set, start, limit, drop_na_columns)
     return StreamingResponse(iter([csv_str]), media_type="text/csv")


@@ -228,6 +235,7 @@ async def get_score_set_counts_csv(
     urn: str,
     start: int = Query(default=None, description="Start index for pagination"),
     limit: int = Query(default=None, description="Number of variants to return"),
+    drop_na_columns: Optional[bool] = None,
     db: Session = Depends(deps.get_db),
     user_data: Optional[UserData] = Depends(get_current_user),
 ) -> Any:
@@ -262,7 +270,7 @@ async def get_score_set_counts_csv(

     assert_permission(user_data, score_set, Action.READ)

-    csv_str = get_score_set_counts_as_csv(db, score_set, start, limit)
+    csv_str = get_score_set_counts_as_csv(db, score_set, start, limit, drop_na_columns)
     return StreamingResponse(iter([csv_str]), media_type="text/csv")


@@ -336,8 +344,10 @@ async def create_score_set(
         raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Unknown experiment")
     # Not allow add score set in meta-analysis experiments.
     if any(s.meta_analyzes_score_sets for s in experiment.score_sets):
-        raise HTTPException(status_code=status.HTTP_403_FORBIDDEN,
-                            detail="Score sets may not be added to a meta-analysis experiment.")
+        raise HTTPException(
+            status_code=status.HTTP_403_FORBIDDEN,
+            detail="Score sets may not be added to a meta-analysis experiment.",
+        )

     save_to_logging_context({"experiment": experiment.urn})
     assert_permission(user_data, experiment, Action.ADD_SCORE_SET)
@@ -656,6 +666,43 @@ async def upload_score_set_variant_data(
     return item


+@router.post(
+    "/score-sets/{urn}/calibration/data",
+    response_model=score_set.ScoreSet,
+    responses={422: {}},
+    response_model_exclude_none=True,
+)
+async def update_score_set_calibration_data(
+    *,
+    urn: str,
+    calibration_update: dict[str, calibration.Calibration],
+    db: Session = Depends(deps.get_db),
+    user_data: UserData = Depends(RoleRequirer([UserRole.admin])),
+):
+    """
+    Update thresholds / score calibrations for a score set.
+    """
+    save_to_logging_context({"requested_resource": urn, "resource_property": "score_thresholds"})
+
+    try:
+        item = db.scalars(select(ScoreSet).where(ScoreSet.urn == urn)).one()
+    except NoResultFound:
+        logger.info(
+            msg="Failed to add score thresholds; The requested score set does not exist.", extra=logging_context()
+        )
+        raise HTTPException(status_code=404, detail=f"score set with URN '{urn}' not found")
+
+    assert_permission(user_data, item, Action.UPDATE)
+
+    item.score_calibrations = {k: v.dict() for k, v in calibration_update.items()}
+    db.add(item)
+    db.commit()
+    db.refresh(item)
+
+    save_to_logging_context({"updated_resource": item.urn})
+    return item
+
+
 @router.put(
     "/score-sets/{urn}", response_model=score_set.ScoreSet, responses={422: {}}, response_model_exclude_none=True
 )
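
A rough sketch of how an admin client might call the new calibration endpoint (the base URL, URN, auth header, and the calibration key name are illustrative assumptions; the path and payload shape follow the diff above):

import requests  # illustrative client, not part of this PR

urn = "urn:mavedb:00000001-a-1"  # hypothetical score set URN
payload = {
    # The dict key ("pillar_project" here) is an arbitrary label chosen by the caller;
    # each value must parse as a Calibration (PillarProjectCalibration) view model.
    "pillar_project": {
        "parameter_sets": [
            {
                "functionally_altering": {"skew": 1.15, "location": -2.20, "scale": 1.20},
                "functionally_normal": {"skew": -1.5, "location": 2.25, "scale": 0.8},
                "fraction_functionally_altering": 0.20,
            },
        ],
        "evidence_strengths": [3, 2, 1, -1],
        "thresholds": [1.25, 2.5, 3, 5.5],
        "positive_likelihood_ratios": [100, 10, 1, 0.1],
        "prior_probability_pathogenicity": 0.20,
    }
}

# Requires an admin user per RoleRequirer([UserRole.admin]); the header below is a placeholder,
# since the auth mechanism is not shown in this diff.
response = requests.post(
    f"https://api.mavedb.example.org/score-sets/{urn}/calibration/data",
    json=payload,
    headers={"X-API-Key": "<admin-api-key>"},
)
response.raise_for_status()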

src/mavedb/view_models/calibration.py

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
+from typing import Union
+
+from pydantic import root_validator
+
+from mavedb.lib.validation.exceptions import ValidationError
+from mavedb.view_models.base.base import BaseModel
+
+
+class PillarProjectParameters(BaseModel):
+    skew: float
+    location: float
+    scale: float
+
+
+class PillarProjectParameterSet(BaseModel):
+    functionally_altering: PillarProjectParameters
+    functionally_normal: PillarProjectParameters
+    fraction_functionally_altering: float
+
+
+class PillarProjectCalibration(BaseModel):
+    parameter_sets: list[PillarProjectParameterSet]
+    evidence_strengths: list[int]
+    thresholds: list[float]
+    positive_likelihood_ratios: list[float]
+    prior_probability_pathogenicity: float
+
+    @root_validator
+    def validate_all_calibrations_have_a_pairwise_companion(cls, values):
+        num_es = len(values.get("evidence_strengths"))
+        num_st = len(values.get("thresholds"))
+        num_plr = len(values.get("positive_likelihood_ratios"))
+
+        if len(set((num_es, num_st, num_plr))) != 1:
+            raise ValidationError(
+                "Calibration object must provide the same number of evidence strengths, score thresholds, and positive likelihood ratios. "
+                "One or more of these provided objects was not the same length as the others."
+            )
+
+        return values
+
+
+Calibration = Union[PillarProjectCalibration]
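
A small sketch of the pairwise-length check in PillarProjectCalibration, assuming the view model is importable as shown above and that it can be populated with snake_case field names:

from mavedb.view_models.calibration import PillarProjectCalibration

data = {
    "parameter_sets": [
        {
            "functionally_altering": {"skew": 1.15, "location": -2.20, "scale": 1.20},
            "functionally_normal": {"skew": -1.5, "location": 2.25, "scale": 0.8},
            "fraction_functionally_altering": 0.20,
        },
    ],
    "evidence_strengths": [3, 2, 1, -1],
    "thresholds": [1.25, 2.5, 3, 5.5],
    "positive_likelihood_ratios": [100, 10, 1, 0.1],
    "prior_probability_pathogenicity": 0.20,
}

# All three lists have length 4, so this parses cleanly.
calibration = PillarProjectCalibration(**data)

# Dropping one threshold makes the lengths unequal, so the root_validator rejects it.
data["thresholds"] = [1.25, 2.5, 3]
try:
    PillarProjectCalibration(**data)
except Exception as exc:  # the ValidationError raised by the validator surfaces here (possibly wrapped by pydantic)
    print(type(exc).__name__)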

src/mavedb/view_models/score_set.py

Lines changed: 2 additions & 0 deletions
@@ -15,6 +15,7 @@
 from mavedb.models.enums.processing_state import ProcessingState
 from mavedb.view_models import PublicationIdentifiersGetter, record_type_validator, set_record_type
 from mavedb.view_models.base.base import BaseModel, validator
+from mavedb.view_models.calibration import Calibration
 from mavedb.view_models.contributor import Contributor, ContributorCreate
 from mavedb.view_models.doi_identifier import (
     DoiIdentifier,
@@ -387,6 +388,7 @@ class SavedScoreSet(ScoreSetBase):
     external_links: Dict[str, ExternalLink]
     contributors: list[Contributor]
     score_ranges: Optional[ScoreRanges]
+    score_calibrations: Optional[dict[str, Calibration]]

     _record_type_factory = record_type_validator()(set_record_type)

tests/helpers/constants.py

Lines changed: 31 additions & 0 deletions
@@ -658,10 +658,41 @@
     ],
 }

+
 TEST_SAVED_SCORESET_RANGE = {
     "wtScore": 1.0,
     "ranges": [
         {"label": "test1", "classification": "normal", "range": [0.0, 2.0]},
         {"label": "test2", "classification": "abnormal", "range": [-2.0, 0.0]},
     ],
 }
+
+
+TEST_SCORE_CALIBRATION = {
+    "parameter_sets": [
+        {
+            "functionally_altering": {"skew": 1.15, "location": -2.20, "scale": 1.20},
+            "functionally_normal": {"skew": -1.5, "location": 2.25, "scale": 0.8},
+            "fraction_functionally_altering": 0.20,
+        },
+    ],
+    "evidence_strengths": [3, 2, 1, -1],
+    "thresholds": [1.25, 2.5, 3, 5.5],
+    "positive_likelihood_ratios": [100, 10, 1, 0.1],
+    "prior_probability_pathogenicity": 0.20,
+}
+
+
+TEST_SAVED_SCORE_CALIBRATION = {
+    "parameterSets": [
+        {
+            "functionallyAltering": {"skew": 1.15, "location": -2.20, "scale": 1.20},
+            "functionallyNormal": {"skew": -1.5, "location": 2.25, "scale": 0.8},
+            "fractionFunctionallyAltering": 0.20,
+        },
+    ],
+    "evidenceStrengths": [3, 2, 1, -1],
+    "thresholds": [1.25, 2.5, 3, 5.5],
+    "positiveLikelihoodRatios": [100, 10, 1, 0.1],
+    "priorProbabilityPathogenicity": 0.20,
+}
