Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
61aa6ef
Add build path to docker-compose-dev.yaml for dcd-mapping and cdot-re…
davereinhart Sep 25, 2025
73b3a1a
Update redis environment variables in template file
davereinhart Sep 25, 2025
12d6c0e
Update value check on score_set and experiment updates to handle non-…
davereinhart Sep 29, 2025
9e69143
feat: add dataset_columns support in score set updates
davereinhart Sep 30, 2025
9d599d0
feat: process and validate dataset column metadata for scores and cou…
davereinhart Oct 3, 2025
f28c8e7
feat: add ScoreSetUpdateAllOptional with multipart form helper
davereinhart Oct 7, 2025
5cb60e0
refactor: move dataset column pydantic models to dedicated module
davereinhart Oct 7, 2025
bbcb3f2
refactor: replace dynamic camelization test with explicit model
davereinhart Oct 8, 2025
454cf86
feat: extend SavedDatasetColumns with recordType
davereinhart Oct 8, 2025
e3812b1
feat: add PATCH endpoint supporting variants + score/count metadata u…
davereinhart Oct 9, 2025
45c195d
feat: add target gene find_or_create helpers for sequence/accession
davereinhart Oct 11, 2025
de87fc4
refactor: unify variable names for score/count metadata & extend routes
davereinhart Oct 13, 2025
7fe4ed8
feat: enhance PATCH endpoint to fully process uploaded files
davereinhart Oct 14, 2025
cf89e6e
test: update and add unit tests aligned with new score set update flow
davereinhart Oct 17, 2025
d49fb27
test: add ScoreSetUpdateAllOptional model tests and router validation…
davereinhart Oct 17, 2025
4fa4995
feat: add worker job fixtures for score/count column metadata
davereinhart Oct 17, 2025
b57c11f
Apply ruff format and organize import on files in this branch
davereinhart Oct 27, 2025
8391f67
Cleanup
davereinhart Oct 27, 2025
5077533
Move all_fields_optional_model decorator to view models utils module …
davereinhart Oct 27, 2025
2950b2e
Add unit tests for all_fields_optional_model decorator
davereinhart Oct 27, 2025
cab9fe5
Updates to score set endpoints to receive score and count columns me…
davereinhart Oct 29, 2025
b3a5a88
Update unit tests to include score and count columns metadata fields …
davereinhart Oct 29, 2025
2f37d6b
Merge branch 'release-2025.5.0' into davereinhart/scoreset-column-met…
davereinhart Nov 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docker-compose-dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ services:
- redis

dcd-mapping:
build: ../dcd_mapping
image: dcd-mapping:dev
command: bash -c "uvicorn api.server_main:app --host 0.0.0.0 --port 8000 --reload"
depends_on:
Expand All @@ -61,6 +62,7 @@ services:
- mavedb-seqrepo-dev:/usr/local/share/seqrepo

cdot-rest:
build: ../cdot_rest
image: cdot-rest:dev
command: bash -c "gunicorn cdot_rest.wsgi:application --bind 0.0.0.0:8000"
env_file:
Expand Down
4 changes: 3 additions & 1 deletion settings/.env.template
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,10 @@ DCD_MAPPING_URL=http://dcd-mapping:8000
####################################################################################################

CDOT_URL=http://cdot-rest:8000
REDIS_HOST=localhost
REDIS_HOST=redis
REDIS_IP=redis
REDIS_PORT=6379
REDIS_SSL=false

####################################################################################################
# Environment variables for ClinGen
Expand Down
137 changes: 136 additions & 1 deletion src/mavedb/lib/target_genes.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,154 @@
import logging
from typing import Optional

from sqlalchemy import func, or_
from sqlalchemy import and_, func, or_
from sqlalchemy.orm import Session

from mavedb.lib.logging.context import logging_context, save_to_logging_context
from mavedb.models.contributor import Contributor
from mavedb.models.score_set import ScoreSet
from mavedb.models.target_accession import TargetAccession
from mavedb.models.target_gene import TargetGene
from mavedb.models.target_sequence import TargetSequence
from mavedb.models.taxonomy import Taxonomy
from mavedb.models.user import User
from mavedb.view_models.search import TextSearch

logger = logging.getLogger(__name__)


def find_or_create_target_gene_by_accession(
    db: Session,
    score_set_id: int,
    tg: dict,
    tg_accession: dict,
) -> TargetGene:
    """
    Find or create a target gene for a score set by accession.

    If the existing target gene or related accession record is modified, this function
    creates a new target gene so that its id can be used to determine if a score set
    has changed in a way that requires the create variants job to be re-run.

    :param db: Database session
    :param score_set_id: ID of the score set to associate the target gene with
    :param tg: Dictionary with target gene details (name, category, etc.)
    :param tg_accession: Dictionary with target accession details (accession, assembly, gene, etc.)
    :return: The found or newly created TargetGene instance
    :raises ValueError: If tg_accession is None; an accession record is required to build
        the TargetAccession relationship in the creation branch.
    """
    # Fail fast with a clear error: previously a None tg_accession skipped the search
    # branch and then crashed with an opaque TypeError at TargetAccession(**None).
    if tg_accession is None:
        raise ValueError("tg_accession is required to find or create a target gene by accession")

    target_gene = None
    logger.info(
        msg=f"Searching for existing target gene by accession within score set {score_set_id}.",
        extra=logging_context(),
    )
    if tg_accession.get("accession"):
        target_gene = (
            db.query(TargetGene)
            .filter(
                and_(
                    TargetGene.target_accession.has(
                        and_(
                            TargetAccession.accession == tg_accession["accession"],
                            TargetAccession.assembly == tg_accession["assembly"],
                            TargetAccession.gene == tg_accession["gene"],
                            # is_base_editor is optional in the payload; default matches the model default.
                            TargetAccession.is_base_editor == tg_accession.get("is_base_editor", False),
                        )
                    ),
                    TargetGene.name == tg["name"],
                    TargetGene.category == tg["category"],
                    TargetGene.score_set_id == score_set_id,
                )
            )
            .first()
        )

    if target_gene is None:
        # No exact match: create a fresh gene + accession so downstream code can use the
        # new id to detect that the variants job must be re-run.
        target_accession = TargetAccession(**tg_accession)
        target_gene = TargetGene(
            **tg,
            score_set_id=score_set_id,
            target_accession=target_accession,
        )
        db.add(target_gene)
        db.commit()
        db.refresh(target_gene)
        logger.info(
            msg=f"Created new target gene '{target_gene.name}' with ID {target_gene.id}.",
            extra=logging_context(),
        )
    else:
        logger.info(
            msg=f"Found existing target gene '{target_gene.name}' with ID {target_gene.id}.",
            extra=logging_context(),
        )

    return target_gene


def find_or_create_target_gene_by_sequence(
    db: Session,
    score_set_id: int,
    tg: dict,
    tg_sequence: dict,
) -> TargetGene:
    """
    Find or create a target gene for a score set by sequence.

    If the existing target gene or related sequence record is modified, this function
    creates a new target gene so that its id can be used to determine if a score set
    has changed in a way that requires the create variants job to be re-run.

    :param db: Database session
    :param score_set_id: ID of the score set to associate the target gene with
    :param tg: Dictionary with target gene details (name, category, etc.)
    :param tg_sequence: Dictionary with target sequence details (sequence, sequence_type, taxonomy, label, etc.)
    :return: The found or newly created TargetGene instance
    :raises ValueError: If tg_sequence is None; a sequence record is required to build
        the TargetSequence relationship in the creation branch.
    """
    # Fail fast with a clear error: previously a None tg_sequence skipped the search
    # branch and then crashed with an opaque TypeError at TargetSequence(**None).
    if tg_sequence is None:
        raise ValueError("tg_sequence is required to find or create a target gene by sequence")

    target_gene = None
    logger.info(
        msg=f"Searching for existing target gene by sequence within score set {score_set_id}.",
        extra=logging_context(),
    )
    if tg_sequence.get("sequence"):
        target_gene = (
            db.query(TargetGene)
            .filter(
                and_(
                    TargetGene.target_sequence.has(
                        and_(
                            TargetSequence.sequence == tg_sequence["sequence"],
                            TargetSequence.sequence_type == tg_sequence["sequence_type"],
                            # tg_sequence["taxonomy"] is a Taxonomy model instance; compare by id.
                            TargetSequence.taxonomy.has(Taxonomy.id == tg_sequence["taxonomy"].id),
                            TargetSequence.label == tg_sequence["label"],
                        )
                    ),
                    TargetGene.name == tg["name"],
                    TargetGene.category == tg["category"],
                    TargetGene.score_set_id == score_set_id,
                )
            )
            .first()
        )

    if target_gene is None:
        # No exact match: create a fresh gene + sequence so downstream code can use the
        # new id to detect that the variants job must be re-run.
        target_sequence = TargetSequence(**tg_sequence)
        target_gene = TargetGene(
            **tg,
            score_set_id=score_set_id,
            target_sequence=target_sequence,
        )
        db.add(target_gene)
        db.commit()
        db.refresh(target_gene)
        logger.info(
            msg=f"Created new target gene '{target_gene.name}' with ID {target_gene.id}.",
            extra=logging_context(),
        )
    else:
        logger.info(
            msg=f"Found existing target gene '{target_gene.name}' with ID {target_gene.id}.",
            extra=logging_context(),
        )

    return target_gene


def search_target_genes(
db: Session,
owner_or_contributor: Optional[User],
Expand Down
119 changes: 99 additions & 20 deletions src/mavedb/lib/validation/dataframe/dataframe.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,26 @@
from typing import Optional, Tuple, TYPE_CHECKING
from typing import TYPE_CHECKING, Any, Optional, Tuple

import numpy as np
import pandas as pd

from mavedb.lib.exceptions import MixedTargetError
from mavedb.lib.validation.constants.general import (
guide_sequence_column,
hgvs_nt_column,
hgvs_pro_column,
hgvs_splice_column,
guide_sequence_column,
required_score_column,
)
from mavedb.lib.validation.exceptions import ValidationError
from mavedb.models.target_gene import TargetGene
from mavedb.lib.validation.dataframe.column import validate_data_column
from mavedb.lib.validation.dataframe.variant import (
validate_hgvs_transgenic_column,
validate_hgvs_genomic_column,
validate_guide_sequence_column,
validate_hgvs_genomic_column,
validate_hgvs_prefix_combinations,
validate_hgvs_transgenic_column,
)
from mavedb.lib.validation.exceptions import ValidationError
from mavedb.models.target_gene import TargetGene
from mavedb.view_models.score_set_dataset_columns import DatasetColumnMetadata

if TYPE_CHECKING:
from cdot.hgvs.dataproviders import RESTDataProvider
Expand All @@ -28,12 +29,28 @@
STANDARD_COLUMNS = (hgvs_nt_column, hgvs_splice_column, hgvs_pro_column, required_score_column, guide_sequence_column)


def clean_col_name(col: str) -> str:
    """
    Normalize a raw column name from an uploaded file.

    Strips surrounding whitespace; if the trimmed name is wrapped in a matching
    pair of double or single quotes, the outer quote pair is removed and the
    result is stripped again. Partially- or mismatched-quoted names are left as-is.

    :param col: Raw column name
    :return: The cleaned column name
    """
    trimmed = col.strip()
    for quote in ('"', "'"):
        # Only unwrap when the name is fully enclosed by the same quote character.
        if trimmed.startswith(quote) and trimmed.endswith(quote):
            trimmed = trimmed[1:-1]
            break
    return trimmed.strip()


def validate_and_standardize_dataframe_pair(
scores_df: pd.DataFrame,
counts_df: Optional[pd.DataFrame],
score_columns_metadata: Optional[dict[str, DatasetColumnMetadata]],
count_columns_metadata: Optional[dict[str, DatasetColumnMetadata]],
targets: list[TargetGene],
hdp: Optional["RESTDataProvider"],
) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
) -> Tuple[
pd.DataFrame,
Optional[pd.DataFrame],
Optional[dict[str, DatasetColumnMetadata]],
Optional[dict[str, DatasetColumnMetadata]],
]:
"""
Perform validation and standardization on a pair of score and count dataframes.

Expand All @@ -43,15 +60,19 @@ def validate_and_standardize_dataframe_pair(
The scores dataframe
counts_df : Optional[pandas.DataFrame]
The counts dataframe, can be None if not present
score_columns_metadata: Optional[dict[str, DatasetColumnMetadata]]
The scores column metadata, can be None if not present
count_columns_metadata: Optional[dict[str, DatasetColumnMetadata]]
The counts column metadata, can be None if not present
targets : str
The target genes on which to validate dataframes
hdp : RESTDataProvider
The biocommons.hgvs compatible data provider. Used to fetch sequences for hgvs validation.

Returns
-------
Tuple[pd.DataFrame, Optional[pd.DataFrame]]
The standardized score and count dataframes, or score and None if no count dataframe was provided
Tuple[pd.DataFrame, Optional[pd.DataFrame], Optional[dict[str, DatasetColumnMetadata]], Optional[dict[str, DatasetColumnMetadata]]]
The standardized score and count dataframes, plus score column metadata and counts column metadata dictionaries. Counts dataframe and column metadata dictionaries can be None if not provided.

Raises
------
Expand All @@ -65,11 +86,32 @@ def validate_and_standardize_dataframe_pair(
standardized_counts_df = standardize_dataframe(counts_df) if counts_df is not None else None

validate_dataframe(standardized_scores_df, "scores", targets, hdp)

if score_columns_metadata is not None:
standardized_score_columns_metadata = standardize_dict_keys(score_columns_metadata)
validate_df_column_metadata_match(standardized_scores_df, standardized_score_columns_metadata)
else:
standardized_score_columns_metadata = None

if standardized_counts_df is not None:
validate_dataframe(standardized_counts_df, "counts", targets, hdp)
validate_variant_columns_match(standardized_scores_df, standardized_counts_df)

return standardized_scores_df, standardized_counts_df
if count_columns_metadata is not None:
standardized_count_columns_metadata = standardize_dict_keys(count_columns_metadata)
validate_df_column_metadata_match(standardized_counts_df, standardized_count_columns_metadata)
else:
standardized_count_columns_metadata = None
else:
if count_columns_metadata is not None and len(count_columns_metadata.keys()) > 0:
raise ValidationError("Counts column metadata provided without counts dataframe")
standardized_count_columns_metadata = None

return (
standardized_scores_df,
standardized_counts_df,
standardized_score_columns_metadata,
standardized_count_columns_metadata,
)


def validate_dataframe(
Expand Down Expand Up @@ -163,6 +205,25 @@ def validate_dataframe(
)


def standardize_dict_keys(d: dict[str, Any]) -> dict[str, Any]:
    """
    Standardize the keys of a dictionary by stripping leading and trailing
    whitespace and removing a fully matching outer pair of quotes from each key.

    Values are carried over unchanged; keys are cleaned via ``clean_col_name``.

    Parameters
    ----------
    d : dict[str, Any]
        The dictionary to standardize

    Returns
    -------
    dict[str, Any]
        A new dictionary with cleaned keys and the original values
    """
    standardized: dict[str, Any] = {}
    for raw_key, value in d.items():
        standardized[clean_col_name(raw_key)] = value
    return standardized


def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
"""Standardize a dataframe by sorting the columns and changing the standard column names to lowercase.
Also strips leading and trailing whitespace from column names and removes any quoted strings from column names.
Expand All @@ -186,15 +247,7 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
The standardized dataframe
"""

def clean_column(col: str) -> str:
col = col.strip()
# Only remove quotes if the column name is fully quoted
if (col.startswith('"') and col.endswith('"')) or (col.startswith("'") and col.endswith("'")):
col = col[1:-1]

return col.strip()

cleaned_columns = {c: clean_column(c) for c in df.columns}
cleaned_columns = {c: clean_col_name(c) for c in df.columns}
df.rename(columns=cleaned_columns, inplace=True)

column_mapper = {x: x.lower() for x in df.columns if x.lower() in STANDARD_COLUMNS}
Expand Down Expand Up @@ -368,6 +421,32 @@ def validate_variant_consistency(df: pd.DataFrame) -> None:
pass


def validate_df_column_metadata_match(df: pd.DataFrame, columnMetadata: dict[str, DatasetColumnMetadata]):
    """
    Checks that metadata keys match the dataframe column names and exclude standard column names.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe parsed from an uploaded scores or counts file
    columnMetadata : dict[str, DatasetColumnMetadata]
        Metadata for the dataframe's columns

    Raises
    ------
    ValidationError
        If any metadata keys match standard columns
    ValidationError
        If any metadata keys do not match dataframe column names
    """
    dataframe_columns = set(df.columns)
    for metadata_key in columnMetadata:
        # Standard columns (hgvs_*, score, guide_sequence) carry fixed semantics
        # and may not carry user-supplied metadata.
        if metadata_key.lower() in STANDARD_COLUMNS:
            raise ValidationError(f"standard column '{metadata_key}' cannot have metadata defined")
        if metadata_key not in dataframe_columns:
            raise ValidationError(f"column metadata key '{metadata_key}' does not match any dataframe column names")


def validate_variant_columns_match(df1: pd.DataFrame, df2: pd.DataFrame):
"""
Checks if two dataframes have matching HGVS columns.
Expand Down
Loading