Skip to content

Commit b155ed6

Browse files
authored
Merge pull request #546 from VariantEffect/davereinhart/scoreset-column-metadata
Adds column metadata support and a multipart PATCH endpoint to update score sets and variants
2 parents a0cec30 + 2f37d6b commit b155ed6

28 files changed

+1751
-502
lines changed

docker-compose-dev.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ services:
4848
- redis
4949

5050
dcd-mapping:
51+
build: ../dcd_mapping
5152
image: dcd-mapping:dev
5253
command: bash -c "uvicorn api.server_main:app --host 0.0.0.0 --port 8000 --reload"
5354
depends_on:
@@ -61,6 +62,7 @@ services:
6162
- mavedb-seqrepo-dev:/usr/local/share/seqrepo
6263

6364
cdot-rest:
65+
build: ../cdot_rest
6466
image: cdot-rest:dev
6567
command: bash -c "gunicorn cdot_rest.wsgi:application --bind 0.0.0.0:8000"
6668
env_file:

settings/.env.template

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,10 @@ DCD_MAPPING_URL=http://dcd-mapping:8000
6767
####################################################################################################
6868

6969
CDOT_URL=http://cdot-rest:8000
70-
REDIS_HOST=localhost
70+
REDIS_HOST=redis
71+
REDIS_IP=redis
7172
REDIS_PORT=6379
73+
REDIS_SSL=false
7274

7375
####################################################################################################
7476
# Environment variables for ClinGen

src/mavedb/lib/target_genes.py

Lines changed: 136 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,154 @@
11
import logging
22
from typing import Optional
33

4-
from sqlalchemy import func, or_
4+
from sqlalchemy import and_, func, or_
55
from sqlalchemy.orm import Session
66

77
from mavedb.lib.logging.context import logging_context, save_to_logging_context
88
from mavedb.models.contributor import Contributor
99
from mavedb.models.score_set import ScoreSet
10+
from mavedb.models.target_accession import TargetAccession
1011
from mavedb.models.target_gene import TargetGene
12+
from mavedb.models.target_sequence import TargetSequence
13+
from mavedb.models.taxonomy import Taxonomy
1114
from mavedb.models.user import User
1215
from mavedb.view_models.search import TextSearch
1316

1417
logger = logging.getLogger(__name__)
1518

1619

20+
def find_or_create_target_gene_by_accession(
21+
db: Session,
22+
score_set_id: int,
23+
tg: dict,
24+
tg_accession: dict,
25+
) -> TargetGene:
26+
"""
27+
Find or create a target gene for a score set by accession. If the existing target gene or related accession record is modified,
28+
this function creates a new target gene so that its id can be used to determine if a score set has changed in a way
29+
that requires the create variants job to be re-run.
30+
31+
: param db: Database session
32+
: param score_set_id: ID of the score set to associate the target gene with
33+
: param tg: Dictionary with target gene details (name, category, etc.)
34+
: param tg_accession: Dictionary with target accession details (accession, assembly, gene, etc.)
35+
: return: The found or newly created TargetGene instance
36+
"""
37+
target_gene = None
38+
logger.info(
39+
msg=f"Searching for existing target gene by accession within score set {score_set_id}.",
40+
extra=logging_context(),
41+
)
42+
if tg_accession is not None and tg_accession.get("accession"):
43+
target_gene = (
44+
db.query(TargetGene)
45+
.filter(
46+
and_(
47+
TargetGene.target_accession.has(
48+
and_(
49+
TargetAccession.accession == tg_accession["accession"],
50+
TargetAccession.assembly == tg_accession["assembly"],
51+
TargetAccession.gene == tg_accession["gene"],
52+
TargetAccession.is_base_editor == tg_accession.get("is_base_editor", False),
53+
)
54+
),
55+
TargetGene.name == tg["name"],
56+
TargetGene.category == tg["category"],
57+
TargetGene.score_set_id == score_set_id,
58+
)
59+
)
60+
.first()
61+
)
62+
63+
if target_gene is None:
64+
target_accession = TargetAccession(**tg_accession)
65+
target_gene = TargetGene(
66+
**tg,
67+
score_set_id=score_set_id,
68+
target_accession=target_accession,
69+
)
70+
db.add(target_gene)
71+
db.commit()
72+
db.refresh(target_gene)
73+
logger.info(
74+
msg=f"Created new target gene '{target_gene.name}' with ID {target_gene.id}.",
75+
extra=logging_context(),
76+
)
77+
else:
78+
logger.info(
79+
msg=f"Found existing target gene '{target_gene.name}' with ID {target_gene.id}.",
80+
extra=logging_context(),
81+
)
82+
83+
return target_gene
84+
85+
86+
def find_or_create_target_gene_by_sequence(
87+
db: Session,
88+
score_set_id: int,
89+
tg: dict,
90+
tg_sequence: dict,
91+
) -> TargetGene:
92+
"""
93+
Find or create a target gene for a score set by sequence. If the existing target gene or related sequence record is modified,
94+
this function creates a new target gene so that its id can be used to determine if a score set has changed in a way
95+
that requires the create variants job to be re-run.
96+
97+
: param db: Database session
98+
: param score_set_id: ID of the score set to associate the target gene with
99+
: param tg: Dictionary with target gene details (name, category, etc.)
100+
: param tg_sequence: Dictionary with target sequence details (sequence, sequence_type, taxonomy, label, etc.)
101+
: return: The found or newly created TargetGene instance
102+
"""
103+
target_gene = None
104+
logger.info(
105+
msg=f"Searching for existing target gene by sequence within score set {score_set_id}.",
106+
extra=logging_context(),
107+
)
108+
if tg_sequence is not None and tg_sequence.get("sequence"):
109+
target_gene = (
110+
db.query(TargetGene)
111+
.filter(
112+
and_(
113+
TargetGene.target_sequence.has(
114+
and_(
115+
TargetSequence.sequence == tg_sequence["sequence"],
116+
TargetSequence.sequence_type == tg_sequence["sequence_type"],
117+
TargetSequence.taxonomy.has(Taxonomy.id == tg_sequence["taxonomy"].id),
118+
TargetSequence.label == tg_sequence["label"],
119+
)
120+
),
121+
TargetGene.name == tg["name"],
122+
TargetGene.category == tg["category"],
123+
TargetGene.score_set_id == score_set_id,
124+
)
125+
)
126+
.first()
127+
)
128+
129+
if target_gene is None:
130+
target_sequence = TargetSequence(**tg_sequence)
131+
target_gene = TargetGene(
132+
**tg,
133+
score_set_id=score_set_id,
134+
target_sequence=target_sequence,
135+
)
136+
db.add(target_gene)
137+
db.commit()
138+
db.refresh(target_gene)
139+
logger.info(
140+
msg=f"Created new target gene '{target_gene.name}' with ID {target_gene.id}.",
141+
extra=logging_context(),
142+
)
143+
else:
144+
logger.info(
145+
msg=f"Found existing target gene '{target_gene.name}' with ID {target_gene.id}.",
146+
extra=logging_context(),
147+
)
148+
149+
return target_gene
150+
151+
17152
def search_target_genes(
18153
db: Session,
19154
owner_or_contributor: Optional[User],

src/mavedb/lib/validation/dataframe/dataframe.py

Lines changed: 99 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,26 @@
1-
from typing import Optional, Tuple, TYPE_CHECKING
1+
from typing import TYPE_CHECKING, Any, Optional, Tuple
22

33
import numpy as np
44
import pandas as pd
55

66
from mavedb.lib.exceptions import MixedTargetError
77
from mavedb.lib.validation.constants.general import (
8+
guide_sequence_column,
89
hgvs_nt_column,
910
hgvs_pro_column,
1011
hgvs_splice_column,
11-
guide_sequence_column,
1212
required_score_column,
1313
)
14-
from mavedb.lib.validation.exceptions import ValidationError
15-
from mavedb.models.target_gene import TargetGene
1614
from mavedb.lib.validation.dataframe.column import validate_data_column
1715
from mavedb.lib.validation.dataframe.variant import (
18-
validate_hgvs_transgenic_column,
19-
validate_hgvs_genomic_column,
2016
validate_guide_sequence_column,
17+
validate_hgvs_genomic_column,
2118
validate_hgvs_prefix_combinations,
19+
validate_hgvs_transgenic_column,
2220
)
21+
from mavedb.lib.validation.exceptions import ValidationError
22+
from mavedb.models.target_gene import TargetGene
23+
from mavedb.view_models.score_set_dataset_columns import DatasetColumnMetadata
2324

2425
if TYPE_CHECKING:
2526
from cdot.hgvs.dataproviders import RESTDataProvider
@@ -28,12 +29,28 @@
2829
STANDARD_COLUMNS = (hgvs_nt_column, hgvs_splice_column, hgvs_pro_column, required_score_column, guide_sequence_column)
2930

3031

32+
def clean_col_name(col: str) -> str:
33+
col = col.strip()
34+
# Only remove quotes if the column name is fully quoted
35+
if (col.startswith('"') and col.endswith('"')) or (col.startswith("'") and col.endswith("'")):
36+
col = col[1:-1]
37+
38+
return col.strip()
39+
40+
3141
def validate_and_standardize_dataframe_pair(
3242
scores_df: pd.DataFrame,
3343
counts_df: Optional[pd.DataFrame],
44+
score_columns_metadata: Optional[dict[str, DatasetColumnMetadata]],
45+
count_columns_metadata: Optional[dict[str, DatasetColumnMetadata]],
3446
targets: list[TargetGene],
3547
hdp: Optional["RESTDataProvider"],
36-
) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
48+
) -> Tuple[
49+
pd.DataFrame,
50+
Optional[pd.DataFrame],
51+
Optional[dict[str, DatasetColumnMetadata]],
52+
Optional[dict[str, DatasetColumnMetadata]],
53+
]:
3754
"""
3855
Perform validation and standardization on a pair of score and count dataframes.
3956
@@ -43,15 +60,19 @@ def validate_and_standardize_dataframe_pair(
4360
The scores dataframe
4461
counts_df : Optional[pandas.DataFrame]
4562
The counts dataframe, can be None if not present
63+
score_columns_metadata: Optional[dict[str, DatasetColumnMetadata]]
64+
The scores column metadata, can be None if not present
65+
count_columns_metadata: Optional[dict[str, DatasetColumnMetadata]]
66+
The counts column metadata, can be None if not present
4667
targets : str
4768
The target genes on which to validate dataframes
4869
hdp : RESTDataProvider
4970
The biocommons.hgvs compatible data provider. Used to fetch sequences for hgvs validation.
5071
5172
Returns
5273
-------
53-
Tuple[pd.DataFrame, Optional[pd.DataFrame]]
54-
The standardized score and count dataframes, or score and None if no count dataframe was provided
74+
Tuple[pd.DataFrame, Optional[pd.DataFrame], Optional[dict[str, DatasetColumnMetadata]], Optional[dict[str, DatasetColumnMetadata]]]
75+
The standardized score and count dataframes, plus score column metadata and counts column metadata dictionaries. Counts dataframe and column metadata dictionaries can be None if not provided.
5576
5677
Raises
5778
------
@@ -65,11 +86,32 @@ def validate_and_standardize_dataframe_pair(
6586
standardized_counts_df = standardize_dataframe(counts_df) if counts_df is not None else None
6687

6788
validate_dataframe(standardized_scores_df, "scores", targets, hdp)
89+
90+
if score_columns_metadata is not None:
91+
standardized_score_columns_metadata = standardize_dict_keys(score_columns_metadata)
92+
validate_df_column_metadata_match(standardized_scores_df, standardized_score_columns_metadata)
93+
else:
94+
standardized_score_columns_metadata = None
95+
6896
if standardized_counts_df is not None:
6997
validate_dataframe(standardized_counts_df, "counts", targets, hdp)
7098
validate_variant_columns_match(standardized_scores_df, standardized_counts_df)
71-
72-
return standardized_scores_df, standardized_counts_df
99+
if count_columns_metadata is not None:
100+
standardized_count_columns_metadata = standardize_dict_keys(count_columns_metadata)
101+
validate_df_column_metadata_match(standardized_counts_df, standardized_count_columns_metadata)
102+
else:
103+
standardized_count_columns_metadata = None
104+
else:
105+
if count_columns_metadata is not None and len(count_columns_metadata.keys()) > 0:
106+
raise ValidationError("Counts column metadata provided without counts dataframe")
107+
standardized_count_columns_metadata = None
108+
109+
return (
110+
standardized_scores_df,
111+
standardized_counts_df,
112+
standardized_score_columns_metadata,
113+
standardized_count_columns_metadata,
114+
)
73115

74116

75117
def validate_dataframe(
@@ -163,6 +205,25 @@ def validate_dataframe(
163205
)
164206

165207

208+
def standardize_dict_keys(d: dict[str, Any]) -> dict[str, Any]:
209+
"""
210+
Standardize the keys of a dictionary by stripping leading and trailing whitespace
211+
and removing any quoted strings from the keys.
212+
213+
Parameters
214+
----------
215+
d : dict[str, DatasetColumnMetadata]
216+
The dictionary to standardize
217+
218+
Returns
219+
-------
220+
dict[str, DatasetColumnMetadata]
221+
The standardized dictionary
222+
"""
223+
224+
return {clean_col_name(k): v for k, v in d.items()}
225+
226+
166227
def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
167228
"""Standardize a dataframe by sorting the columns and changing the standard column names to lowercase.
168229
Also strips leading and trailing whitespace from column names and removes any quoted strings from column names.
@@ -186,15 +247,7 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
186247
The standardized dataframe
187248
"""
188249

189-
def clean_column(col: str) -> str:
190-
col = col.strip()
191-
# Only remove quotes if the column name is fully quoted
192-
if (col.startswith('"') and col.endswith('"')) or (col.startswith("'") and col.endswith("'")):
193-
col = col[1:-1]
194-
195-
return col.strip()
196-
197-
cleaned_columns = {c: clean_column(c) for c in df.columns}
250+
cleaned_columns = {c: clean_col_name(c) for c in df.columns}
198251
df.rename(columns=cleaned_columns, inplace=True)
199252

200253
column_mapper = {x: x.lower() for x in df.columns if x.lower() in STANDARD_COLUMNS}
@@ -368,6 +421,32 @@ def validate_variant_consistency(df: pd.DataFrame) -> None:
368421
pass
369422

370423

424+
def validate_df_column_metadata_match(df: pd.DataFrame, columnMetadata: dict[str, DatasetColumnMetadata]):
425+
"""
426+
Checks that metadata keys match the dataframe column names and exclude standard column names.
427+
428+
Parameters
429+
----------
430+
df : pandas.DataFrame
431+
Dataframe parsed from an uploaded scores file
432+
columnMetadata : dict[str, DatasetColumnMetadata]
433+
Metadata for the scores columns
434+
435+
Raises
436+
------
437+
ValidationError
438+
If any metadata keys do not match dataframe column names
439+
ValidationError
440+
If any metadata keys match standard columns
441+
442+
"""
443+
for key in columnMetadata.keys():
444+
if key.lower() in STANDARD_COLUMNS:
445+
raise ValidationError(f"standard column '{key}' cannot have metadata defined")
446+
elif key not in df.columns:
447+
raise ValidationError(f"column metadata key '{key}' does not match any dataframe column names")
448+
449+
371450
def validate_variant_columns_match(df1: pd.DataFrame, df2: pd.DataFrame):
372451
"""
373452
Checks if two dataframes have matching HGVS columns.

0 commit comments

Comments
 (0)