Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
61aa6ef
Add build path to docker-compose-dev.yaml for dcd-mapping and cdot-re…
davereinhart Sep 25, 2025
73b3a1a
Update redis environment variables in template file
davereinhart Sep 25, 2025
12d6c0e
Update value check on score_set and experiment updates to handle non-…
davereinhart Sep 29, 2025
9e69143
feat: add dataset_columns support in score set updates
davereinhart Sep 30, 2025
9d599d0
feat: process and validate dataset column metadata for scores and cou…
davereinhart Oct 3, 2025
f28c8e7
feat: add ScoreSetUpdateAllOptional with multipart form helper
davereinhart Oct 7, 2025
5cb60e0
refactor: move dataset column pydantic models to dedicated module
davereinhart Oct 7, 2025
bbcb3f2
refactor: replace dynamic camelization test with explicit model
davereinhart Oct 8, 2025
454cf86
feat: extend SavedDatasetColumns with recordType
davereinhart Oct 8, 2025
e3812b1
feat: add PATCH endpoint supporting variants + score/count metadata u…
davereinhart Oct 9, 2025
45c195d
feat: add target gene find_or_create helpers for sequence/accession
davereinhart Oct 11, 2025
de87fc4
refactor: unify variable names for score/count metadata & extend routes
davereinhart Oct 13, 2025
7fe4ed8
feat: enhance PATCH endpoint to fully process uploaded files
davereinhart Oct 14, 2025
cf89e6e
test: update and add unit tests aligned with new score set update flow
davereinhart Oct 17, 2025
d49fb27
test: add ScoreSetUpdateAllOptional model tests and router validation…
davereinhart Oct 17, 2025
4fa4995
feat: add worker job fixtures for score/count column metadata
davereinhart Oct 17, 2025
b57c11f
Apply ruff format and organize import on files in this branch
davereinhart Oct 27, 2025
8391f67
Cleanup
davereinhart Oct 27, 2025
5077533
Move all_fields_optional_model decorator to view models utils module …
davereinhart Oct 27, 2025
2950b2e
Add unit tests for all_fields_optional_model decorator
davereinhart Oct 27, 2025
cab9fe5
Updates to score set endpoints to receive score and count columns me…
davereinhart Oct 29, 2025
b3a5a88
Update unit tests to include score and count columns metadata fields …
davereinhart Oct 29, 2025
2f37d6b
Merge branch 'release-2025.5.0' into davereinhart/scoreset-column-met…
davereinhart Nov 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docker-compose-dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ services:
- redis

dcd-mapping:
build: ../dcd_mapping
image: dcd-mapping:dev
command: bash -c "uvicorn api.server_main:app --host 0.0.0.0 --port 8000 --reload"
depends_on:
Expand All @@ -61,6 +62,7 @@ services:
- mavedb-seqrepo-dev:/usr/local/share/seqrepo

cdot-rest:
build: ../cdot_rest
image: cdot-rest:dev
command: bash -c "gunicorn cdot_rest.wsgi:application --bind 0.0.0.0:8000"
env_file:
Expand Down
4 changes: 3 additions & 1 deletion settings/.env.template
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,10 @@ DCD_MAPPING_URL=http://dcd-mapping:8000
####################################################################################################

CDOT_URL=http://cdot-rest:8000
REDIS_HOST=localhost
REDIS_HOST=redis
REDIS_IP=redis
REDIS_PORT=6379
REDIS_SSL=false

####################################################################################################
# Environment variables for ClinGen
Expand Down
137 changes: 136 additions & 1 deletion src/mavedb/lib/target_genes.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,154 @@
import logging
from typing import Optional

from sqlalchemy import func, or_
from sqlalchemy import and_, func, or_
from sqlalchemy.orm import Session

from mavedb.lib.logging.context import logging_context, save_to_logging_context
from mavedb.models.contributor import Contributor
from mavedb.models.score_set import ScoreSet
from mavedb.models.target_accession import TargetAccession
from mavedb.models.target_gene import TargetGene
from mavedb.models.target_sequence import TargetSequence
from mavedb.models.taxonomy import Taxonomy
from mavedb.models.user import User
from mavedb.view_models.search import TextSearch

logger = logging.getLogger(__name__)


def find_or_create_target_gene_by_accession(
    db: Session,
    score_set_id: int,
    tg: dict,
    tg_accession: dict,
) -> TargetGene:
    """
    Find or create a target gene for a score set by accession.

    If the existing target gene or related accession record is modified, this function
    creates a new target gene so that its id can be used to determine if a score set
    has changed in a way that requires the create variants job to be re-run.

    :param db: Database session
    :param score_set_id: ID of the score set to associate the target gene with
    :param tg: Dictionary with target gene details (name, category, etc.)
    :param tg_accession: Dictionary with target accession details (accession, assembly, gene, etc.)
    :return: The found or newly created TargetGene instance
    :raises ValueError: If tg_accession is None; an accession record is required to build
        the TargetAccession relationship in the creation branch.
    """
    # Fail fast with a clear error: previously a None tg_accession skipped the search
    # branch and then crashed with an opaque TypeError at TargetAccession(**None).
    if tg_accession is None:
        raise ValueError("tg_accession is required to find or create a target gene by accession")

    target_gene = None
    logger.info(
        msg=f"Searching for existing target gene by accession within score set {score_set_id}.",
        extra=logging_context(),
    )
    if tg_accession.get("accession"):
        target_gene = (
            db.query(TargetGene)
            .filter(
                and_(
                    TargetGene.target_accession.has(
                        and_(
                            TargetAccession.accession == tg_accession["accession"],
                            TargetAccession.assembly == tg_accession["assembly"],
                            TargetAccession.gene == tg_accession["gene"],
                            # is_base_editor is optional in the payload; default matches the model default.
                            TargetAccession.is_base_editor == tg_accession.get("is_base_editor", False),
                        )
                    ),
                    TargetGene.name == tg["name"],
                    TargetGene.category == tg["category"],
                    TargetGene.score_set_id == score_set_id,
                )
            )
            .first()
        )

    if target_gene is None:
        # No exact match: create a fresh gene + accession so downstream code can use the
        # new id to detect that the variants job must be re-run.
        target_accession = TargetAccession(**tg_accession)
        target_gene = TargetGene(
            **tg,
            score_set_id=score_set_id,
            target_accession=target_accession,
        )
        db.add(target_gene)
        db.commit()
        db.refresh(target_gene)
        logger.info(
            msg=f"Created new target gene '{target_gene.name}' with ID {target_gene.id}.",
            extra=logging_context(),
        )
    else:
        logger.info(
            msg=f"Found existing target gene '{target_gene.name}' with ID {target_gene.id}.",
            extra=logging_context(),
        )

    return target_gene


def find_or_create_target_gene_by_sequence(
    db: Session,
    score_set_id: int,
    tg: dict,
    tg_sequence: dict,
) -> TargetGene:
    """
    Find or create a target gene for a score set by sequence.

    If the existing target gene or related sequence record is modified, this function
    creates a new target gene so that its id can be used to determine if a score set
    has changed in a way that requires the create variants job to be re-run.

    :param db: Database session
    :param score_set_id: ID of the score set to associate the target gene with
    :param tg: Dictionary with target gene details (name, category, etc.)
    :param tg_sequence: Dictionary with target sequence details (sequence, sequence_type, taxonomy, label, etc.)
    :return: The found or newly created TargetGene instance
    :raises ValueError: If tg_sequence is None; a sequence record is required to build
        the TargetSequence relationship in the creation branch.
    """
    # Fail fast with a clear error: previously a None tg_sequence skipped the search
    # branch and then crashed with an opaque TypeError at TargetSequence(**None).
    if tg_sequence is None:
        raise ValueError("tg_sequence is required to find or create a target gene by sequence")

    target_gene = None
    logger.info(
        msg=f"Searching for existing target gene by sequence within score set {score_set_id}.",
        extra=logging_context(),
    )
    if tg_sequence.get("sequence"):
        target_gene = (
            db.query(TargetGene)
            .filter(
                and_(
                    TargetGene.target_sequence.has(
                        and_(
                            TargetSequence.sequence == tg_sequence["sequence"],
                            TargetSequence.sequence_type == tg_sequence["sequence_type"],
                            # tg_sequence["taxonomy"] is a Taxonomy model instance; compare by id.
                            TargetSequence.taxonomy.has(Taxonomy.id == tg_sequence["taxonomy"].id),
                            TargetSequence.label == tg_sequence["label"],
                        )
                    ),
                    TargetGene.name == tg["name"],
                    TargetGene.category == tg["category"],
                    TargetGene.score_set_id == score_set_id,
                )
            )
            .first()
        )

    if target_gene is None:
        # No exact match: create a fresh gene + sequence so downstream code can use the
        # new id to detect that the variants job must be re-run.
        target_sequence = TargetSequence(**tg_sequence)
        target_gene = TargetGene(
            **tg,
            score_set_id=score_set_id,
            target_sequence=target_sequence,
        )
        db.add(target_gene)
        db.commit()
        db.refresh(target_gene)
        logger.info(
            msg=f"Created new target gene '{target_gene.name}' with ID {target_gene.id}.",
            extra=logging_context(),
        )
    else:
        logger.info(
            msg=f"Found existing target gene '{target_gene.name}' with ID {target_gene.id}.",
            extra=logging_context(),
        )

    return target_gene


def search_target_genes(
db: Session,
owner_or_contributor: Optional[User],
Expand Down
119 changes: 99 additions & 20 deletions src/mavedb/lib/validation/dataframe/dataframe.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,26 @@
from typing import Optional, Tuple, TYPE_CHECKING
from typing import TYPE_CHECKING, Any, Optional, Tuple

import numpy as np
import pandas as pd

from mavedb.lib.exceptions import MixedTargetError
from mavedb.lib.validation.constants.general import (
guide_sequence_column,
hgvs_nt_column,
hgvs_pro_column,
hgvs_splice_column,
guide_sequence_column,
required_score_column,
)
from mavedb.lib.validation.exceptions import ValidationError
from mavedb.models.target_gene import TargetGene
from mavedb.lib.validation.dataframe.column import validate_data_column
from mavedb.lib.validation.dataframe.variant import (
validate_hgvs_transgenic_column,
validate_hgvs_genomic_column,
validate_guide_sequence_column,
validate_hgvs_genomic_column,
validate_hgvs_prefix_combinations,
validate_hgvs_transgenic_column,
)
from mavedb.lib.validation.exceptions import ValidationError
from mavedb.models.target_gene import TargetGene
from mavedb.view_models.score_set_dataset_columns import DatasetColumnMetadata

if TYPE_CHECKING:
from cdot.hgvs.dataproviders import RESTDataProvider
Expand All @@ -28,12 +29,28 @@
STANDARD_COLUMNS = (hgvs_nt_column, hgvs_splice_column, hgvs_pro_column, required_score_column, guide_sequence_column)


def clean_col_name(col: str) -> str:
    """
    Normalize a raw column name from an uploaded file.

    Strips surrounding whitespace; if the trimmed name is wrapped in a matching
    pair of double or single quotes, the outer quote pair is removed and the
    result is stripped again. Partially- or mismatched-quoted names are left as-is.

    :param col: Raw column name
    :return: The cleaned column name
    """
    trimmed = col.strip()
    for quote in ('"', "'"):
        # Only unwrap when the name is fully enclosed by the same quote character.
        if trimmed.startswith(quote) and trimmed.endswith(quote):
            trimmed = trimmed[1:-1]
            break
    return trimmed.strip()


def validate_and_standardize_dataframe_pair(
scores_df: pd.DataFrame,
counts_df: Optional[pd.DataFrame],
score_columns_metadata: Optional[dict[str, DatasetColumnMetadata]],
count_columns_metadata: Optional[dict[str, DatasetColumnMetadata]],
targets: list[TargetGene],
hdp: Optional["RESTDataProvider"],
) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
) -> Tuple[
pd.DataFrame,
Optional[pd.DataFrame],
Optional[dict[str, DatasetColumnMetadata]],
Optional[dict[str, DatasetColumnMetadata]],
]:
"""
Perform validation and standardization on a pair of score and count dataframes.

Expand All @@ -43,15 +60,19 @@ def validate_and_standardize_dataframe_pair(
The scores dataframe
counts_df : Optional[pandas.DataFrame]
The counts dataframe, can be None if not present
score_columns_metadata: Optional[dict[str, DatasetColumnMetadata]]
The scores column metadata, can be None if not present
count_columns_metadata: Optional[dict[str, DatasetColumnMetadata]]
The counts column metadata, can be None if not present
targets : str
The target genes on which to validate dataframes
hdp : RESTDataProvider
The biocommons.hgvs compatible data provider. Used to fetch sequences for hgvs validation.

Returns
-------
Tuple[pd.DataFrame, Optional[pd.DataFrame]]
The standardized score and count dataframes, or score and None if no count dataframe was provided
Tuple[pd.DataFrame, Optional[pd.DataFrame], Optional[dict[str, DatasetColumnMetadata]], Optional[dict[str, DatasetColumnMetadata]]]
The standardized score and count dataframes, plus score column metadata and counts column metadata dictionaries. Counts dataframe and column metadata dictionaries can be None if not provided.

Raises
------
Expand All @@ -65,11 +86,32 @@ def validate_and_standardize_dataframe_pair(
standardized_counts_df = standardize_dataframe(counts_df) if counts_df is not None else None

validate_dataframe(standardized_scores_df, "scores", targets, hdp)

if score_columns_metadata is not None:
standardized_score_columns_metadata = standardize_dict_keys(score_columns_metadata)
validate_df_column_metadata_match(standardized_scores_df, standardized_score_columns_metadata)
else:
standardized_score_columns_metadata = None

if standardized_counts_df is not None:
validate_dataframe(standardized_counts_df, "counts", targets, hdp)
validate_variant_columns_match(standardized_scores_df, standardized_counts_df)

return standardized_scores_df, standardized_counts_df
if count_columns_metadata is not None:
standardized_count_columns_metadata = standardize_dict_keys(count_columns_metadata)
validate_df_column_metadata_match(standardized_counts_df, standardized_count_columns_metadata)
else:
standardized_count_columns_metadata = None
else:
if count_columns_metadata is not None and len(count_columns_metadata.keys()) > 0:
raise ValidationError("Counts column metadata provided without counts dataframe")
standardized_count_columns_metadata = None

return (
standardized_scores_df,
standardized_counts_df,
standardized_score_columns_metadata,
standardized_count_columns_metadata,
)


def validate_dataframe(
Expand Down Expand Up @@ -163,6 +205,25 @@ def validate_dataframe(
)


def standardize_dict_keys(d: dict[str, Any]) -> dict[str, Any]:
    """
    Standardize the keys of a dictionary by stripping leading and trailing
    whitespace and removing a fully matching outer pair of quotes from each key.

    Values are carried over unchanged; keys are cleaned via ``clean_col_name``.

    Parameters
    ----------
    d : dict[str, Any]
        The dictionary to standardize

    Returns
    -------
    dict[str, Any]
        A new dictionary with cleaned keys and the original values
    """
    standardized: dict[str, Any] = {}
    for raw_key, value in d.items():
        standardized[clean_col_name(raw_key)] = value
    return standardized


def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
"""Standardize a dataframe by sorting the columns and changing the standard column names to lowercase.
Also strips leading and trailing whitespace from column names and removes any quoted strings from column names.
Expand All @@ -186,15 +247,7 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
The standardized dataframe
"""

def clean_column(col: str) -> str:
col = col.strip()
# Only remove quotes if the column name is fully quoted
if (col.startswith('"') and col.endswith('"')) or (col.startswith("'") and col.endswith("'")):
col = col[1:-1]

return col.strip()

cleaned_columns = {c: clean_column(c) for c in df.columns}
cleaned_columns = {c: clean_col_name(c) for c in df.columns}
df.rename(columns=cleaned_columns, inplace=True)

column_mapper = {x: x.lower() for x in df.columns if x.lower() in STANDARD_COLUMNS}
Expand Down Expand Up @@ -368,6 +421,32 @@ def validate_variant_consistency(df: pd.DataFrame) -> None:
pass


def validate_df_column_metadata_match(df: pd.DataFrame, columnMetadata: dict[str, DatasetColumnMetadata]):
    """
    Checks that metadata keys match the dataframe column names and exclude standard column names.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe parsed from an uploaded scores or counts file
    columnMetadata : dict[str, DatasetColumnMetadata]
        Metadata for the dataframe's columns

    Raises
    ------
    ValidationError
        If any metadata keys match standard columns
    ValidationError
        If any metadata keys do not match dataframe column names
    """
    dataframe_columns = set(df.columns)
    for metadata_key in columnMetadata:
        # Standard columns (hgvs_*, score, guide_sequence) carry fixed semantics
        # and may not carry user-supplied metadata.
        if metadata_key.lower() in STANDARD_COLUMNS:
            raise ValidationError(f"standard column '{metadata_key}' cannot have metadata defined")
        if metadata_key not in dataframe_columns:
            raise ValidationError(f"column metadata key '{metadata_key}' does not match any dataframe column names")


def validate_variant_columns_match(df1: pd.DataFrame, df2: pd.DataFrame):
"""
Checks if two dataframes have matching HGVS columns.
Expand Down
Loading