Skip to content

Commit 3fda888

Browse files
committed
feat: add validation and standardization for calibration classes dataframe
1 parent 407377b commit 3fda888

File tree

3 files changed

+936
-0
lines changed

3 files changed

+936
-0
lines changed

src/mavedb/lib/validation/constants/general.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@
4444
variant_count_data = "count_data"
4545
required_score_column = "score"
4646

47+
# Name of the column that holds variant URNs in a calibration classes file.
calibration_variant_column_name = "variant_urn"
48+
# Name of the column that holds the class label assigned to each variant in a calibration classes file.
calibration_class_column_name = "class_name"
49+
4750
valid_dataset_columns = [score_columns, count_columns]
4851
valid_variant_columns = [variant_score_data, variant_count_data]
4952

Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
import pandas as pd
2+
from sqlalchemy import select
3+
from sqlalchemy.orm import Session
4+
5+
from mavedb.lib.validation.constants.general import (
6+
calibration_class_column_name,
7+
calibration_variant_column_name,
8+
)
9+
from mavedb.lib.validation.dataframe.column import validate_data_column, validate_variant_column
10+
from mavedb.lib.validation.dataframe.dataframe import standardize_dataframe, validate_no_null_rows
11+
from mavedb.lib.validation.exceptions import ValidationError
12+
from mavedb.models.score_set import ScoreSet
13+
from mavedb.models.variant import Variant
14+
from mavedb.view_models import score_calibration
15+
16+
# The complete set of columns a calibration classes dataframe may contain. Column-name
# validation requires the lower-cased dataframe columns to match this set exactly.
STANDARD_CALIBRATION_COLUMNS = (calibration_variant_column_name, calibration_class_column_name)
17+
18+
19+
def validate_and_standardize_calibration_classes_dataframe(
    db: Session,
    score_set: ScoreSet,
    calibration: score_calibration.ScoreCalibrationCreate | score_calibration.ScoreCalibrationModify,
    classes_df: pd.DataFrame,
) -> pd.DataFrame:
    """
    Validate and standardize a calibration classes dataframe for functional classification calibrations.

    The dataframe is first passed through ``standardize_dataframe`` with the standard
    calibration columns, then its column names and rows are checked, and finally each
    column is validated:

    - the variant column is validated as a variant (index) column and every URN in it
      must belong to ``score_set``;
    - the class column is validated as a non-numeric data column and its labels must
      agree with the functional classifications defined on ``calibration``.

    Args:
        db (Session): Database session used to look up the score set's variant URNs.
        score_set (ScoreSet): The score set associated with the calibration.
        calibration (ScoreCalibrationCreate | ScoreCalibrationModify): The calibration the
            classes file belongs to. Its ``class_based`` attribute must be truthy.
        classes_df (pd.DataFrame): The input dataframe containing calibration classes data.

    Returns:
        pd.DataFrame: The dataframe returned by ``standardize_dataframe``, after it has
        passed every validation step.

    Raises:
        ValueError: If ``calibration.class_based`` is falsy, or if the calibration has no
            functional classifications defined.
        ValidationError: If the column names are invalid, a variant URN does not exist in
            the score set, or the class labels do not match the calibration. The imported
            column/row validators presumably raise it as well - confirm against their
            implementations.
    """
    if not calibration.class_based:
        raise ValueError("Calibration classes file can only be provided for functional classification calibrations.")

    standardized_classes_df = standardize_dataframe(classes_df, STANDARD_CALIBRATION_COLUMNS)
    validate_calibration_df_column_names(standardized_classes_df)
    validate_no_null_rows(standardized_classes_df)

    # Map each lower-cased column name to the name actually present in the dataframe, so
    # branching is case-insensitive while column access uses the real label.
    column_mapping = {c.lower(): c for c in standardized_classes_df.columns}
    index_column = column_mapping[calibration_variant_column_name]

    for c, actual_name in column_mapping.items():
        # Index with the actual column label rather than the lower-cased key; the two only
        # coincide when the dataframe's column names are already lower-case.
        column = standardized_classes_df[actual_name]

        if c == calibration_variant_column_name:
            validate_variant_column(column, actual_name == index_column)
            validate_calibration_variant_urns(db, score_set, column)
        elif c == calibration_class_column_name:
            validate_data_column(column, force_numeric=False)
            validate_calibration_classes(calibration, column)

        # handle unexpected columns. These should have already been caught by
        # validate_calibration_df_column_names, but we include this for completeness.
        else:  # pragma: no cover
            raise ValidationError(f"unexpected column in calibration classes file: '{c}'")

    return standardized_classes_df
76+
77+
78+
def validate_calibration_df_column_names(df: pd.DataFrame) -> None:
    """
    Check that a calibration dataframe has exactly the expected column names.

    Checks run in this order and the first failure is reported:

    1. every column name is a ``str``;
    2. no column name is empty or made up only of whitespace;
    3. no two column names are equal when compared case-insensitively;
    4. the variant column is present, then the class column is present;
    5. no column other than those in ``STANDARD_CALIBRATION_COLUMNS`` is present.

    Checks 3-5 compare lower-cased column names.

    Args:
        df (pd.DataFrame): The dataframe whose column names are validated.

    Raises:
        ValidationError: If any of the checks above fails.

    Returns:
        None: Nothing is returned when the column names are valid.
    """
    raw_names = list(df.columns)

    for name in raw_names:
        if type(name) is not str:
            raise ValidationError("column names must be strings")

    if any(len(name) == 0 or name.isspace() for name in raw_names):
        raise ValidationError("column names cannot be empty or whitespace")

    lowered_names = [name.lower() for name in raw_names]
    if len(set(lowered_names)) != len(raw_names):
        raise ValidationError("duplicate column names are not allowed (case-insensitive)")

    # The variant column is checked before the class column, so a file missing both
    # reports the variant column.
    for required in (calibration_variant_column_name, calibration_class_column_name):
        if required not in lowered_names:
            raise ValidationError(f"missing required column: '{required}'")

    expected = set(STANDARD_CALIBRATION_COLUMNS)
    present = set(lowered_names)
    if expected != present:
        extras = ", ".join(sorted(present - expected))
        raise ValidationError(f"unexpected column(s) in calibration classes file: {extras}")
124+
125+
126+
def validate_calibration_variant_urns(db: Session, score_set: ScoreSet, variant_urns: pd.Series) -> None:
    """
    Ensure every supplied variant URN belongs to the given score set.

    The URNs are looked up with a single query restricted to ``score_set.id``; any
    supplied URN that the query does not return is reported as missing.

    Args:
        db (Session): Database session used to run the lookup query.
        score_set (ScoreSet): The score set whose variants are searched.
        variant_urns (pd.Series): The variant URNs to check.

    Raises:
        ValidationError: If one or more URNs are not found in the score set. The message
            lists the missing URNs in sorted order.

    Returns:
        None: Nothing is returned when every URN is found.
    """
    requested_urns = variant_urns.tolist()

    lookup = select(Variant.urn).where(
        Variant.score_set_id == score_set.id,
        Variant.urn.in_(requested_urns),
    )
    found_urns = set(db.scalars(lookup).all())

    absent_urns = set(requested_urns) - found_urns
    if not absent_urns:
        return

    listing = ", ".join(sorted(absent_urns))
    raise ValidationError(f"The following variant URNs do not exist in the score set: {listing}")
152+
153+
154+
def validate_calibration_classes(
    calibration: score_calibration.ScoreCalibrationCreate | score_calibration.ScoreCalibrationModify, classes: pd.Series
) -> None:
    """
    Validate that the functional classifications in a calibration match the provided classes.

    This function ensures that:
    1. The calibration has functional classifications defined
    2. All classes in the provided series are defined in the calibration
    3. All classes defined in the calibration are present in the provided series

    Labels are compared by equality as-is; no type coercion is applied, so an integer
    label in the series does not match a string label on the calibration.

    Args:
        calibration: A ScoreCalibrationCreate or ScoreCalibrationModify object containing
            functional classifications to validate against. Each classification's
            ``class_`` attribute is used as its label.
        classes: A pandas Series containing class labels to validate.

    Raises:
        ValueError: If the calibration does not have functional classifications defined.
        ValidationError: If there are classes in the series that are not defined in the
            calibration, or if there are classes defined in the calibration
            that are missing from the series.
    """
    if not calibration.functional_classifications:
        raise ValueError("Calibration must have functional classifications defined for class validation.")

    defined_classes = {c.class_ for c in calibration.functional_classifications}
    provided_classes = set(classes.tolist())

    undefined_classes = provided_classes - defined_classes
    if undefined_classes:
        # Labels read from a file are not guaranteed to be strings (numeric-looking labels
        # are parsed as numbers). Stringify before sorting and joining so the user gets the
        # intended ValidationError instead of a TypeError from str.join or sorted.
        undefined_labels = ", ".join(sorted(str(label) for label in undefined_classes))
        raise ValidationError(f"The following classes are not defined in the calibration: {undefined_labels}")

    unprovided_classes = defined_classes - provided_classes
    if unprovided_classes:
        raise ValidationError("Some defined classes in the calibration are missing from the classes file.")

0 commit comments

Comments
 (0)