Skip to content

Commit 9dafe45

Browse files
committed
feat: update standardize_dataframe to accept custom standard columns and adjust related tests
1 parent ef3eb8c commit 9dafe45

File tree

2 files changed

+17
-11
lines changed

2 files changed

+17
-11
lines changed

src/mavedb/lib/validation/dataframe/dataframe.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -82,8 +82,8 @@ def validate_and_standardize_dataframe_pair(
8282
if not targets:
8383
raise ValueError("Can't validate provided file with no targets.")
8484

85-
standardized_scores_df = standardize_dataframe(scores_df)
86-
standardized_counts_df = standardize_dataframe(counts_df) if counts_df is not None else None
85+
standardized_scores_df = standardize_dataframe(scores_df, STANDARD_COLUMNS)
86+
standardized_counts_df = standardize_dataframe(counts_df, STANDARD_COLUMNS) if counts_df is not None else None
8787

8888
validate_dataframe(standardized_scores_df, "scores", targets, hdp)
8989

@@ -224,7 +224,7 @@ def standardize_dict_keys(d: dict[str, Any]) -> dict[str, Any]:
224224
return {clean_col_name(k): v for k, v in d.items()}
225225

226226

227-
def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
227+
def standardize_dataframe(df: pd.DataFrame, standard_columns: tuple[str, ...]) -> pd.DataFrame:
228228
"""Standardize a dataframe by sorting the columns and changing the standard column names to lowercase.
229229
Also strips leading and trailing whitespace from column names and removes any quotes enclosing column names.
230230
@@ -250,7 +250,7 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
250250
cleaned_columns = {c: clean_col_name(c) for c in df.columns}
251251
df.rename(columns=cleaned_columns, inplace=True)
252252

253-
column_mapper = {x: x.lower() for x in df.columns if x.lower() in STANDARD_COLUMNS}
253+
column_mapper = {x: x.lower() for x in df.columns if x.lower() in standard_columns}
254254
df.rename(columns=column_mapper, inplace=True)
255255

256256
return sort_dataframe_columns(df)

tests/validation/dataframe/test_dataframe.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
required_score_column,
1414
)
1515
from mavedb.lib.validation.dataframe.dataframe import (
16+
STANDARD_COLUMNS,
1617
choose_dataframe_index_column,
1718
sort_dataframe_columns,
1819
standardize_dataframe,
@@ -93,32 +94,36 @@ def test_sort_dataframe_preserves_extras_order(self):
9394

9495
class TestStandardizeDataframe(DfTestCase):
9596
def test_preserve_standardized(self):
96-
standardized_df = standardize_dataframe(self.dataframe)
97+
standardized_df = standardize_dataframe(self.dataframe, STANDARD_COLUMNS)
9798
pd.testing.assert_frame_equal(self.dataframe, standardized_df)
9899

99100
def test_standardize_changes_case_variants(self):
100-
standardized_df = standardize_dataframe(self.dataframe.rename(columns={hgvs_nt_column: hgvs_nt_column.upper()}))
101+
standardized_df = standardize_dataframe(
102+
self.dataframe.rename(columns={hgvs_nt_column: hgvs_nt_column.upper()}), STANDARD_COLUMNS
103+
)
101104
pd.testing.assert_frame_equal(self.dataframe, standardized_df)
102105

103106
def test_standardize_changes_case_scores(self):
104107
standardized_df = standardize_dataframe(
105-
self.dataframe.rename(columns={required_score_column: required_score_column.title()})
108+
self.dataframe.rename(columns={required_score_column: required_score_column.title()}), STANDARD_COLUMNS
106109
)
107110
pd.testing.assert_frame_equal(self.dataframe, standardized_df)
108111

109112
def test_standardize_preserves_extras_case(self):
110-
standardized_df = standardize_dataframe(self.dataframe.rename(columns={"extra": "extra".upper()}))
113+
standardized_df = standardize_dataframe(
114+
self.dataframe.rename(columns={"extra": "extra".upper()}), STANDARD_COLUMNS
115+
)
111116
pd.testing.assert_frame_equal(self.dataframe.rename(columns={"extra": "extra".upper()}), standardized_df)
112117

113118
def test_standardize_removes_quotes(self):
114119
standardized_df = standardize_dataframe(
115-
self.dataframe.rename(columns={"extra": "'extra'", "extra2": '"extra2"'})
120+
self.dataframe.rename(columns={"extra": "'extra'", "extra2": '"extra2"'}), STANDARD_COLUMNS
116121
)
117122
pd.testing.assert_frame_equal(self.dataframe, standardized_df)
118123

119124
def test_standardize_removes_whitespace(self):
120125
standardized_df = standardize_dataframe(
121-
self.dataframe.rename(columns={"extra": " extra ", "extra2": " extra2"})
126+
self.dataframe.rename(columns={"extra": " extra ", "extra2": " extra2"}), STANDARD_COLUMNS
122127
)
123128
pd.testing.assert_frame_equal(self.dataframe, standardized_df)
124129

@@ -135,7 +140,8 @@ def test_standardize_sorts_columns(self):
135140
"count1",
136141
"extra",
137142
],
138-
]
143+
],
144+
STANDARD_COLUMNS,
139145
)
140146
pd.testing.assert_frame_equal(
141147
self.dataframe[

0 commit comments

Comments
 (0)