1- from typing import Optional , Tuple , TYPE_CHECKING
1+ from typing import TYPE_CHECKING , Any , Optional , Tuple
22
33import numpy as np
44import pandas as pd
55
66from mavedb .lib .exceptions import MixedTargetError
77from mavedb .lib .validation .constants .general import (
8+ guide_sequence_column ,
89 hgvs_nt_column ,
910 hgvs_pro_column ,
1011 hgvs_splice_column ,
11- guide_sequence_column ,
1212 required_score_column ,
1313)
14- from mavedb .lib .validation .exceptions import ValidationError
15- from mavedb .models .target_gene import TargetGene
1614from mavedb .lib .validation .dataframe .column import validate_data_column
1715from mavedb .lib .validation .dataframe .variant import (
18- validate_hgvs_transgenic_column ,
19- validate_hgvs_genomic_column ,
2016 validate_guide_sequence_column ,
17+ validate_hgvs_genomic_column ,
2118 validate_hgvs_prefix_combinations ,
19+ validate_hgvs_transgenic_column ,
2220)
21+ from mavedb .lib .validation .exceptions import ValidationError
22+ from mavedb .models .target_gene import TargetGene
23+ from mavedb .view_models .score_set_dataset_columns import DatasetColumnMetadata
2324
2425if TYPE_CHECKING :
2526 from cdot .hgvs .dataproviders import RESTDataProvider
2829STANDARD_COLUMNS = (hgvs_nt_column , hgvs_splice_column , hgvs_pro_column , required_score_column , guide_sequence_column )
2930
3031
def clean_col_name(col: str) -> str:
    """
    Normalize a column name by trimming whitespace and removing one pair of enclosing quotes.

    Parameters
    ----------
    col : str
        The raw column name.

    Returns
    -------
    str
        The column name with surrounding whitespace removed. If the trimmed name is wrapped
        in a matching pair of double or single quotes, that one pair is removed and the
        remainder is trimmed again.
    """
    col = col.strip()
    # Only remove quotes if the column name is fully quoted. The length check keeps a lone
    # quote character (which both starts and ends with a quote) from being treated as an
    # empty quoted name and collapsing to "".
    if len(col) >= 2 and col[0] == col[-1] and col[0] in ('"', "'"):
        col = col[1:-1]

    return col.strip()
39+
40+
3141def validate_and_standardize_dataframe_pair (
3242 scores_df : pd .DataFrame ,
3343 counts_df : Optional [pd .DataFrame ],
44+ score_columns_metadata : Optional [dict [str , DatasetColumnMetadata ]],
45+ count_columns_metadata : Optional [dict [str , DatasetColumnMetadata ]],
3446 targets : list [TargetGene ],
3547 hdp : Optional ["RESTDataProvider" ],
36- ) -> Tuple [pd .DataFrame , Optional [pd .DataFrame ]]:
48+ ) -> Tuple [
49+ pd .DataFrame ,
50+ Optional [pd .DataFrame ],
51+ Optional [dict [str , DatasetColumnMetadata ]],
52+ Optional [dict [str , DatasetColumnMetadata ]],
53+ ]:
3754 """
3855 Perform validation and standardization on a pair of score and count dataframes.
3956
@@ -43,15 +60,19 @@ def validate_and_standardize_dataframe_pair(
4360 The scores dataframe
4461 counts_df : Optional[pandas.DataFrame]
4562 The counts dataframe, can be None if not present
63+ score_columns_metadata: Optional[dict[str, DatasetColumnMetadata]]
64+ The scores column metadata, can be None if not present
65+ count_columns_metadata: Optional[dict[str, DatasetColumnMetadata]]
66+ The counts column metadata, can be None if not present
4667 targets : list[TargetGene]
4768 The target genes on which to validate dataframes
4869 hdp : RESTDataProvider
4970 The biocommons.hgvs compatible data provider. Used to fetch sequences for hgvs validation.
5071
5172 Returns
5273 -------
53- Tuple[pd.DataFrame, Optional[pd.DataFrame]]
54- The standardized score and count dataframes, or score and None if no count dataframe was provided
74+ Tuple[pd.DataFrame, Optional[pd.DataFrame], Optional[dict[str, DatasetColumnMetadata]], Optional[dict[str, DatasetColumnMetadata]] ]
75+ The standardized score and count dataframes, plus score column metadata and counts column metadata dictionaries. Counts dataframe and column metadata dictionaries can be None if not provided.
5576
5677 Raises
5778 ------
@@ -65,11 +86,32 @@ def validate_and_standardize_dataframe_pair(
6586 standardized_counts_df = standardize_dataframe (counts_df ) if counts_df is not None else None
6687
6788 validate_dataframe (standardized_scores_df , "scores" , targets , hdp )
89+
90+ if score_columns_metadata is not None :
91+ standardized_score_columns_metadata = standardize_dict_keys (score_columns_metadata )
92+ validate_df_column_metadata_match (standardized_scores_df , standardized_score_columns_metadata )
93+ else :
94+ standardized_score_columns_metadata = None
95+
6896 if standardized_counts_df is not None :
6997 validate_dataframe (standardized_counts_df , "counts" , targets , hdp )
7098 validate_variant_columns_match (standardized_scores_df , standardized_counts_df )
71-
72- return standardized_scores_df , standardized_counts_df
99+ if count_columns_metadata is not None :
100+ standardized_count_columns_metadata = standardize_dict_keys (count_columns_metadata )
101+ validate_df_column_metadata_match (standardized_counts_df , standardized_count_columns_metadata )
102+ else :
103+ standardized_count_columns_metadata = None
104+ else :
105+ if count_columns_metadata is not None and len (count_columns_metadata .keys ()) > 0 :
106+ raise ValidationError ("Counts column metadata provided without counts dataframe" )
107+ standardized_count_columns_metadata = None
108+
109+ return (
110+ standardized_scores_df ,
111+ standardized_counts_df ,
112+ standardized_score_columns_metadata ,
113+ standardized_count_columns_metadata ,
114+ )
73115
74116
75117def validate_dataframe (
@@ -163,6 +205,25 @@ def validate_dataframe(
163205 )
164206
165207
def standardize_dict_keys(d: dict[str, Any]) -> dict[str, Any]:
    """
    Build a new dictionary whose keys have been cleaned with ``clean_col_name``.

    Each key has leading and trailing whitespace stripped and, when the key is fully
    quoted, its enclosing quotes removed. Values are carried over unchanged and the
    input dictionary is not modified.

    Parameters
    ----------
    d : dict[str, Any]
        The dictionary to standardize

    Returns
    -------
    dict[str, Any]
        A new dictionary with standardized keys. If several keys clean to the same
        name, the value of the last such key is kept.
    """

    standardized: dict[str, Any] = {}
    for raw_key, value in d.items():
        standardized[clean_col_name(raw_key)] = value

    return standardized
225+
226+
166227def standardize_dataframe (df : pd .DataFrame ) -> pd .DataFrame :
167228 """Standardize a dataframe by sorting the columns and changing the standard column names to lowercase.
168229 Also strips leading and trailing whitespace from column names and removes any quoted strings from column names.
@@ -186,15 +247,7 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
186247 The standardized dataframe
187248 """
188249
189- def clean_column (col : str ) -> str :
190- col = col .strip ()
191- # Only remove quotes if the column name is fully quoted
192- if (col .startswith ('"' ) and col .endswith ('"' )) or (col .startswith ("'" ) and col .endswith ("'" )):
193- col = col [1 :- 1 ]
194-
195- return col .strip ()
196-
197- cleaned_columns = {c : clean_column (c ) for c in df .columns }
250+ cleaned_columns = {c : clean_col_name (c ) for c in df .columns }
198251 df .rename (columns = cleaned_columns , inplace = True )
199252
200253 column_mapper = {x : x .lower () for x in df .columns if x .lower () in STANDARD_COLUMNS }
@@ -368,6 +421,32 @@ def validate_variant_consistency(df: pd.DataFrame) -> None:
368421 pass
369422
370423
def validate_df_column_metadata_match(df: pd.DataFrame, columnMetadata: dict[str, DatasetColumnMetadata]):
    """
    Check that every metadata key names a non-standard column of the dataframe.

    Keys are checked in iteration order and the first offending key raises.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe whose column names the metadata keys must match
    columnMetadata : dict[str, DatasetColumnMetadata]
        Column metadata keyed by column name

    Raises
    ------
    ValidationError
        If a metadata key, compared case-insensitively, is one of the standard columns
    ValidationError
        If a metadata key is not one of the dataframe's column names

    """
    dataframe_columns = df.columns

    for column_name in columnMetadata:
        # Standard columns are rejected first, whether or not they exist in the dataframe.
        if column_name.lower() in STANDARD_COLUMNS:
            raise ValidationError(f"standard column '{column_name}' cannot have metadata defined")

        if column_name not in dataframe_columns:
            raise ValidationError(f"column metadata key '{column_name}' does not match any dataframe column names")
448+
449+
371450def validate_variant_columns_match (df1 : pd .DataFrame , df2 : pd .DataFrame ):
372451 """
373452 Checks if two dataframes have matching HGVS columns.
0 commit comments