diff --git a/mavecore/original_validation/__init__.py b/mavecore/original_validation/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/mavecore/original_validation/constants.py b/mavecore/original_validation/constants.py deleted file mode 100644 index 6630323..0000000 --- a/mavecore/original_validation/constants.py +++ /dev/null @@ -1,90 +0,0 @@ -import re - -""" -Null Constant definitions -""" -NA_value = "NA" -null_values_list = ( - "nan", - "na", - "none", - "", - "undefined", - "n/a", - "null", - "nil", - NA_value, -) - -null_values_re = re.compile( - r"^\s+$|none|nan|na|undefined|n/a|null|nil|{}".format(NA_value), flags=re.IGNORECASE -) - -readable_null_values = [ - "'{}'".format(v) for v in set([v.lower() for v in null_values_list]) if v.strip() -] + ["whitespace"] - -""" -Sequence constants -""" -AA_LETTERS = "ABCDEFGHIKLMNPQRSTVWXYZ" -DNA_LETTERS = "ATCG" - -DNA_SEQ_PATTERN = rf"[{DNA_LETTERS}]+" -AA_SEQ_PATTERN = rf"[{AA_LETTERS}]+" - - -""" -Constant definitions for application `experiment`. 
-""" -from mavecore.validation.urn_validators import ( - MAVEDB_EXPERIMENTSET_URN_PATTERN, - MAVEDB_EXPERIMENT_URN_PATTERN, - MAVEDB_SCORESET_URN_PATTERN, - MAVEDB_TMP_URN_PATTERN, -) - -hgvs_nt_column = "hgvs_nt" -hgvs_splice_column = "hgvs_splice" -hgvs_pro_column = "hgvs_pro" -hgvs_columns = sorted([hgvs_nt_column, hgvs_pro_column, hgvs_splice_column]) -meta_data = "meta_data" -score_columns = "score_columns" -count_columns = "count_columns" -variant_score_data = "score_data" -variant_count_data = "count_data" -required_score_column = "score" - -experimentset_url_pattern = "|".join( - [MAVEDB_EXPERIMENTSET_URN_PATTERN[1:-1], MAVEDB_TMP_URN_PATTERN[1:-1]] -) -experiment_url_pattern = "|".join( - [MAVEDB_EXPERIMENT_URN_PATTERN[1:-1], MAVEDB_TMP_URN_PATTERN[1:-1]] -) -scoreset_url_pattern = "|".join( - [MAVEDB_SCORESET_URN_PATTERN[1:-1], MAVEDB_TMP_URN_PATTERN[1:-1]] -) - -any_url_pattern = "|".join( - [experimentset_url_pattern, experiment_url_pattern, scoreset_url_pattern] -) - - -valid_dataset_columns = [score_columns, count_columns] -valid_variant_columns = [variant_score_data, variant_count_data] - -variant_to_scoreset_column = { - variant_score_data: score_columns, - variant_count_data: count_columns, -} -scoreset_to_variant_column = {v: k for k, v in variant_to_scoreset_column.items()} - -# Celery dataset status -processing = "processing" -failed = "failed" -success = "success" - -# User roles -administrator = "administrator" -editor = "editor" -viewer = "viewer" diff --git a/mavecore/original_validation/dataset_validators.py b/mavecore/original_validation/dataset_validators.py deleted file mode 100644 index fd7a20e..0000000 --- a/mavecore/original_validation/dataset_validators.py +++ /dev/null @@ -1,377 +0,0 @@ -import io -import csv -import re - -from numpy.testing import assert_array_equal - -from mavecore.validation import constants - - -def is_null(value): - """ - Returns True if a stripped/lowercase value in in `nan_col_values`. 
- - Parameters - __________ - value : str - The value to be checked as null or not. - - Returns - _______ - bool - True value is NoneType or if value matches the stated regex patterns in constants.null_values_re. - """ - value = str(value).strip().lower() - return constants.null_values_re.fullmatch(value) or not value - - -class WordLimitValidator: - """ - This class - - Attributes - __________ - message : str - Message template to describe how many words a field is limited to. - code : str - - counter : str - - """ - - message = "This field is limited to {} words." - code = "invalid" - counter = re.compile(r"\w+\b", flags=re.IGNORECASE) - - def __init__(self, word_limit, message=None, code=None): - # TODO - # check the code parameter type - """ - This constructor sets the values of the WordLimitValidator class attributes - message, code, and counter. - - Parameters - __________ - word_limit : int - The word limit assigned to the word limit attribute. - message : str - (default = None) The message assigned to the message attribute. - code : - (default = None) The code assigned to the code attribute. - """ - if message is not None: - self.message = message - if code is not None: - self.code = code - self.word_limit = int(word_limit) - - def __call__(self, value): - """ - Parameters - __________ - value : - - Returns - _______ - - Raises - ______ - ValueError - If - """ - if not value: - return - if len(self.counter.findall(value)) > self.word_limit: - raise ValueError(self.message.format(self.word_limit)) - - -def read_header_from_io(file, label=None, msg=None): - # TODO - # confirm types for parameters - """ - This takes a file and reads the header from that file. - - Parameters - __________ - file : - label : str - (default = None) - msg : str - (default = None) The message that is printed in the event of an error is raised. - - Returns - _______ - str - The header that was read from io. - - Raises - ______ - ValueError - If a header could not be parsed from file. 
Columns must be coma delimited. Column names - with commas must be escaped by enclosing them in double quotes. - """ - if label is None: - label = "uploaded" - - try: - header_line = file.readline() - if isinstance(header_line, bytes): - header_line = header_line.decode() - file.seek(0) - f = io.StringIO(header_line.strip()) - return [h.strip() for h in csv.DictReader(f, delimiter=",").fieldnames] - except Exception: - if not msg: - msg = ( - "A header could not be parsed from your {} file. Make sure" - "Columns are comma delimited. Column names with commas must be" - "escaped by enclosing them in double quotes.".format(label) - ) - raise ValueError(msg) - - -def validate_has_hgvs_in_header(header, label=None, msg=None): - """ - Parameters - __________ - header : - label : - default = None - msg : - default = None - - Raises - ______ - ValueError - If - """ - if label is None: - label = "Uploaded" - params = {} - if msg is None: - msg = ( - "Your %(label)s file must define either a nucleotide hgvs column " - "'%(col_nt)s' or a protein hgvs column '%(col_p)s'. " - "Columns are case-sensitive and must be comma delimited." - ) - params = { - "label": label, - "col_nt": constants.hgvs_nt_column, - "col_p": constants.hgvs_pro_column, - } - if not set(header) & set(constants.hgvs_columns): - raise ValueError(msg) - - -def validate_at_least_one_additional_column(header, label=None, msg=None): - # TODO - # verify parameter types - """ - This function checks the passed header to see if there exists additional columns besides the three - specified by constants.hgvs_nt_column, constants.hgvs_splice_column, and constants.hgvs_pro_column. - - Parameters - __________ - header : - label : - default = None - msg : - default = None - - Raises - ______ - ValueError - If there are not additional columns in the header argument. 
- """ - if label is None: - label = "Uploaded" - params = {} - if not any(v not in constants.hgvs_columns for v in header): - if msg is None: - msg = ( - "Your %(label)s file must define at " - "least one additional column different " - "from '{}', '{}' and '{}'.".format( - constants.hgvs_nt_column, - constants.hgvs_splice_column, - constants.hgvs_pro_column, - ) - ) - params = {"label": label} - raise ValueError(msg) - - -def validate_header_contains_no_null_columns(header, label=None, msg=None): - """ - This function checks that the header parameter does not contain any null columns that - are not in the case-insensitive null values listed in constants.readable_null_values. - - Parameters - __________ - header : - label : - (default = None) - msg : - (default = None) - - Raises - ______ - ValueError - If the file header contains blank/empty/whitespace. Only columns or the - case-insensitive null values listed in constants.readable_null_values - are permitted. - """ - if label is None: - label = "File" - any_null = any([is_null(v) for v in header]) - if any_null: - if msg is None: - msg = ( - "%(label)s file header cannot contain blank/empty/whitespace " - "only columns or the following case-insensitive null " - "values: {}.".format( - label, ", ".join(constants.readable_null_values_list) - ) - ) - raise ValueError(msg) - - -def validate_datasets_define_same_variants(scores, counts): - """ - Checks if two `pd.DataFrame` objects parsed from uploaded files - define the same variants. - - Parameters - ---------- - scores : `pd.DataFrame` - Scores dataframe parsed from an uploaded scores file. - counts : `pd.DataFrame` - Scores dataframe parsed from an uploaded counts file. - - Raises - ______ - ValueError - If score and counts files do not define the same variants. 
- """ - try: - assert_array_equal( - scores[constants.hgvs_nt_column].sort_values().values, - counts[constants.hgvs_nt_column].sort_values().values, - ) - assert_array_equal( - scores[constants.hgvs_splice_column].sort_values().values, - counts[constants.hgvs_splice_column].sort_values().values, - ) - assert_array_equal( - scores[constants.hgvs_pro_column].sort_values().values, - counts[constants.hgvs_pro_column].sort_values().values, - ) - except AssertionError: - raise ValueError( - "Your score and counts files do not define the same variants. " - "Check that the hgvs columns in both files match." - ) - - -def validate_scoreset_score_data_input(file): - """ - Validator function for checking that the scores file input contains - at least the column 'hgvs' and 'score'. Returns the file to position 0 - after reading the header (first line). - - Parameters - ---------- - file : :class:`io.FileIO` - An open file handle in read mode. - - Raises - ______ - ValueError - If score data file is missing the required column constants.required_score_column - """ - file.seek(0) - header = read_header_from_io(file, label="Score") - validate_header_contains_no_null_columns(header, label="Score") - validate_has_hgvs_in_header(header, label="Score") - validate_at_least_one_additional_column(header, label="Score") - - if constants.required_score_column not in header: - raise ValueError( - "Score data file is missing the required column " - + constants.required_score_column - + "." - + "Columns are case-sensitive and must be comma delimited." - ) - - -def validate_scoreset_count_data_input(file): - """ - Validator function for checking that the counts file input contains - at least the column 'hgvs'. Returns the file to position 0 - after reading the header (first line). - - Parameters - ---------- - file : :class:`io.FileIO` - File parsed by a `django` form. 
- """ - file.seek(0) - header = read_header_from_io(file, label="Count") - validate_header_contains_no_null_columns(header, label="Count") - validate_has_hgvs_in_header(header, label="Count") - validate_at_least_one_additional_column(header, label="Count") - - -def validate_scoreset_json(dict_): - """ - Checks a given dictionary to ensure that it is suitable to be used - as the `dataset_columns` attribute in a :class:`ScoreSet` instance. - - Parameters - ---------- - dict_ : dict - Dictionary of keys mapping to a list. - - Raises - ______ - ValueError - If scoreset data is missing the required key. - ValueError - If header values are not strings. - ValueError - If - ValueError - If missing required column constants.required_score_column for score dataset. - ValueError - If encountered unexpected keys extras. - """ - required_columns = [constants.score_columns, constants.count_columns] - - for key in required_columns: - if key not in dict_.keys(): - raise ValueError("Scoreset data is missing the required key " + key) - - columns = dict_[key] - if not all([isinstance(c, str) for c in columns]): - raise ValueError("Header values must be strings.") - - if not isinstance(columns, list): - type_ = type(columns).__name__ - raise ValueError( - "Value for " + key.replace("_", " ") + " must be a list not " + type_ - ) - - # Check score columns is not-empty and at least contains hgvs and score - if key == constants.score_columns: - if constants.required_score_column not in columns: - raise ValueError( - "Missing required column constants.required_score_column " - "for score dataset." - ) - - # Check there are not unexpected columns supplied to the scoreset json - # field. 
- extras = [k for k in dict_.keys() if k not in set(required_columns)] - if len(extras) > 0: - extras = [k for k in dict_.keys() if k not in required_columns] - raise ValueError("Encountered unexpected keys extras") diff --git a/mavecore/original_validation/exceptions.py b/mavecore/original_validation/exceptions.py deleted file mode 100644 index 2851fa7..0000000 --- a/mavecore/original_validation/exceptions.py +++ /dev/null @@ -1,2 +0,0 @@ -class ValidationError(ValueError): - pass diff --git a/mavecore/original_validation/genome_validators.py b/mavecore/original_validation/genome_validators.py deleted file mode 100644 index dff8b69..0000000 --- a/mavecore/original_validation/genome_validators.py +++ /dev/null @@ -1,605 +0,0 @@ -""" -Validator functions for the fields of the following classes: - WildTypeSequence - ReferenceGenome - TargetGene - ReferenceMap - GenomicInterval - -Most validation should validate one specific field, unless fields need -to be validated against each other. -""" -from fqfa.validator.validator import dna_bases_validator, amino_acids_validator -from mavecore.validation.exceptions import ValidationError - -from mavecore.validation import constants - - -def is_null(value): - """ - This function checks if the value exists or is null. - - Parameters - __________ - value : - The value to be checked. - - Returns - _______ - bool - True if a stripped/lowercase value in `nan_col_values`. - """ - value = str(value).strip().lower() - return constants.null_values_re.fullmatch(value) or not value - - -# min_start_validator = MinValueValidator( -# 1, message=_("Start coordinate must be a positive integer.") -# ) -# min_end_validator = MinValueValidator( -# 1, message=_("End coordinate must be a positive integer.") -# ) - - -class WildTypeSequence: - """ - Basic model specifying a wild-type sequence. - - Parameters - ---------- - sequence : `models.CharField` - The wild type DNA sequence that is related to the `target`. 
Will - be converted to upper-case upon instantiation. - - sequence_type : `models.CharField` - Protein sequence (amino acids) or DNA (nucleotides) - """ - - class SequenceType: - """ """ - - DNA = "dna" - PROTEIN = "protein" - INFER = "infer" - - @classmethod - def detect_sequence_type(cls, sequence): - # TODO - # confirm sequence parameter type - """ - This function determines if the sequence is a DNA or protein sequence and - returns "dna" if it is DNA or "protein" if it is protein. An error is raised - if it is neither. - - Parameters - __________ - sequence : str - - Returns - _______ - str - "dna" or "protein" depending on if the sequence is a DNA or protein sequence. - - Raises - ______ - ValueError - If sequence parameter is not protein or DNA. - """ - if sequence_is_dna(sequence): - return cls.DNA - elif sequence_is_protein(sequence): - return cls.PROTEIN - else: - raise ValueError( - f"Unknown sequence '{sequence}'. It is not protein or DNA." - ) - - @classmethod - def is_protein(cls, value): - """ - - Parameters - __________ - value : - - Returns - _______ - - """ - return value == cls.PROTEIN - - @classmethod - def is_dna(cls, value): - """ - - Parameters - __________ - value : - - Returns - _______ - - """ - return value == cls.DNA - - @classmethod - def choices(cls): - """ - - Returns - _______ - """ - return [(cls.INFER, "Infer"), (cls.DNA, "DNA"), (cls.PROTEIN, "Protein")] - - class Meta: - """ """ - - verbose_name = "Reference sequence" - verbose_name_plural = "Reference sequences" - - def __str__(self): - """ - - Returns - _______ - - """ - return self.get_sequence() - - # sequence = models.TextField( - # default=None, - # blank=False, - # null=False, - # verbose_name="Reference sequence", - # validation=[validate_wildtype_sequence], - # ) - # sequence_type = models.CharField( - # blank=True, - # null=False, - # default=SequenceType.INFER, - # verbose_name="Reference sequence type", - # max_length=32, - # choices=SequenceType.choices(), - # ) - - 
@property - def is_dna(self): - """ - - Returns - _______ - - """ - return self.__class__.SequenceType.is_dna(self.sequence_type) - - @property - def is_protein(self): - """ - - Returns - _______ - - """ - return self.__class__.SequenceType.is_protein(self.sequence_type) - - def save(self, *args, **kwargs): - """ - - Parameters - __________ - args : - kwargs : - - Returns - _______ - - """ - if self.sequence is not None: - self.sequence = self.sequence.upper() - self.sequence_type = ( - (self.__class__.SequenceType.detect_sequence_type(self.sequence)) - if self.__class__.SequenceType.INFER - else self.sequence_type - ) - - return super().save(*args, **kwargs) - - def get_sequence(self): - """ - - Returns - _______ - - """ - return self.sequence.upper() - - def is_attached(self): - """ - - Returns - _______ - - """ - return getattr(self, "target", None) is not None - - -# GenomicInterval -# ------------------------------------------------------------------------- # -def validate_interval_start_lteq_end(start, end): - """ - This function validates whether or not an interval's starting coordinate is less than - or equal to that interval's ending coordinate. - - Parameters - __________ - start : int - The interval's starting coordinate. - end : int - The interval's ending coordinate. - - Returns - _______ - None - If start is NoneType or end is NoneType. - - Raises - ______ - ValidationError - If an interval's starting coordinate is greater than the ending coordinate. - """ - # Intervals may be underspecified, but will be ignored so skip validation. - if start is None or end is None: - return - if start > end: - raise ValidationError( - ( - "An interval's starting coordinate cannot be greater than the " - "ending coordinate." - ) - ) - - -def validate_strand(value): - # TODO - # find the type of value - """ - This function validates a GenomicInterval strand and raises an error if the strand is invalid. 
- - Parameters - __________ - value : - The Genomic Interval strand to be validated. - - Raises - ______ - ValidationError - If GenomicInterval strand is not positive or negative. - """ - if value not in ("+", "-"): - raise ValidationError("GenomicInterval strand must be either '+' or '-'") - - -def validate_chromosome(value): - # TODO - # add description and type for value parameter - """ - - Parameters - __________ - value : - - Returns - _______ - None - If value is NoneType. - - Raises - ______ - ValidationError - If chromosome identifier is null. - """ - # Intervals may be underspecified, but will be ignored so skip validation. - if value is None: - return - if is_null(value): - raise ValidationError("Chromosome identifier must not be null.") - - -def validate_unique_intervals(intervals): - # TODO - # add description and interval parameter type plus description - """ - - Parameters - __________ - intervals : - - Raises - ______ - ValidationError - If the same interval was specified twice. - """ - for interval1 in intervals: - for interval2 in intervals: - if ( - (interval1.pk is not None) - and (interval2.pk is not None) - and (interval1.pk == interval2.pk) - ): - continue - elif interval1 is interval2: - continue - elif interval1.equals(interval2): - raise ValidationError("You can not specify the same interval twice.") - - -# WildTypeSequence -# ------------------------------------------------------------------------- # -def validate_wildtype_sequence(seq, as_type="any"): - # TODO - # add description to as_type parameter - """ - This function checks whether or not seq is a wildtype sequence. - - Parameters - __________ - seq : str - The sequence being validated. - as_type : str - (default = "any") - - Raises - ______ - ValidationError - If seq is not a valid wild type sequence. - ValidationError - If seq is not a valid DNA or protein reference sequence. 
- """ - # from .models import WildTypeSequence - - # Explicitly check for these cases as they are also valid AA sequences. - if is_null(seq): - raise ValidationError( - "'%(seq)s' is not a valid wild type sequence." # , params={"seq": seq} - ) - - seq = seq.upper() - is_dna = dna_bases_validator(seq) is not None - is_aa = amino_acids_validator(seq) is not None - - if as_type == WildTypeSequence.SequenceType.DNA and not is_dna: - raise ValidationError( - "'%(seq)s' is not a valid DNA reference sequence." # , - # params={"seq": seq}, - ) - elif as_type == WildTypeSequence.SequenceType.PROTEIN and not is_aa: - raise ValidationError( - "'%(seq)s' is not a valid protein reference sequence." # , - # params={"seq": seq}, - ) - elif (as_type == "any" or WildTypeSequence.SequenceType.INFER) and not ( - is_dna or is_aa - ): - raise ValidationError( - "'%(seq)s' is not a valid DNA or protein reference sequence." # , - # params={"seq": seq}, - ) - - -def sequence_is_dna(seq): - """ - This function checks if seq is a DNA sequence. - - Parameters - __________ - seq : str - The sequence to be validated. - - Returns - _______ - bool - True if the dna_bases_validator returns a match object. - """ - # Explicitly check for these cases as they are also valid AA sequences. - if is_null(seq): - return False - seq = seq.upper() - return dna_bases_validator(seq) is not None - - -def sequence_is_protein(seq): - """ - This function check if seq is a protein sequence. - - Parameters - __________ - seq : str - The sequence being validated. - - Returns - _______ - bool - True if seq is not null, is a DNA sequence or amino_acids_validator returns a match object. - """ - # Explicitly check for these cases as they are also valid AA sequences. 
- if is_null(seq): - return False - seq = seq.upper() - if dna_bases_validator(seq) is not None: - return False # Very likely a DNA sequence if only ATG - return amino_acids_validator(seq) is not None - - -# ReferenceGenome -# ------------------------------------------------------------------------- # -def validate_organism_name(organism_name): - # TODO - # confirm organism_name type - """ - This function validates the organism name by checking that the name is not null. - - Parameters - __________ - organism_name : str - The organism name to be validated. - - Raises - ______ - ValidationError - If the organism name is null. - """ - if is_null(organism_name): - raise ValidationError("Species name must not be null.") - - -def validate_reference_genome_has_one_external_identifier(referencegenome): - # TODO - # revise description, make sure it is accurate - # anything greater than 0 will return True, so should it be == 1 or > 0? - # determine what type referencegenome is - """ - This function validates whether or not the reference genome has one external identifier. - An error is raised if - - Parameters - __________ - referencegenome : - - Raises - ______ - ValidationError - If - """ - if not referencegenome.genome_id: - raise ValidationError( - "Only one external identifier can be specified for a reference" "genome." - ) - - -def validate_genome_short_name(value): - # TODO - # confirm the type of the value parameter - """ - This function validates the genome short name and raises an error if the value is null. - - Parameters - __________ - value : str - The genome short name to be validated. - - Raises - ______ - ValidationError - If the genome short name is null. 
- """ - if is_null(value): - raise ValidationError("Genome short name must not be null.") - - -# ReferenceMap -# ------------------------------------------------------------------------- # -def validate_map_has_unique_reference_genome(annotations): - # TODO - # check the type of annotations - # add description to annotations parameter - """ - This function validates whether or not each map in annotations has a - unique reference genome and raises an error if this is not the case. - - Parameters - __________ - annotations : - - Raises - ______ - ValidationError - If each reference map does not specify a different reference genome. - """ - genomes = set([str(a.get_reference_genome_name()).lower() for a in annotations]) - if len(genomes) < len(annotations): - raise ValidationError( - "Each reference map must specify a different reference genome." - ) - - -def validate_map_has_at_least_one_interval(reference_map): - """ - This function validates that a reference map has at least one interval and raises an error - if this is not the case. - - Parameters - __________ - reference_map : - Reference map. - - Raises - ______ - ValidationError - If the reference_map does not have at least one interval. - """ - if not reference_map.get_intervals().count(): - raise ValidationError( - "You must specify at least one interval for each reference map." - ) - - -def validate_at_least_one_map(reference_maps): - """ - This function validates whether a target has at least one reference map specified - and raises an error if it does not. - - Parameters - __________ - reference_maps : - - - Raises - ______ - ValidationError - If the target does not have at least one reference map specified. - """ - if not len(reference_maps): - raise ValidationError( - "A target must have at least one reference map specified." - ) - - -def validate_one_primary_map(reference_maps): - """ - This function validates the existence of one primary reference map and raises an error - if it does not exist. 
- - Parameters - __________ - reference_maps : - - Raises - ______ - ValidationError - If target has less than or more than one primary reference map. - """ - primary_count = sum(a.is_primary_reference_map() for a in reference_maps) - if primary_count > 1 or primary_count < 1: - raise ValidationError("A target must have one primary reference map.") - - -# TargetGene -# ------------------------------------------------------------------------- # -def validate_gene_name(gene_name): - # TODO - # confirm gene_name type - """ - This function checks to see if a gene name is null and raises and error if it is. - - Parameters - __________ - gene_name : str - The gene name. - - Raises - ______ - ValidationError - If gene name (value parameter) is null. - """ - if is_null(gene_name): - raise ValidationError("Gene name must not be null.") diff --git a/mavecore/original_validation/metadata_validators.py b/mavecore/original_validation/metadata_validators.py deleted file mode 100644 index 3c9d5d1..0000000 --- a/mavecore/original_validation/metadata_validators.py +++ /dev/null @@ -1,203 +0,0 @@ -import idutils - -from mavecore.validation.exceptions import ValidationError -from mavecore.validation.constants import null_values_re - - -def is_null(value): - # TODO - # check that parameter type is accurate - """ - This function checks that the passed value is null. - - Parameters - __________ - value : str - Value to be checked if null. - - Returns - _______ - bool - True if a stripped/lowercase value in in `nan_col_values`. 
- """ - value = str(value).strip().lower() - return null_values_re.fullmatch(value) or not value - - -def validate_sra_identifier(identifier): - if not ( - idutils.is_sra(identifier) - or idutils.is_bioproject(identifier) - or idutils.is_geo(identifier) - or idutils.is_arrayexpress_array(identifier) - or idutils.is_arrayexpress_experiment(identifier) - ): - raise ValidationError( - f"'{identifier} is not a valid SRA, GEO, ArrayExpress or BioProject " - "accession." - ) - - -def validate_keyword(kw): - """ - This function validates whether or not the kw parameter is valid by - checking that it is a string that is not null. If kw is null - or is not a string, an error is raised. - - Parameters - __________ - kw : str - The keyword to be validated. - - Raises - ______ - ValidationError - If the kw argument is not a valid string. - """ - if is_null(kw) or not isinstance(kw, str): - raise ValidationError( - f"'{kw}' not a valid keyword. Keywords must be valid strings." - ) - - -def validate_pubmed_identifier(identifier): - """ - - :param identifier: - :return: - """ - if not idutils.is_pmid(identifier): - raise ValidationError(f"'{identifier} is not a valid PubMed identifier.") - - -def validate_doi_identifier(identifier): - """ - - :param identifier: - :return: - """ - if not idutils.is_doi(identifier): - raise ValidationError(f"'{identifier}' is not a valid DOI.") - - -def validate_ensembl_identifier(identifier): - """ - - :param identifier: - :return: - """ - if not idutils.is_ensembl(identifier): - raise ValidationError(f"'{identifier}' is not a valid Ensembl accession.") - - -def validate_uniprot_identifier(identifier): - """ - - :param identifier: - :return: - """ - if not idutils.is_uniprot(identifier): - raise ValidationError(f"'{identifier}' is not a valid UniProt accession.") - - -def validate_refseq_identifier(identifier): - """ - - :param identifier: - :return: - """ - if not idutils.is_refseq(identifier): - raise ValidationError(f"'{identifier}' is not a 
valid RefSeq accession.") - - -def validate_genome_identifier(identifier): - """ - - :param identifier: - :return: - """ - if not idutils.is_genome(identifier): - raise ValidationError( - f"'{identifier}' is not a valid GenBank or RefSeq genome assembly." - ) - - -def validate_keyword_list(values): - """ - This function takes a list of keyword values and validates that each one is valid. - A valid keyword is a non-null string. The validate_keyword function will raise an - ValidationError if any of the keywords are invalid. - - Parameters - __________ - values : list[str] - The list of values to be validated. - """ - for value in values: - if not is_null(value): - validate_keyword(value) - - -def validate_pubmed_list(values): - """ - :param values: - :return: - """ - for value in values: - if not is_null(value): - validate_pubmed_identifier(value) - - -def validate_sra_list(values): - """ - - :param values: - :return: - """ - for value in values: - if not is_null(value): - validate_sra_identifier(value) - - -def validate_doi_list(values): - """ - - :param values: - :return: - """ - for value in values: - if not is_null(value): - validate_doi_identifier(value) - - -def validate_ensembl_list(values): - """ - - :param values: - :return: - """ - for value in values: - if not is_null(value): - validate_ensembl_identifier(value) - - -def validate_refseq_list(values): - """ - - :param values: - :return: - """ - for value in values: - if not is_null(value): - validate_refseq_identifier(value) - - -def validate_uniprot_list(values): - """ - - :param values: - :return: - """ - for value in values: - if not is_null(value): - validate_uniprot_identifier(value) diff --git a/mavecore/original_validation/urn_validators.py b/mavecore/original_validation/urn_validators.py deleted file mode 100644 index f81b8fd..0000000 --- a/mavecore/original_validation/urn_validators.py +++ /dev/null @@ -1,153 +0,0 @@ -import re -from mavecore.validation.exceptions import ValidationError - 
-MAVEDB_EXPERIMENTSET_URN_DIGITS = 8 -MAVEDB_TMP_URN_DIGITS = 16 -MAVEDB_URN_MAX_LENGTH = 64 -MAVEDB_URN_NAMESPACE = "mavedb" - - -# Temp URN patterns -# --------------------------------------------------------------------------- # -MAVEDB_TMP_URN_PATTERN = r"^tmp:[A-Za-z0-9]{{{width}}}$".format( - width=MAVEDB_TMP_URN_DIGITS -) -MAVEDB_TMP_URN_RE = re.compile(MAVEDB_TMP_URN_PATTERN) - - -# Experimentset Pattern/Compiled RE -MAVEDB_EXPERIMENTSET_URN_PATTERN = r"^urn:{namespace}:\d{{{width}}}$".format( - namespace=MAVEDB_URN_NAMESPACE, width=MAVEDB_EXPERIMENTSET_URN_DIGITS -) -MAVEDB_EXPERIMENTSET_URN_RE = re.compile(MAVEDB_EXPERIMENTSET_URN_PATTERN) - -# Experiment Pattern/Compiled RE -MAVEDB_EXPERIMENT_URN_PATTERN = r"{pattern}-([a-z]+|0)$".format( - pattern=MAVEDB_EXPERIMENTSET_URN_PATTERN[:-1] -) -MAVEDB_EXPERIMENT_URN_RE = re.compile(MAVEDB_EXPERIMENT_URN_PATTERN) - -# Scoreset Pattern/Compiled RE -MAVEDB_SCORESET_URN_PATTERN = r"{pattern}-\d+$".format( - pattern=MAVEDB_EXPERIMENT_URN_PATTERN[:-1] -) -MAVEDB_SCORESET_URN_RE = re.compile(MAVEDB_SCORESET_URN_PATTERN) - -# Variant Pattern/Compiled RE -MAVEDB_VARIANT_URN_PATTERN = r"{pattern}#\d+$".format( - pattern=MAVEDB_SCORESET_URN_PATTERN[:-1] -) -MAVEDB_VARIANT_URN_RE = re.compile(MAVEDB_VARIANT_URN_PATTERN) - -# Any Pattern/Compiled RE -MAVEDB_ANY_URN_PATTERN = "|".join( - [ - r"({pattern})".format(pattern=p) - for p in ( - MAVEDB_EXPERIMENTSET_URN_PATTERN, - MAVEDB_EXPERIMENT_URN_PATTERN, - MAVEDB_SCORESET_URN_PATTERN, - MAVEDB_VARIANT_URN_PATTERN, - MAVEDB_TMP_URN_PATTERN, - ) - ] -) -MAVEDB_ANY_URN_RE = re.compile(MAVEDB_ANY_URN_PATTERN) - - -def validate_mavedb_urn(urn): - """ - This function validates a MaveDB urn and raises an error if it is not valid. - - Parameters - __________ - urn : str - The MaveDB urn to be validated. - - Raises - ______ - ValidationError - If the MaveDB urn is not valid. 
- """ - if not MAVEDB_ANY_URN_RE.match(urn): - raise ValidationError("%(urn)s is not a valid urn.", params={"urn": urn}) - - -def validate_mavedb_urn_experimentset(urn): - """ - This function validates a Experiment Set urn and raises an error if it is not valid. - - Parameters - __________ - urn : str - The Experiment Set urn to be validated. - - Raises - ______ - ValidationError - If the Experiment Set urn is not valid. - """ - if not (MAVEDB_EXPERIMENTSET_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): - raise ValidationError( - "%(urn)s is not a valid Experiment Set urn.", params={"urn": urn} - ) - - -def validate_mavedb_urn_experiment(urn): - """ - This function validates an Experiment urn and raises an error if it is not valid. - - Parameters - __________ - urn : str - The Experiment urn to be validated. - - Raises - ______ - ValidationError - If the Experiemnt urn is not valid. - """ - if not (MAVEDB_EXPERIMENT_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): - raise ValidationError( - "%(urn)s is not a valid Experiment urn.", params={"urn": urn} - ) - - -def validate_mavedb_urn_scoreset(urn): - """ - This function validates a Scoreset urn and raises an error if it is not valid. - - Parameters - __________ - urn : str - The Scoreset urn to be validated - - Raises - ______ - ValidationError - If the Scoreset urn is not valid. - """ - if not (MAVEDB_SCORESET_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): - raise ValidationError( - "%(urn)s is not a valid score set urn.", params={"urn": urn} - ) - - -def validate_mavedb_urn_variant(urn): - """ - This function validates a MaveDB Variant urn and raises an error if it is not valid. - - Parameters - __________ - urn : str - The MaveDB Variant urn to be validated. - - Raises - ______ - ValidationError - If the MaveDB Variant urn is not valid. 
- """ - if not (MAVEDB_VARIANT_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): - raise ValidationError( - "%(urn)s is not a valid Variant urn.", params={"urn": urn} - ) diff --git a/mavecore/original_validation/validate.py b/mavecore/original_validation/validate.py deleted file mode 100644 index b138c9a..0000000 --- a/mavecore/original_validation/validate.py +++ /dev/null @@ -1,69 +0,0 @@ -from mavecore.validation import dataset_validators - - -def validate_all(countfile=None, scorefile=None, scorejson=None): - """ - By calling other helper functions, this function runs all of the validation code. - - Parameters - __________ - countfile : - scorefile : - scorejson : - - """ - validate_dataset(countfile, scorefile, scorejson) - - -def validate_dataset(countfile=None, scorefile=None, scorejson=None): - """ - This function calls all of the validation functions within - mavetools/mavetools/validation/dataset_validation.py - - Parameters - __________ - countfile : - scorefile : - scorejson : - - Returns - ------- - - """ - - # how to incorporate word limit validator? 
- - if scorefile is not None: - # open scorefile - open(scorefile) - # this one returns header - scoreheader = dataset_validators.read_header_from_io(file=scorefile) - - # if the header was returned, do these ones - dataset_validators.validate_has_hgvs_in_header(header=scoreheader) - dataset_validators.validate_at_least_one_additional_column(header=scoreheader) - dataset_validators.validate_header_contains_no_null_columns(header=scoreheader) - - dataset_validators.validate_scoreset_score_data_input(file=scorefile) - - if scorejson is not None: - # open scorejson - open(scorejson) - dataset_validators.validate_scoreset_json(dict_=scorejson) - - if countfile is not None: - # open countfile - open(countfile) - countheader = dataset_validators.read_header_from_io(file=countfile) - - # if the header was returned, do these ones - dataset_validators.validate_has_hgvs_in_header(header=countheader) - dataset_validators.validate_at_least_one_additional_column(header=countheader) - dataset_validators.validate_header_contains_no_null_columns(header=countheader) - - dataset_validators.validate_scoreset_count_data_input(file=countfile) - - if scorefile is not None and countfile is not None: - dataset_validators.validate_datasets_define_same_variants( - scores=scorefile, counts=countfile - ) diff --git a/mavecore/original_validation/variant_validators/__init__.py b/mavecore/original_validation/variant_validators/__init__.py deleted file mode 100644 index 1f7aca1..0000000 --- a/mavecore/original_validation/variant_validators/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -from .dataset import MaveDataset, MaveCountsDataset, MaveScoresDataset - -from .hgvs import ( - validate_nt_variant, - validate_pro_variant, - validate_splice_variant, - validate_hgvs_string, -) - -from .variant import validate_columns_match, validate_variant_json - -__all__ = [ - "dataset", - "variant", - "hgvs", - "validate_nt_variant", - "validate_splice_variant", - "validate_pro_variant", - 
"validate_hgvs_string", - "validate_columns_match", - "validate_variant_json", - "MaveCountsDataset", - "MaveScoresDataset", - "MaveDataset", -] diff --git a/mavecore/original_validation/variant_validators/dataset.py b/mavecore/original_validation/variant_validators/dataset.py deleted file mode 100644 index 9461a0e..0000000 --- a/mavecore/original_validation/variant_validators/dataset.py +++ /dev/null @@ -1,1012 +0,0 @@ -import re -from collections import defaultdict -from io import StringIO -from itertools import groupby -from operator import itemgetter -from typing import Union, Optional, Tuple, List, TextIO, BinaryIO, Set, Dict - -import pandas as pd -import numpy as np -from mavehgvs import MaveHgvsParseError, Variant -from fqfa.util.translate import translate_dna -from fqfa.util.infer import infer_sequence_type - -from mavecore.validation.constants import ( - hgvs_nt_column, - hgvs_splice_column, - hgvs_pro_column, - required_score_column, - null_values_list, - null_values_re, - readable_null_values_list, -) - - -def is_null(value): - """ - Returns True if a stripped/lowercase value in in `nan_col_values`. 
- - Parameters - __________ - value : - - Returns - _______ - bool - - """ - value = str(value).strip().lower() - return null_values_re.fullmatch(value) or not value - - -class MaveDataset: - """ """ - - class DatasetType: - """ """ - - SCORES = "scores" - COUNTS = "counts" - - class HGVSColumns: - """ """ - - NUCLEOTIDE: str = hgvs_nt_column - TRANSCRIPT: str = hgvs_splice_column - PROTEIN: str = hgvs_pro_column - - @classmethod - def options(cls) -> List[str]: - """ - - Returns - _______ - List[str] - """ - return [cls.NUCLEOTIDE, cls.TRANSCRIPT, cls.PROTEIN] - - class AdditionalColumns: - """ """ - - @classmethod - def options(cls) -> List[str]: - """ - - Returns - _______ - List[str] - """ - return [] - - # ---------------------- Construction------------------------------------ # - @classmethod - def for_scores(cls, file: Union[str, TextIO, BinaryIO]) -> "MaveScoresDataset": - """ - - Parameters - __________ - file : Union[str, TextIO, BinaryIO] - - Returns - _______ - `MaveScoresDataset` - - """ - return cls._for_type(file=file, dataset_type=cls.DatasetType.SCORES) - - @classmethod - def for_counts(cls, file: Union[str, TextIO, BinaryIO]) -> "MaveCountsDataset": - """ - - Parameters - __________ - file : Union[str, TextIO, BinaryIO] - - Returns - _______ - `MaveCountsDataset` - """ - return cls._for_type(file=file, dataset_type=cls.DatasetType.COUNTS) - - @classmethod - def _for_type( - cls, file: Union[str, TextIO, BinaryIO], dataset_type: str - ) -> Union["MaveScoresDataset", "MaveCountsDataset"]: - """ - - Parameters - __________ - file : Union[str, TextIO, BinaryIO] - dataset_type : str - - Returns - _______ - Union[`MaveScoreDataset`, `MaveCountsDataset`] - - Raises - ______ - TypeError - If file parameter is not expected file path or buffer object. - ValueError - If dataset_type parameter is not a recognized dataset type. 
- """ - if isinstance(file, str): - handle = file - elif hasattr(file, "read"): - file_contents = file.read() - if hasattr(file_contents, "decode"): - file_contents = file_contents.decode("utf-8") - file_contents = file_contents.strip() - handle = StringIO(file_contents) - else: - raise TypeError( - f"Expected file path or buffer object. " f"Got '{type(file).__name__}'" - ) - - extra_na_values = set( - list(null_values_list) - + [str(x).lower() for x in null_values_list] - + [str(x).upper() for x in null_values_list] - + [str(x).capitalize() for x in null_values_list] - ) - - df = pd.read_csv( - filepath_or_buffer=handle, - sep=",", - encoding="utf-8", - quotechar='"', - comment="#", - na_values=extra_na_values, - keep_default_na=True, - dtype={ - **{c: str for c in cls.HGVSColumns.options()}, - MaveScoresDataset.AdditionalColumns.SCORES: float, - }, - ).replace(null_values_re, np.NaN) - - if dataset_type == cls.DatasetType.SCORES: - return MaveScoresDataset(df) - elif dataset_type == cls.DatasetType.COUNTS: - return MaveCountsDataset(df) - else: - raise ValueError(f"'{dataset_type}' is not a recognised dataset type.") - - # ---------------------- Public ----------------------------------------- # - @property - def label(self) -> str: - """ - - Returns - _______ - str - """ - return "dataset" - - @property - def is_valid(self) -> Optional[bool]: - """ - - Returns - _______ - Optional[bool] - """ - if self._errors is None: - return None - return len(self._errors) == 0 - - @property - def n_errors(self) -> Optional[int]: - """ - - Returns - _______ - Optional[int] - """ - if self._errors is None: - return None - return len(self._errors) - - @property - def errors(self) -> Optional[List[str]]: - """ - - Returns - _______ - Optional[List[str]] - """ - return self._errors - - @property - def is_empty(self) -> bool: - """ - - Returns - _______ - bool - """ - return self._df.empty - - @property - def columns(self) -> List[str]: - """ - - Returns - _______ - List[str] - 
""" - return list(self._df.columns) - - @property - def hgvs_columns(self) -> List[str]: - """ - - Returns - _______ - List[str] - """ - return [c for c in self.columns if c in self.HGVSColumns.options()] - - @property - def non_hgvs_columns(self) -> List[str]: - """ - - Returns - _______ - List[str] - """ - return [c for c in self.columns if c not in self.HGVSColumns.options()] - - @property - def n_rows(self) -> int: - """ - - Returns - _______ - int - """ - return len(self._df) - - @property - def n_columns(self) -> int: - """ - - Returns - _______ - int - """ - return len(self.columns) - - @property - def index_column(self) -> Optional[str]: - """ - - Returns - _______ - Optional[str] - """ - if self._errors: - return None - return self._index_column - - @property - def index(self) -> Optional[pd.Index]: - """ - - Returns - _______ - Optional[`pd.Index`] - """ - if self._errors: - return None - return self._df.index.copy(deep=True) - - def data(self, serializable=False) -> pd.DataFrame: - """ - Return underlying dataframe object. - - Parameters - ---------- - serializable: bool - Replaces `np.NaN` with `None` for JSON compatibility. - - Returns - _______ - `pd.DataFrame` - - """ - if serializable: - # need to force "object" type to allow None values - return_df = self._df.astype(object, copy=True) - return_df.where(cond=pd.notnull(return_df), other=None, inplace=True) - return return_df - return self._df.copy(deep=True) - - def match_other(self, other: "MaveDataset") -> Optional[bool]: - """ - Check that each dataset defined the same variants in each column. - - Parameters - ---------- - other: MaveDataset - Validator instance to match against. - - Returns - ------- - Optional[bool] - A boolean indicating index match, otherwise `None` if either instance - is not valid. 
- """ - if (not self.is_valid) or (not other.is_valid): - return None - - if self.index_column != other.index_column: - return False - - return all( - self._df[column].equals(other._df[column]) - for column in self.HGVSColumns.options() - ) - - def to_dict(self) -> Dict[str, Dict]: - """ - Returns underlying dataframe as dictionary in 'records' orientation. - Keys will be index values and values will be an inner dictionary mapping - column names to row values for said index. - - Returns - _______ - Dict[str, Dict] - """ - # Convert np.NaN values to None for consistency across all columns and - # for compatibility in PostgresSQL queries. Replaces all values which - # are considered null by pandas with None by masking pd.notnull cells. - - return self.data(serializable=True).to_dict(orient="index") - - def validate( - self, - targetseq: Optional[str] = None, - relaxed_ordering: bool = False, - allow_index_duplicates: bool = False, - ) -> "MaveDataset": - """ - - Parameters - __________ - targetseq : - relaxed_ordering : - allow_index_duplicates : - - Returns - _______ - `MaveDataset` - - Raises - ______ - - """ - - self._errors = [] - self._df.index = pd.RangeIndex(start=0, stop=self.n_rows, step=1) - self._index_column = None - - self._validate_columns() - # Only attempt to validate variants if columns are valid - if not self._errors: - ( - self._normalize_data() - ._validate_genomic_variants(targetseq, relaxed_ordering) - ._validate_transcript_variants(targetseq, relaxed_ordering) - ._validate_protein_variants(targetseq, relaxed_ordering) - ._validate_index_column(allow_duplicates=allow_index_duplicates) - ) - - if self.is_empty: - self._errors.append( - f"No variants could be parsed from your {self.label} file. " - f"Please upload a non-empty file." 
- ) - return self - - if not self._errors: - # Set index last as original index is used when indicating duplicate - # hgvs string row numbers in the column name used as the index ( - # either hgvs_nt when present or hgvs_pro when hgvs_nt is absent). - self._df.index = pd.Index(self._df[self.index_column]) - - return self - - # ---------------------- Private ---------------------------------------- # - def __init__( - self, - df: Optional[pd.DataFrame] = None, - index_column: Optional[str] = None, - errors: Optional[List[str]] = None, - ): - """ - - Parameters - df : - index_column : - errors : - - Raises - ______ - - """ - self._df: pd.DataFrame = pd.DataFrame() if df is None else df - self._index_column = index_column or None - self._errors = None if errors is None else list(errors) - - def __repr__(self): - """ - - Returns - _______ - - """ - return ( - f"<" - f"{self.__class__.__name__} " - f"columns={self.columns} " - f"index={self.index_column} " - f"valid={self.is_valid}" - f">" - ) - - @property - def _column_order(self) -> Dict[str, int]: - """ - - Returns - _______ - Dict[str, int] - """ - return defaultdict( - lambda: 100, - { - self.HGVSColumns.NUCLEOTIDE: 0, - self.HGVSColumns.TRANSCRIPT: 1, - self.HGVSColumns.PROTEIN: 2, - **{ - c: (2 + i) - for (i, c) in enumerate(self.AdditionalColumns.options(), start=1) - }, - }, - ) - - def _validate_columns(self) -> "MaveDataset": - """ - - Returns - _______ - `MaveDataset` - - Raises - ______ - - """ - if self._errors: - return self - - # Pandas will automatically name blank columns using the pattern below - unnamed = re.compile(r"^Unnamed: \d+$", flags=re.IGNORECASE) - columns = self.columns - if any(is_null(h) or unnamed.match(h) for h in columns): - self._errors.append( - f"Column names in your {self.label} file cannot values " - f"considered null such as the following: " - f"{', '.join(readable_null_values_list)}" - ) - - columns = [c for c in columns if not is_null(c)] - if len(columns) < 1: - 
self._errors.append( - f"No columns could not be parsed from your {self.label} file. " - "Make sure columns are comma delimited. Column names with " - "commas must be escaped by enclosing them in double quotes" - ) - - required = {self.HGVSColumns.NUCLEOTIDE, self.HGVSColumns.PROTEIN} - if not (set(columns) & required): - self._errors.append( - f"Your {self.label} file must define either a nucleotide " - f"hgvs column '({self.HGVSColumns.NUCLEOTIDE})' " - f"or a protein hgvs column '({self.HGVSColumns.PROTEIN})'. " - f"Columns are case-sensitive and must be comma delimited" - ) - - if not (set(columns) - set(self.HGVSColumns.options())): - self._errors.append( - f"Your {self.label} file must define at least one additional " - f"column different from '{self.HGVSColumns.NUCLEOTIDE}', " - f"'{self.HGVSColumns.TRANSCRIPT}' and " - f"'{self.HGVSColumns.PROTEIN}'" - ) - - return self - - def _normalize_data(self) -> "MaveDataset": - """ - - Returns - _______ - `MaveDataset` - """ - if self._errors: - return self - - # Initialize missing hgvs columns as empty. 
- for c in self.HGVSColumns.options(): - if c not in self.columns: - self._df[c] = np.NaN - - column_order = self._column_order - sorted_columns = list(sorted(self.columns, key=lambda x: column_order[x])) - - self._df = self._df[sorted_columns] - return self - - def _validate_genomic_variants( - self, targetseq: Optional[str] = None, relaxed_ordering: bool = False - ) -> "MaveDataset": - """ - - Parameters - __________ - targetseq : - relaxed_ordering : - - Returns - _______ - `MaveDataset` - - Raises - ______ - - """ - if self._column_is_null(self.HGVSColumns.NUCLEOTIDE): - return self - - defines_transcript_variants = not self._column_is_null( - self.HGVSColumns.TRANSCRIPT - ) - validated_variants, prefixes, errors = self._validate_variants( - column=self.HGVSColumns.NUCLEOTIDE, - splice_defined=defines_transcript_variants, - targetseq=targetseq, - relaxed_ordering=relaxed_ordering, - ) - - if ("c" in prefixes or "n" in prefixes) and "g" in prefixes: - self._errors.append( - f"{self.HGVSColumns.NUCLEOTIDE}: Genomic variants " - f"(prefix 'g.') cannot be mixed with transcript variants " - f"(prefix 'c.' or 'n.')" - ) - - if prefixes == {"g"} and not defines_transcript_variants: - self._errors.append( - f"Transcript variants ('{self.HGVSColumns.TRANSCRIPT}' column) " - f"are required when specifying genomic variants " - f"(prefix 'g.' 
in the 'hgvs_nt' column)" - ) - - self._errors += errors - - if not self._errors: - self._df[self.HGVSColumns.NUCLEOTIDE] = validated_variants - - self._index_column = self.HGVSColumns.NUCLEOTIDE - return self - - def _validate_transcript_variants( - self, targetseq: Optional[str] = None, relaxed_ordering: bool = False - ) -> "MaveDataset": - """ - - Parameters - __________ - targetseq : - relaxed_ordering : - - Returns - _______ - `MaveDataset` - - Raises - ______ - - """ - defines_nt = not self._column_is_null(self.HGVSColumns.NUCLEOTIDE) - defines_tx = not self._column_is_null(self.HGVSColumns.TRANSCRIPT) - - if defines_tx and (not defines_nt): - self._errors.append( - f"Genomic variants ('{self.HGVSColumns.NUCLEOTIDE}' column) " - f"must be defined when specifying transcript " - f"variants ('{self.HGVSColumns.TRANSCRIPT}' column)" - ) - - if not defines_tx: - return self - - # Don't validate transcript variants against sequence. Might come - # back to this later with research into implementing gene models. 
- validated_variants, _, errors = self._validate_variants( - column=self.HGVSColumns.TRANSCRIPT, - targetseq=None, - relaxed_ordering=relaxed_ordering, - ) - - self._errors += errors - - if not self._errors: - self._df[self.HGVSColumns.TRANSCRIPT] = validated_variants - - return self - - def _validate_protein_variants( - self, targetseq: Optional[str] = None, relaxed_ordering: bool = False - ) -> "MaveDataset": - """ - - Parameters - __________ - targetseq : - relaxed_ordering : - - Returns - _______ - `MaveDataset` - - Raises - ______ - - """ - if self._column_is_null(self.HGVSColumns.PROTEIN): - return self - - defines_nt = not self._column_is_null(self.HGVSColumns.NUCLEOTIDE) - defines_splice = not self._column_is_null(self.HGVSColumns.TRANSCRIPT) - - if defines_splice: - protein_seq = None - else: - protein_seq = targetseq - if targetseq and "dna" in infer_sequence_type(targetseq).lower(): - protein_seq, remainder = translate_dna(targetseq) - if remainder: - self._errors.insert( - 0, - "Protein variants could not be validated because the " - "length of your target sequence is not a multiple of 3", - ) - - validated_variants, _, errors = self._validate_variants( - column=self.HGVSColumns.PROTEIN, - targetseq=protein_seq, - relaxed_ordering=relaxed_ordering, - ) - - self._errors += errors - - if not self._errors: - self._df[self.HGVSColumns.PROTEIN] = validated_variants - - if not defines_nt: - self._index_column = self.HGVSColumns.PROTEIN - - return self - - def _validate_index_column(self, allow_duplicates: bool = False) -> "MaveDataset": - """ - - Parameters - __________ - allow_duplicates : bool - - Returns - _______ - `MaveDataset` - - Raises - ______ - - """ - if self._errors: - return self - - if self._index_column is None: - self._index_column = self.HGVSColumns.NUCLEOTIDE - - if self._column_is_partially_null(self._index_column): - self._errors.append( - f"Primary column (inferred as '{self._index_column}') " - f"cannot contain any null values from " - 
f"{', '.join(readable_null_values_list)} (case-insensitive)" - ) - - if not allow_duplicates: - dupes = self._df[self._index_column].duplicated(keep=False) - if np.any(dupes): - dup_list = zip( - self._df.loc[dupes, self._index_column], dupes.index[dupes] - ) - dupes_str = ", ".join( - f"{v}: {[(g[1] + 1) for g in groups]}" # get row numbers - for (v, groups) in groupby(dup_list, key=itemgetter(0)) - ) - self._errors.append( - f"Primary column (inferred as '{self._index_column}') " - f"contains duplicate HGVS variants: {dupes_str}" - ) - - return self - - def _validate_variants( - self, - column: str, - splice_defined: Optional[bool] = None, - targetseq: Optional[str] = None, - relaxed_ordering: bool = False, - ) -> Tuple[pd.Series, Set[str], List[str]]: - """ - - Parameters - __________ - column : str - splice_defined : Optional[bool] - targetseq : Optional[str] - relaxed_ordering : bool - - Returns - _______ - Tuple[`pd.Series`, Set[str], List[str]] - - Raises - ______ - - """ - - prefixes = set() - errors = [] - - def validate_variant(variant: str): - # TODO: logic mirrors that in validate_hgvs_string, which is kept - # as a standalone function for backwards compatibility with - # django's model validator field. Merge at some point. 
- - if is_null(variant): - return np.NaN - else: - try: - if variant.lower() == "_sy": - errors.append( - "'_sy' is no longer supported and should be " - "replaced by 'p.(=)'" - ) - return variant - elif variant.lower() == "_wt": - errors.append( - "'_wt' is no longer supported and should be " - "replaced by one of 'g.=', 'c.=' or 'n.='" - ) - return variant - - validated = Variant( - variant, targetseq=targetseq, relaxed_ordering=relaxed_ordering - ) - prefix = validated.prefix.lower() - prefixes.add(prefix) - - prefix_error = self._validate_variant_prefix_for_column( - variant=validated, - prefix=validated.prefix, - column=column, - splice_defined=splice_defined, - ) - if prefix_error: - errors.append(prefix_error) - - return str(validated) - - except MaveHgvsParseError as error: - errors.append(f"{variant}: {str(error)}") - return np.NaN - - validated_variants = self._df[column].apply(validate_variant) - - return validated_variants, prefixes, errors - - def _column_is_null(self, column) -> bool: - """ - - Parameters - __________ - column : - - Returns - _______ - bool - """ - return len(self._df[self._df[column].isna()]) == len(self._df) - - def _column_is_partially_null(self, column) -> bool: - """ - - Parameters - __________ - column : - - Returns - _______ - bool - """ - return 0 < len(self._df[self._df[column].isna()]) < len(self._df) - - def _column_is_fully_specified(self, column) -> bool: - """ - - Parameters - __________ - column : - - Returns - _______ - bool - """ - return len(self._df[self._df[column].isna()]) == 0 - - def _validate_variant_prefix_for_column( - self, variant: Variant, prefix: str, column: str, splice_defined: bool - ) -> Optional[str]: - """ - - Parameters - __________ - variant : Variant - prefix : str - column : str - splice_defined : bool - - Returns - _______ - Optional[str] - - Raises - ______ - ValueError - If there is an unknown column as column argument. 
- """ - prefix = prefix.lower() - - if column == self.HGVSColumns.NUCLEOTIDE: - if splice_defined: - if prefix not in "g": - return ( - f"{column}: " - f"'{variant}' is not a genomic variant " - f"(prefix 'g.'). Nucleotide variants must " - f"be genomic if transcript variants are " - f"also present" - ) - else: - if prefix not in "cn": - return ( - f"{column}: " - f"'{variant}' is not a transcript variant. " - f"The accepted transcript variant prefixes " - f"are 'c.' or 'n.'" - ) - elif column == self.HGVSColumns.TRANSCRIPT: - if prefix not in "cn": - return ( - f"{column}: " - f"'{variant}' is not a transcript variant. The " - f"accepted transcript variant prefixes are " - f"'c.' or 'n.'" - ) - elif column == self.HGVSColumns.PROTEIN: - if prefix not in "p": - return ( - f"{column}: " - f"'{variant}' is not a protein variant. " - f"The accepted protein variant prefix is 'p.'" - ) - else: - raise ValueError( - f"Unknown column '{column}'. Expected one " - f"of {', '.join(self.HGVSColumns.options())}" - ) - - return None - - -class MaveScoresDataset(MaveDataset): - """ """ - - class AdditionalColumns: - """ """ - - SCORES = required_score_column - - @classmethod - def options(cls) -> List[str]: - """ - - Returns - _______ - List[str] - """ - return [cls.SCORES] - - @property - def label(self) -> str: - """ - - Returns - _______ - str - """ - return "scores" - - def _validate_columns(self) -> "MaveDataset": - """ - - Returns - _______ - `MaveDataset` - - Raises - ______ - - """ - super()._validate_columns() - - if self.AdditionalColumns.SCORES not in self.columns: - self._errors.append( - f"Your scores dataset is missing the " - f"'{self.AdditionalColumns.SCORES}' column. 
" - f"Columns are case-sensitive and must be comma delimited" - ) - - return self - - def _normalize_data(self) -> "MaveDataset": - """ - - Returns - _______ - `MaveDataset` - - Raises - ______ - ValueError - - """ - super()._normalize_data() - - should_be_numeric = [self.AdditionalColumns.SCORES] - for c in should_be_numeric: - if c in self.columns: - try: - self._df[c] = self._df[c].astype(dtype=float, errors="raise") - except ValueError as e: - self._errors.append(f"{c}: {str(e)}") - - return self - - -class MaveCountsDataset(MaveDataset): - """ """ - - @property - def label(self) -> str: - """ - - Returns - _______ - str - """ - return "counts" diff --git a/mavecore/original_validation/variant_validators/hgvs.py b/mavecore/original_validation/variant_validators/hgvs.py deleted file mode 100644 index 3f0c043..0000000 --- a/mavecore/original_validation/variant_validators/hgvs.py +++ /dev/null @@ -1,134 +0,0 @@ -from functools import partial -from typing import Optional, Union - -from mavehgvs import Variant, MaveHgvsParseError -from mavecore.validation.exceptions import ValidationError - -from mavecore.validation.constants import NA_STRING, null_values_re - -from mavecore.validation.constants import ( - hgvs_nt_column, - hgvs_splice_column, - hgvs_pro_column, -) - - -# from core.utilities import is_null -def is_null(value): - """ - Returns True if a stripped/lowercase value in in `nan_col_values`. 
- - Parameters - __________ - value - - Returns - _______ - - """ - value = str(value).strip().lower() - return null_values_re.fullmatch(value) or not value - - -def validate_hgvs_string( - value: Union[str, bytes], - column: Optional[str] = None, - splice_present: bool = False, - targetseq: Optional[str] = None, - relaxed_ordering: bool = False, -) -> Optional[str]: - """ - - Parameters - __________ - value : Union[str, bytes] - column : Optional[str] = None - splice_present : - targetseq : - relaxed_ordering : - - Returns - _______ - - Raises - ______ - ValidationError - If variant HGVS input values are not strings. - ValidationError - If value is _sy or _wt, which are no longer supported. - ValidationError - If - ValidationError - If value is not a genomic variant (prefix 'g.'). Nucleotide variants must - be genomic if transcript variants are also defined. - ValidationError - If value is not a transcript variant. The accepted transcript variant - prefixes are 'c.', 'n.'. - ValidationError - If value is not a protein variant. The accepted protein variant prefix is 'p.'. - ValueError - If there exists an unknown column. Function expects nt, splice or p." - """ - if is_null(value): - return None - - if hasattr(value, "decode"): - value = value.decode() - if not isinstance(value, str): - raise ValidationError( - "Variant HGVS values input must be strings. 
" - "'{}' has the type '{}'.".format(value, type(value).__name__) - ) - - if value.lower() == "_sy": - raise ValidationError( - "_sy is no longer supported and should be replaced by p.(=)" - ) - elif value.lower() == "_wt": - raise ValidationError( - "_wt is no longer supported and should be replaced by (cgnp).=" - ) - - try: - variant = Variant( - s=value, targetseq=targetseq, relaxed_ordering=relaxed_ordering - ) - except MaveHgvsParseError as error: - raise ValidationError(f"{value}: {str(error)}") - - prefix = variant.prefix.lower() - if column in ("nt", hgvs_nt_column): - if splice_present: - if prefix not in "g": - raise ValidationError( - f"'{value}' is not a genomic variant (prefix 'g.'). " - f"Nucleotide variants must be genomic if transcript " - f"variants are also defined." - ) - else: - if prefix not in "cn": - raise ValidationError( - f"'{value}' is not a transcript variant. The accepted " - f"transcript variant prefixes are 'c.', 'n.'." - ) - elif column in ("splice", hgvs_splice_column): - if prefix not in "cn": - raise ValidationError( - f"'{value}' is not a transcript variant. The accepted " - f"transcript variant prefixes are 'c.', 'n.'." - ) - elif column in ("p", hgvs_pro_column): - if prefix not in "p": - raise ValidationError( - f"'{value}' is not a protein variant. The accepted " - f"protein variant prefix is 'p.'." - ) - else: - raise ValueError("Unknown column '{}'. 
Expected nt, splice or p".format(column)) - - return str(variant) - - -validate_nt_variant = partial(validate_hgvs_string, **{"column": "nt"}) -validate_splice_variant = partial(validate_hgvs_string, **{"column": "splice"}) -validate_pro_variant = partial(validate_hgvs_string, **{"column": "p"}) diff --git a/mavecore/original_validation/variant_validators/variant.py b/mavecore/original_validation/variant_validators/variant.py deleted file mode 100644 index bf00e71..0000000 --- a/mavecore/original_validation/variant_validators/variant.py +++ /dev/null @@ -1,85 +0,0 @@ -from typing import Dict - -from mavecore.validation.constants import ( - variant_score_data, - variant_count_data, - required_score_column, -) -from mavecore.validation.exceptions import ValidationError - - -def validate_columns_match(variant, scoreset) -> None: - # TODO - # document errors correctly, note key error - """ - Validate that a child matches parents defined columns to keep - data in sync. - - Parameters - __________ - variant : - scoreset : - - Raises - ______ - ValidationError - If variant score columns do not match scoreset score columns. - ValidationError - If variant count columns do not match scoreset count columns. - """ - try: - if variant.score_columns != scoreset.score_columns: - raise ValidationError( - f"Variant defines score columns '{variant.score_columns}' " - f"but parent defines columns '{scoreset.score_columns}. " - ) - if variant.count_columns != scoreset.count_columns: - raise ValidationError( - f"Variant defines count columns '{variant.count_columns}' " - f"but parent defines columns '{scoreset.count_columns}. " - ) - except KeyError as error: - raise ValidationError(f"Missing key {str(error)}") - - -def validate_variant_json(data: Dict[str, Dict]) -> None: - """ - Checks a given dictionary to ensure that it is suitable to be used - as the `data` attribute in a :class:`Variant` instance. 
- - Parameters - ---------- - data : dict[str, dict] - Dictionary of keys mapping to a list. - - Raises - ______ - ValidationError - If missing the required key. - ValidationError - If missing the required column in variant's score data. - ValidationError - If encountered unexpected keys. - ValidationError - If value for key is not of type dict. - """ - expected_keys = [variant_score_data, variant_count_data] - for key in expected_keys: - if key not in data.keys(): - raise ValidationError(f"Missing the required key {key}") - - if required_score_column not in data[variant_score_data]: - raise ValidationError( - f"Missing required column '{required_score_column}' in variant's score data." - ) - - extras = [k for k in data.keys() if k not in set(expected_keys)] - if len(extras) > 0: - extras = [k for k in data.keys() if k not in expected_keys] - raise ValidationError("Encountered unexpected keys {extras}") - - # Check the correct data types are given. - for key in expected_keys: - if not isinstance(data[key], dict): - type_ = type(data[key]).__name__ - raise ValidationError(f"Value for '{key}' must be a dict not {type_}.") diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index e675251..59dea5e 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -9,90 +9,10 @@ from mavecore.validation.utilities import is_null -def is_null(value): - """ - Checks if a stripped/lowercase value is one of the recognized NA or NULL string values. - - Parameters - __________ - value : str - The value to be checked as null or not. - - Returns - _______ - bool - True value is NoneType, is an empty string, or if value matches the stated regex patterns in - constants.null_values_re. 
- """ - value = str(value).strip().lower() - return constants.null_values_re.fullmatch(value) or not value - -validate_csv_extension = FileExtensionValidator(allowed_extensions=["csv"]) -validate_gz_extension = FileExtensionValidator(allowed_extensions=["gz"]) -validate_json_extension = FileExtensionValidator(allowed_extensions=["json"]) - -class WordLimitValidator: - """ - This class - - Attributes - __________ - message : str - Message template to describe how many words a field is limited to. - code : str - - counter : str - - """ - - message = "This field is limited to {} words." - code = "invalid" - counter = re.compile(r"\w+\b", flags=re.IGNORECASE) - - def __init__(self, word_limit, message=None, code=None): - # check the code parameter type - """ - This constructor sets the values of the WordLimitValidator class attributes - message, code, and counter. - - Parameters - __________ - word_limit : int - The word limit assigned to the word limit attribute. - message : str - (default = None) The message assigned to the message attribute. - code : - (default = None) The code assigned to the code attribute. - """ - if message is not None: - self.message = message - if code is not None: - self.code = code - self.word_limit = int(word_limit) - - def __call__(self, value): - """ - Parameters - __________ - value : - - Returns - _______ - - Raises - ______ - ValidationError - If - """ - if not value: - return - if len(self.counter.findall(value)) > self.word_limit: - raise ValidationError(self.message.format(self.word_limit)) - def read_header_from_io(file, label=None, msg=None): # TODO - # confirm types for parameters + # confirm type for the file parameter """ This takes a file and reads the header from that file. @@ -102,7 +22,8 @@ def read_header_from_io(file, label=None, msg=None): label : str (default = None) msg : str - (default = None) The message that is printed in the event of an error is raised. 
+ (default = None) The message that is printed in the event that an error is raised. The value is updated within + the function. Returns _______ @@ -137,18 +58,22 @@ def read_header_from_io(file, label=None, msg=None): def validate_has_hgvs_in_header(header, label=None, msg=None): """ + Determines whether or not hgvs is in a header. + Parameters __________ - header : + header : str + The first line of the file being validated. label : - default = None + (default = None) msg : - default = None + (default = None) The message that is printed in the event that an error is raised. The value is updated within + the function. Raises ______ ValidationError - If + If the header is empty and there exists a value for the constants.hgvs_columns parameter. """ if label is None: label = "Uploaded" @@ -169,19 +94,19 @@ def validate_has_hgvs_in_header(header, label=None, msg=None): def validate_at_least_one_additional_column(header, label=None, msg=None): - # TODO - # verify parameter types """ This function checks the passed header to see if there exists additional columns besides the three specified by constants.hgvs_nt_column, constants.hgvs_splice_column, and constants.hgvs_pro_column. Parameters __________ - header : - label : - default = None - msg : - default = None + header : str + The first line of the file being validated. + label : str + (default = None) + msg : str + (default = None) The message that is printed in the event that an error is raised. The value is updated within + the function. Raises ______ @@ -213,11 +138,13 @@ def validate_header_contains_no_null_columns(header, label=None, msg=None): Parameters __________ - header : - label : - (default = None) - msg : + header : str + The first line of the file being validated. + label : str (default = None) + msg : str + (default = None) The message that is printed in the event that an error is raised. The value is updated within + the function.
Raises ______ @@ -385,5 +312,3 @@ def validate_scoreset_json(dict_): if len(extras) > 0: extras = [k for k in dict_.keys() if k not in required_columns] raise ValidationError("Encountered unexpected keys extras") - - diff --git a/mavecore/validation/exceptions.py b/mavecore/validation/exceptions.py index 466c2fa..b3e419b 100644 --- a/mavecore/validation/exceptions.py +++ b/mavecore/validation/exceptions.py @@ -1,149 +1,5 @@ -# note: ValidationError2 code in this file is from Django -import operator - NON_FIELD_ERRORS = "__all__" -class ValidationError(Exception): - None - - -class ValidationError2(Exception): - """An error while validating data.""" - - def __init__(self, message, code=None, params=None): - """ - The `message` argument can be a single error, a list of errors, or a - dictionary that maps field names to lists of errors. What we define as - an "error" can be either a simple string or an instance of - ValidationError with its message attribute set, and what we define as - list or dictionary can be an actual `list` or `dict` or an instance - of ValidationError with its `error_list` or `error_dict` attribute set. - """ - super().__init__(message, code, params) - - if isinstance(message, ValidationError): - if hasattr(message, "error_dict"): - message = message.error_dict - elif not hasattr(message, "message"): - message = message.error_list - else: - message, code, params = message.message, message.code, message.params - - if isinstance(message, dict): - self.error_dict = {} - for field, messages in message.items(): - if not isinstance(messages, ValidationError): - messages = ValidationError(messages) - self.error_dict[field] = messages.error_list - - elif isinstance(message, list): - self.error_list = [] - for message in message: - # Normalize plain strings to instances of ValidationError. 
- if not isinstance(message, ValidationError): - message = ValidationError(message) - if hasattr(message, "error_dict"): - self.error_list.extend(sum(message.error_dict.values(), [])) - else: - self.error_list.extend(message.error_list) - - else: - self.message = message - self.code = code - self.params = params - self.error_list = [self] - - @property - def message_dict(self): - # Trigger an AttributeError if this ValidationError - # doesn't have an error_dict. - getattr(self, "error_dict") - - return dict(self) - - @property - def messages(self): - if hasattr(self, "error_dict"): - return sum(dict(self).values(), []) - return list(self) - - def update_error_dict(self, error_dict): - if hasattr(self, "error_dict"): - for field, error_list in self.error_dict.items(): - error_dict.setdefault(field, []).extend(error_list) - else: - error_dict.setdefault(NON_FIELD_ERRORS, []).extend(self.error_list) - return error_dict - - def __iter__(self): - if hasattr(self, "error_dict"): - for field, errors in self.error_dict.items(): - yield field, list(ValidationError(errors)) - else: - for error in self.error_list: - message = error.message - if error.params: - message %= error.params - yield str(message) - - def __str__(self): - if hasattr(self, "error_dict"): - return repr(dict(self)) - return repr(list(self)) - - def __repr__(self): - return "ValidationError(%s)" % self - - def __eq__(self, other): - if not isinstance(other, ValidationError): - return NotImplemented - return hash(self) == hash(other) - - def __hash__(self): - if hasattr(self, "message"): - return hash( - ( - self.message, - self.code, - make_hashable(self.params), - ) - ) - if hasattr(self, "error_dict"): - return hash(make_hashable(self.error_dict)) - return hash(tuple(sorted(self.error_list, key=operator.attrgetter("message")))) - - -def make_hashable(value): - """ - Attempt to make value hashable or raise a TypeError if it fails. - - The returned value should generate the same hash for equal values. 
- """ - if isinstance(value, dict): - return tuple([ - (key, make_hashable(nested_value)) - for key, nested_value in sorted(value.items()) - ]) - # Try hash to avoid converting a hashable iterable (e.g. string, frozenset) - # to a tuple. - try: - hash(value) - except TypeError: - if is_iterable(value): - return tuple(map(make_hashable, value)) - # Non-hashable, non-iterable. - raise - return value - - -def is_iterable(x): - """ - An implementation independent way of checking for iterables - """ - try: - iter(x) - except TypeError: - return False - else: - return True \ No newline at end of file +class ValidationError(ValueError): + None \ No newline at end of file diff --git a/mavecore/validation/general_validators.py b/mavecore/validation/general_validators.py deleted file mode 100644 index e2962cf..0000000 --- a/mavecore/validation/general_validators.py +++ /dev/null @@ -1,73 +0,0 @@ -# note: FileExtensionValidator is from Django -from pathlib import Path -from mavecore.validation.exceptions import ValidationError -# validate_csv_extension -# validate_gz_extension -# validate_json_extension - -validate_csv_extension = FileExtensionValidator(allowed_extensions=["csv"]) -validate_gz_extension = FileExtensionValidator(allowed_extensions=["gz"]) -validate_json_extension = FileExtensionValidator(allowed_extensions=["json"]) - -class FileExtensionValidator: - # TODO, may need to edit validation error, will try to replicate Django error first - """ - This class validates file extensions and will replace the Django validator of - the same name. - - From Django: - Raises a ValidationError with a code of 'invalid_extension' if the extension of - value.name (value is a File) isn’t found in allowed_extensions. The extension is - compared case-insensitively with allowed_extensions. - """ - message = _("File extension “%(extension)s” is not allowed. " - "Allowed extensions are: %(allowed_extensions)s." 
- ) - code = "invalid_extension" - - def __init__(self, allowed_extensions=None, message=None, code=None): - """ - This constructor sets the values of the FileExtensionValidator. - - Parameters - __________ - allowed_extensions : List[str] - A list of allowed file extensions. - message : str - (default = None) The message assigned to the message attribute. - code : - (default = None) The code assigned to the code attribute. - """ - if allowed_extensions is not None: - allowed_extensions = [ - allowed_extension.lower() for allowed_extension in allowed_extensions - ] - self.allowed_extensions = allowed_extensions - if message is not None: - self.message = message - if code is not None: - self.code = code - - def __call__(self, value): - extension = Path(value.name).suffix[1:].lower() - if ( - self.allowed_extensions is not None - and extension not in self.allowed_extensions - ): - raise ValidationError( - self.message, - code=self.code, - params={ - "extension": extension, - "allowed_extensions": ", ".join(self.allowed_extensions), - "value": value, - }, - ) - - def __eq__(self, other): - return ( - isinstance(other, self.__class__) - and self.allowed_extensions == other.allowed_extensions - and self.message == other.message - and self.code == other.code - ) \ No newline at end of file diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py deleted file mode 100644 index c8f0c40..0000000 --- a/mavecore/validation/genome_validators.py +++ /dev/null @@ -1,590 +0,0 @@ -# TODO Django dependent, Django forms, whole file needs to be refactored -""" -Validator functions for the fields of the following classes: - WildTypeSequence - ReferenceGenome - TargetGene - ReferenceMap - GenomicInterval - -Most validation should validate one specific field, unless fields need -to be validated against each other. 
-""" -from fqfa.validator.validator import dna_bases_validator, amino_acids_validator -from mavecore.validation.exceptions import ValidationError - -from mavecore.validation import constants - -from mavecore.validation.utilities import is_null - - -# min_start_validator = MinValueValidator( -# 1, message=_("Start coordinate must be a positive integer.") -# ) -# min_end_validator = MinValueValidator( -# 1, message=_("End coordinate must be a positive integer.") -# ) - - -class WildTypeSequence: - """ - Basic model specifying a wild-type sequence. - - Parameters - ---------- - sequence : `models.CharField` - The wild type DNA sequence that is related to the `target`. Will - be converted to upper-case upon instantiation. - - sequence_type : `models.CharField` - Protein sequence (amino acids) or DNA (nucleotides) - """ - - class SequenceType: - """ """ - - DNA = "dna" - PROTEIN = "protein" - INFER = "infer" - - @classmethod - def detect_sequence_type(cls, sequence): - # TODO - # confirm sequence parameter type - """ - This function determines if the sequence is a DNA or protein sequence and - returns "dna" if it is DNA or "protein" if it is protein. An error is raised - if it is neither. - - Parameters - __________ - sequence : str - - Returns - _______ - str - "dna" or "protein" depending on if the sequence is a DNA or protein sequence. - - Raises - ______ - ValueError - If sequence parameter is not protein or DNA. - """ - if sequence_is_dna(sequence): - return cls.DNA - elif sequence_is_protein(sequence): - return cls.PROTEIN - else: - raise ValueError( - f"Unknown sequence '{sequence}'. It is not protein or DNA." 
- ) - - @classmethod - def is_protein(cls, value): - """ - - Parameters - __________ - value : - - Returns - _______ - - """ - return value == cls.PROTEIN - - @classmethod - def is_dna(cls, value): - """ - - Parameters - __________ - value : - - Returns - _______ - - """ - return value == cls.DNA - - @classmethod - def choices(cls): - """ - - Returns - _______ - """ - return [(cls.INFER, "Infer"), (cls.DNA, "DNA"), (cls.PROTEIN, "Protein")] - - class Meta: - """ """ - - verbose_name = "Reference sequence" - verbose_name_plural = "Reference sequences" - - def __str__(self): - """ - - Returns - _______ - - """ - return self.get_sequence() - - # sequence = models.TextField( - # default=None, - # blank=False, - # null=False, - # verbose_name="Reference sequence", - # validation=[validate_wildtype_sequence], - # ) - # sequence_type = models.CharField( - # blank=True, - # null=False, - # default=SequenceType.INFER, - # verbose_name="Reference sequence type", - # max_length=32, - # choices=SequenceType.choices(), - # ) - - @property - def is_dna(self): - """ - - Returns - _______ - - """ - return self.__class__.SequenceType.is_dna(self.sequence_type) - - @property - def is_protein(self): - """ - - Returns - _______ - - """ - return self.__class__.SequenceType.is_protein(self.sequence_type) - - def save(self, *args, **kwargs): - """ - - Parameters - __________ - args : - kwargs : - - Returns - _______ - - """ - if self.sequence is not None: - self.sequence = self.sequence.upper() - self.sequence_type = ( - (self.__class__.SequenceType.detect_sequence_type(self.sequence)) - if self.__class__.SequenceType.INFER - else self.sequence_type - ) - - return super().save(*args, **kwargs) - - def get_sequence(self): - """ - - Returns - _______ - - """ - return self.sequence.upper() - - def is_attached(self): - """ - - Returns - _______ - - """ - return getattr(self, "target", None) is not None - - -# GenomicInterval -# 
------------------------------------------------------------------------- # -def validate_interval_start_lteq_end(start, end): - """ - This function validates whether or not an interval's starting coordinate is less than - or equal to that interval's ending coordinate. - - Parameters - __________ - start : int - The interval's starting coordinate. - end : int - The interval's ending coordinate. - - Returns - _______ - None - If start is NoneType or end is NoneType. - - Raises - ______ - ValidationError - If an interval's starting coordinate is greater than the ending coordinate. - """ - # Intervals may be underspecified, but will be ignored so skip validation. - if start is None or end is None: - return - if start > end: - raise ValidationError( - ( - "An interval's starting coordinate cannot be greater than the " - "ending coordinate." - ) - ) - - -def validate_strand(value): - # TODO - # find the type of value - """ - This function validates a GenomicInterval strand and raises an error if the strand is invalid. - - Parameters - __________ - value : - The Genomic Interval strand to be validated. - - Raises - ______ - ValidationError - If GenomicInterval strand is not positive or negative. - """ - if value not in ("+", "-"): - raise ValidationError("GenomicInterval strand must be either '+' or '-'") - - -def validate_chromosome(value): - # TODO - # add description and type for value parameter - """ - - Parameters - __________ - value : - - Returns - _______ - None - If value is NoneType. - - Raises - ______ - ValidationError - If chromosome identifier is null. - """ - # Intervals may be underspecified, but will be ignored so skip validation. 
- if value is None: - return - if is_null(value): - raise ValidationError("Chromosome identifier must not be null.") - - -def validate_unique_intervals(intervals): - # TODO - # add description and interval parameter type plus description - """ - - Parameters - __________ - intervals : - - Raises - ______ - ValidationError - If the same interval was specified twice. - """ - for interval1 in intervals: - for interval2 in intervals: - if ( - (interval1.pk is not None) - and (interval2.pk is not None) - and (interval1.pk == interval2.pk) - ): - continue - elif interval1 is interval2: - continue - elif interval1.equals(interval2): - raise ValidationError("You can not specify the same interval twice.") - - -# WildTypeSequence -# ------------------------------------------------------------------------- # -def validate_wildtype_sequence(seq, as_type="any"): - # TODO - # add description to as_type parameter - """ - This function checks whether or not seq is a wildtype sequence. - - Parameters - __________ - seq : str - The sequence being validated. - as_type : str - (default = "any") - - Raises - ______ - ValidationError - If seq is not a valid wild type sequence. - ValidationError - If seq is not a valid DNA or protein reference sequence. - """ - # from .models import WildTypeSequence - - # Explicitly check for these cases as they are also valid AA sequences. - if is_null(seq): - raise ValidationError( - "'%(seq)s' is not a valid wild type sequence." # , params={"seq": seq} - ) - - seq = seq.upper() - is_dna = dna_bases_validator(seq) is not None - is_aa = amino_acids_validator(seq) is not None - - if as_type == WildTypeSequence.SequenceType.DNA and not is_dna: - raise ValidationError( - "'%(seq)s' is not a valid DNA reference sequence." # , - # params={"seq": seq}, - ) - elif as_type == WildTypeSequence.SequenceType.PROTEIN and not is_aa: - raise ValidationError( - "'%(seq)s' is not a valid protein reference sequence." 
# , - # params={"seq": seq}, - ) - elif (as_type == "any" or WildTypeSequence.SequenceType.INFER) and not ( - is_dna or is_aa - ): - raise ValidationError( - "'%(seq)s' is not a valid DNA or protein reference sequence." # , - # params={"seq": seq}, - ) - - -def sequence_is_dna(seq): - """ - This function checks if seq is a DNA sequence. - - Parameters - __________ - seq : str - The sequence to be validated. - - Returns - _______ - bool - True if the dna_bases_validator returns a match object. - """ - # Explicitly check for these cases as they are also valid AA sequences. - if is_null(seq): - return False - seq = seq.upper() - return dna_bases_validator(seq) is not None - - -def sequence_is_protein(seq): - """ - This function check if seq is a protein sequence. - - Parameters - __________ - seq : str - The sequence being validated. - - Returns - _______ - bool - True if seq is not null, is a DNA sequence or amino_acids_validator returns a match object. - """ - # Explicitly check for these cases as they are also valid AA sequences. - if is_null(seq): - return False - seq = seq.upper() - if dna_bases_validator(seq) is not None: - return False # Very likely a DNA sequence if only ATG - return amino_acids_validator(seq) is not None - - -# ReferenceGenome -# ------------------------------------------------------------------------- # -def validate_organism_name(organism_name): - # TODO - # confirm organism_name type - """ - This function validates the organism name by checking that the name is not null. - - Parameters - __________ - organism_name : str - The organism name to be validated. - - Raises - ______ - ValidationError - If the organism name is null. - """ - if is_null(organism_name): - raise ValidationError("Species name must not be null.") - - -def validate_reference_genome_has_one_external_identifier(referencegenome): - # TODO - # revise description, make sure it is accurate - # anything greater than 0 will return True, so should it be == 1 or > 0? 
- # determine what type referencegenome is - """ - This function validates whether or not the reference genome has one external identifier. - An error is raised if - - Parameters - __________ - referencegenome : - - Raises - ______ - ValidationError - If - """ - if not referencegenome.genome_id: - raise ValidationError( - "Only one external identifier can be specified for a reference" "genome." - ) - - -def validate_genome_short_name(value): - # TODO - # confirm the type of the value parameter - """ - This function validates the genome short name and raises an error if the value is null. - - Parameters - __________ - value : str - The genome short name to be validated. - - Raises - ______ - ValidationError - If the genome short name is null. - """ - if is_null(value): - raise ValidationError("Genome short name must not be null.") - - -# ReferenceMap -# ------------------------------------------------------------------------- # -def validate_map_has_unique_reference_genome(annotations): - # TODO - # check the type of annotations - # add description to annotations parameter - """ - This function validates whether or not each map in annotations has a - unique reference genome and raises an error if this is not the case. - - Parameters - __________ - annotations : - - Raises - ______ - ValidationError - If each reference map does not specify a different reference genome. - """ - genomes = set([str(a.get_reference_genome_name()).lower() for a in annotations]) - if len(genomes) < len(annotations): - raise ValidationError( - "Each reference map must specify a different reference genome." - ) - - -def validate_map_has_at_least_one_interval(reference_map): - """ - This function validates that a reference map has at least one interval and raises an error - if this is not the case. - - Parameters - __________ - reference_map : - Reference map. - - Raises - ______ - ValidationError - If the reference_map does not have at least one interval. 
- """ - if not reference_map.get_intervals().count(): - raise ValidationError( - "You must specify at least one interval for each reference map." - ) - - -def validate_at_least_one_map(reference_maps): - """ - This function validates whether a target has at least one reference map specified - and raises an error if it does not. - - Parameters - __________ - reference_maps : - - - Raises - ______ - ValidationError - If the target does not have at least one reference map specified. - """ - if not len(reference_maps): - raise ValidationError( - "A target must have at least one reference map specified." - ) - - -def validate_one_primary_map(reference_maps): - """ - This function validates the existence of one primary reference map and raises an error - if it does not exist. - - Parameters - __________ - reference_maps : - - Raises - ______ - ValidationError - If target has less than or more than one primary reference map. - """ - primary_count = sum(a.is_primary_reference_map() for a in reference_maps) - if primary_count > 1 or primary_count < 1: - raise ValidationError("A target must have one primary reference map.") - - -# TargetGene -# ------------------------------------------------------------------------- # -def validate_gene_name(gene_name): - # TODO - # confirm gene_name type - """ - This function checks to see if a gene name is null and raises and error if it is. - - Parameters - __________ - gene_name : str - The gene name. - - Raises - ______ - ValidationError - If gene name (value parameter) is null. 
- """ - if is_null(gene_name): - raise ValidationError("Gene name must not be null.") diff --git a/mavecore/validation/metadata_validators.py b/mavecore/validation/metadata_validators.py index cb0e450..70b731b 100644 --- a/mavecore/validation/metadata_validators.py +++ b/mavecore/validation/metadata_validators.py @@ -1,4 +1,3 @@ -# TODO Django dependent, Django forms, whole file needs to be refactored import idutils from mavecore.validation.exceptions import ValidationError @@ -6,6 +5,19 @@ def validate_sra_identifier(identifier): + """ + Validates whether the identifier is a valid SRA identifier. + + Parameters + __________ + identifier: str + The identifier to be validated. + + Raises + ______ + ValidationError + If the identifier is not a valid SRA identifier. + """ if not ( idutils.is_sra(identifier) or idutils.is_bioproject(identifier) @@ -43,9 +55,17 @@ def validate_keyword(kw): def validate_pubmed_identifier(identifier): """ + Validates whether the identifier is a valid PubMed identifier. - :param identifier: - :return: + Parameters + __________ + identifier: str + The identifier to be validated. + + Raises + ______ + ValidationError + If the identifier is not a valid PubMed identifier. """ if not idutils.is_pmid(identifier): raise ValidationError(f"'{identifier} is not a valid PubMed identifier.") @@ -53,9 +73,17 @@ def validate_pubmed_identifier(identifier): def validate_doi_identifier(identifier): """ + Validates whether the identifier is a valid DOI identifier. + + Parameters + __________ + identifier: str + The identifier to be validated. - :param identifier: - :return: + Raises + ______ + ValidationError + If the identifier is not a valid DOI identifier. """ if not idutils.is_doi(identifier): raise ValidationError(f"'{identifier}' is not a valid DOI.") @@ -63,9 +91,17 @@ def validate_doi_identifier(identifier): def validate_ensembl_identifier(identifier): """ + Validates whether the identifier is a valid Ensembl identifier. 
- :param identifier: - :return: + Parameters + __________ + identifier: str + The identifier to be validated. + + Raises + ______ + ValidationError + If the identifier is not a valid Ensembl identifier. """ if not idutils.is_ensembl(identifier): raise ValidationError(f"'{identifier}' is not a valid Ensembl accession.") @@ -73,9 +109,17 @@ def validate_ensembl_identifier(identifier): def validate_uniprot_identifier(identifier): """ + Validates whether the identifier is a valid UniProt identifier. - :param identifier: - :return: + Parameters + __________ + identifier: str + The identifier to be validated. + + Raises + ______ + ValidationError + If the identifier is not a valid UniProt identifier. """ if not idutils.is_uniprot(identifier): raise ValidationError(f"'{identifier}' is not a valid UniProt accession.") @@ -83,9 +127,17 @@ def validate_uniprot_identifier(identifier): def validate_refseq_identifier(identifier): """ + Validates whether the identifier is a valid RefSeq identifier. + + Parameters + __________ + identifier: str + The identifier to be validated. - :param identifier: - :return: + Raises + ______ + ValidationError + If the identifier is not a valid RefSeq identifier. """ if not idutils.is_refseq(identifier): raise ValidationError(f"'{identifier}' is not a valid RefSeq accession.") @@ -93,9 +145,17 @@ def validate_refseq_identifier(identifier): def validate_genome_identifier(identifier): """ + Validates whether the identifier is a valid genome identifier. + + Parameters + __________ + identifier: str + The identifier to be validated. - :param identifier: - :return: + Raises + ______ + ValidationError + If the identifier is not a valid genome identifier. """ if not idutils.is_genome(identifier): raise ValidationError( @@ -121,8 +181,17 @@ def validate_keyword_list(values): def validate_pubmed_list(values): """ - :param values: - :return: + Validates whether each identifier in a list of identifiers (values) is a valid PubMed identifier. 
+ + Parameters + __________ + values: List[str] + The list of identifiers to be validated. + + Raises + ______ + ValidationError + If at least one of the identifiers is not a valid PubMed identifier. """ for value in values: if not is_null(value): @@ -131,9 +200,17 @@ def validate_pubmed_list(values): def validate_sra_list(values): """ + Validates whether each identifier in a list of identifiers (values) is a valid SRA identifier. - :param values: - :return: + Parameters + __________ + values: List[str] + The list of identifiers to be validated. + + Raises + ______ + ValidationError + If at least one of the identifiers is not a valid SRA identifier. """ for value in values: if not is_null(value): @@ -142,9 +219,17 @@ def validate_sra_list(values): def validate_doi_list(values): """ + Validates whether each identifier in a list of identifiers (values) is a valid DOI identifier. - :param values: - :return: + Parameters + __________ + values: List[str] + The list of identifiers to be validated. + + Raises + ______ + ValidationError + If at least one of the identifiers is not a valid DOI identifier. """ for value in values: if not is_null(value): @@ -153,9 +238,17 @@ def validate_doi_list(values): def validate_ensembl_list(values): """ + Validates whether each identifier in a list of identifiers (values) is a valid Ensembl identifier. + + Parameters + __________ + values: List[str] + The list of identifiers to be validated. - :param values: - :return: + Raises + ______ + ValidationError + If at least one of the identifiers is not a valid Ensembl identifier. """ for value in values: if not is_null(value): @@ -164,9 +257,17 @@ def validate_ensembl_list(values): def validate_refseq_list(values): """ + Validates whether each identifier in a list of identifiers (values) is a valid RefSeq identifier. + + Parameters + __________ + values: List[str] + The list of identifiers to be validated.
- :param values: - :return: + Raises + ______ + ValidationError + If at least one of the identifiers is not a valid RefSeq identifier. """ for value in values: if not is_null(value): @@ -175,9 +276,17 @@ def validate_refseq_list(values): def validate_uniprot_list(values): """ + Validates whether each identifier in a list of identifiers (values) is a valid UniProt identifier. + + Parameters + __________ + values: List[str] + The list of identifiers to be validated. - :param values: - :return: + Raises + ______ + ValidationError + If at least one of the identifiers is not a valid UniProt identifier. """ for value in values: if not is_null(value): diff --git a/mavecore/validation/validate.py b/mavecore/validation/validate.py deleted file mode 100644 index b138c9a..0000000 --- a/mavecore/validation/validate.py +++ /dev/null @@ -1,69 +0,0 @@ -from mavecore.validation import dataset_validators - - -def validate_all(countfile=None, scorefile=None, scorejson=None): - """ - By calling other helper functions, this function runs all of the validation code. - - Parameters - __________ - countfile : - scorefile : - scorejson : - - """ - validate_dataset(countfile, scorefile, scorejson) - - -def validate_dataset(countfile=None, scorefile=None, scorejson=None): - """ - This function calls all of the validation functions within - mavetools/mavetools/validation/dataset_validation.py - - Parameters - __________ - countfile : - scorefile : - scorejson : - - Returns - ------- - - """ - - # how to incorporate word limit validator?
- - if scorefile is not None: - # open scorefile - open(scorefile) - # this one returns header - scoreheader = dataset_validators.read_header_from_io(file=scorefile) - - # if the header was returned, do these ones - dataset_validators.validate_has_hgvs_in_header(header=scoreheader) - dataset_validators.validate_at_least_one_additional_column(header=scoreheader) - dataset_validators.validate_header_contains_no_null_columns(header=scoreheader) - - dataset_validators.validate_scoreset_score_data_input(file=scorefile) - - if scorejson is not None: - # open scorejson - open(scorejson) - dataset_validators.validate_scoreset_json(dict_=scorejson) - - if countfile is not None: - # open countfile - open(countfile) - countheader = dataset_validators.read_header_from_io(file=countfile) - - # if the header was returned, do these ones - dataset_validators.validate_has_hgvs_in_header(header=countheader) - dataset_validators.validate_at_least_one_additional_column(header=countheader) - dataset_validators.validate_header_contains_no_null_columns(header=countheader) - - dataset_validators.validate_scoreset_count_data_input(file=countfile) - - if scorefile is not None and countfile is not None: - dataset_validators.validate_datasets_define_same_variants( - scores=scorefile, counts=countfile - ) diff --git a/mavecore/validation/variant_validators/__init__.py b/mavecore/validation/variant_validators/__init__.py index 1f7aca1..5bba70b 100644 --- a/mavecore/validation/variant_validators/__init__.py +++ b/mavecore/validation/variant_validators/__init__.py @@ -1,5 +1,3 @@ -from .dataset import MaveDataset, MaveCountsDataset, MaveScoresDataset - from .hgvs import ( validate_nt_variant, validate_pro_variant, @@ -10,7 +8,6 @@ from .variant import validate_columns_match, validate_variant_json __all__ = [ - "dataset", "variant", "hgvs", "validate_nt_variant", @@ -19,7 +16,4 @@ "validate_hgvs_string", "validate_columns_match", "validate_variant_json", - "MaveCountsDataset", - 
"MaveScoresDataset", - "MaveDataset", ] diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py deleted file mode 100644 index d6c7288..0000000 --- a/mavecore/validation/variant_validators/dataset.py +++ /dev/null @@ -1,1040 +0,0 @@ -# TODO Django dependent, whole file will need to be refactored -import re -from collections import defaultdict -from io import StringIO -from itertools import groupby -from operator import itemgetter -from typing import Union, Optional, Tuple, List, TextIO, BinaryIO, Set, Dict - -import pandas as pd -import numpy as np -from mavehgvs import MaveHgvsParseError, Variant -from fqfa.util.translate import translate_dna -from fqfa.util.infer import infer_sequence_type - -from mavecore.validation.constants import ( - hgvs_nt_column, - hgvs_splice_column, - hgvs_pro_column, - required_score_column, - null_values_list, - null_values_re, - readable_null_values_list, -) - -from mavecore.validation.utilities import is_null - - -class MaveDataset: - # TODO Django dependent - """ - - class DatasetType: - # TODO - """ """ - SCORES = "scores" - COUNTS = "counts" - - class HGVSColumns: - # TODO - """ """ - NUCLEOTIDE: str = hgvs_nt_column - TRANSCRIPT: str = hgvs_splice_column - PROTEIN: str = hgvs_pro_column - - @classmethod - def options(cls) -> List[str]: - # TODO Django dependent - """ - - Returns - _______ - List[str] - """ - return [cls.NUCLEOTIDE, cls.TRANSCRIPT, cls.PROTEIN] - - class AdditionalColumns: - # TODO Django dependent - """ - @classmethod - def options(cls) -> List[str]: - # TODO Django dependent - """ - - Returns - _______ - List[str] - """ - return [] - - # ---------------------- Construction------------------------------------ # - @classmethod - def for_scores(cls, file: Union[str, TextIO, BinaryIO]) -> "MaveScoresDataset": - # TODO Django dependent - """ - - Parameters - __________ - file : Union[str, TextIO, BinaryIO] - - Returns - _______ - `MaveScoresDataset` - - """ - 
return cls._for_type(file=file, dataset_type=cls.DatasetType.SCORES) - - @classmethod - def for_counts(cls, file: Union[str, TextIO, BinaryIO]) -> "MaveCountsDataset": - # TODO Django dependent - """ - - Parameters - __________ - file : Union[str, TextIO, BinaryIO] - - Returns - _______ - `MaveCountsDataset` - """ - return cls._for_type(file=file, dataset_type=cls.DatasetType.COUNTS) - - @classmethod - def _for_type( - cls, file: Union[str, TextIO, BinaryIO], dataset_type: str - ) -> Union["MaveScoresDataset", "MaveCountsDataset"]: - # TODO Django dependent - """ - - Parameters - __________ - file : Union[str, TextIO, BinaryIO] - dataset_type : str - - Returns - _______ - Union[`MaveScoreDataset`, `MaveCountsDataset`] - - Raises - ______ - TypeError - If file parameter is not expected file path or buffer object. - ValueError - If dataset_type parameter is not a recognized dataset type. - """ - if isinstance(file, str): - handle = file - elif hasattr(file, "read"): - file_contents = file.read() - if hasattr(file_contents, "decode"): - file_contents = file_contents.decode("utf-8") - file_contents = file_contents.strip() - handle = StringIO(file_contents) - else: - raise TypeError( - f"Expected file path or buffer object. 
" f"Got '{type(file).__name__}'" - ) - - extra_na_values = set( - list(null_values_list) - + [str(x).lower() for x in null_values_list] - + [str(x).upper() for x in null_values_list] - + [str(x).capitalize() for x in null_values_list] - ) - - df = pd.read_csv( - filepath_or_buffer=handle, - sep=",", - encoding="utf-8", - quotechar='"', - comment="#", - na_values=extra_na_values, - keep_default_na=True, - dtype={ - **{c: str for c in cls.HGVSColumns.options()}, - MaveScoresDataset.AdditionalColumns.SCORES: float, - }, - ).replace(null_values_re, np.NaN) - - if dataset_type == cls.DatasetType.SCORES: - return MaveScoresDataset(df) - elif dataset_type == cls.DatasetType.COUNTS: - return MaveCountsDataset(df) - else: - raise ValueError(f"'{dataset_type}' is not a recognised dataset type.") - - # ---------------------- Public ----------------------------------------- # - @property - def label(self) -> str: - # TODO Django dependent - """ - - Returns - _______ - str - """ - return "dataset" - - @property - def is_valid(self) -> Optional[bool]: - # TODO Django dependent - """ - - Returns - _______ - Optional[bool] - """ - if self._errors is None: - return None - return len(self._errors) == 0 - - @property - def n_errors(self) -> Optional[int]: - # TODO Django dependent - """ - - Returns - _______ - Optional[int] - """ - if self._errors is None: - return None - return len(self._errors) - - @property - def errors(self) -> Optional[List[str]]: - # TODO Django dependent - """ - - Returns - _______ - Optional[List[str]] - """ - return self._errors - - @property - def is_empty(self) -> bool: - # TODO Django dependent - """ - - Returns - _______ - bool - """ - return self._df.empty - - @property - def columns(self) -> List[str]: - # TODO Django dependent - """ - - Returns - _______ - List[str] - """ - return list(self._df.columns) - - @property - def hgvs_columns(self) -> List[str]: - # TODO Django dependent - """ - - Returns - _______ - List[str] - """ - return [c for c in 
self.columns if c in self.HGVSColumns.options()] - - @property - def non_hgvs_columns(self) -> List[str]: - # TODO Django dependent - """ - - Returns - _______ - List[str] - """ - return [c for c in self.columns if c not in self.HGVSColumns.options()] - - @property - def n_rows(self) -> int: - # TODO Django dependent - """ - - Returns - _______ - int - """ - return len(self._df) - - @property - def n_columns(self) -> int: - # TODO Django dependent - """ - - Returns - _______ - int - """ - return len(self.columns) - - @property - def index_column(self) -> Optional[str]: - # TODO Django dependent - """ - - Returns - _______ - Optional[str] - """ - if self._errors: - return None - return self._index_column - - @property - def index(self) -> Optional[pd.Index]: - # TODO Django dependent - """ - - Returns - _______ - Optional[`pd.Index`] - """ - if self._errors: - return None - return self._df.index.copy(deep=True) - - def data(self, serializable=False) -> pd.DataFrame: - # TODO Django dependent - """ - Return underlying dataframe object. - - Parameters - ---------- - serializable: bool - Replaces `np.NaN` with `None` for JSON compatibility. - - Returns - _______ - `pd.DataFrame` - - """ - if serializable: - # need to force "object" type to allow None values - return_df = self._df.astype(object, copy=True) - return_df.where(cond=pd.notnull(return_df), other=None, inplace=True) - return return_df - return self._df.copy(deep=True) - - def match_other(self, other: "MaveDataset") -> Optional[bool]: - # TODO Django dependent - """ - Check that each dataset defined the same variants in each column. - - Parameters - ---------- - other: MaveDataset - Validator instance to match against. - - Returns - ------- - Optional[bool] - A boolean indicating index match, otherwise `None` if either instance - is not valid. 
- """ - if (not self.is_valid) or (not other.is_valid): - return None - - if self.index_column != other.index_column: - return False - - return all( - self._df[column].equals(other._df[column]) - for column in self.HGVSColumns.options() - ) - - def to_dict(self) -> Dict[str, Dict]: - # TODO Django dependent - """ - Returns underlying dataframe as dictionary in 'records' orientation. - Keys will be index values and values will be an inner dictionary mapping - column names to row values for said index. - - Returns - _______ - Dict[str, Dict] - """ - # Convert np.NaN values to None for consistency across all columns and - # for compatibility in PostgresSQL queries. Replaces all values which - # are considered null by pandas with None by masking pd.notnull cells. - - return self.data(serializable=True).to_dict(orient="index") - - def validate( - self, - targetseq: Optional[str] = None, - relaxed_ordering: bool = False, - allow_index_duplicates: bool = False, - ) -> "MaveDataset": - # TODO Django dependent - """ - - Parameters - __________ - targetseq : - relaxed_ordering : - allow_index_duplicates : - - Returns - _______ - `MaveDataset` - - Raises - ______ - - """ - - self._errors = [] - self._df.index = pd.RangeIndex(start=0, stop=self.n_rows, step=1) - self._index_column = None - - self._validate_columns() - # Only attempt to validate variants if columns are valid - if not self._errors: - ( - self._normalize_data() - ._validate_genomic_variants(targetseq, relaxed_ordering) - ._validate_transcript_variants(targetseq, relaxed_ordering) - ._validate_protein_variants(targetseq, relaxed_ordering) - ._validate_index_column(allow_duplicates=allow_index_duplicates) - ) - - if self.is_empty: - self._errors.append( - f"No variants could be parsed from your {self.label} file. " - f"Please upload a non-empty file." 
- ) - return self - - if not self._errors: - # Set index last as original index is used when indicating duplicate - # hgvs string row numbers in the column name used as the index ( - # either hgvs_nt when present or hgvs_pro when hgvs_nt is absent). - self._df.index = pd.Index(self._df[self.index_column]) - - return self - - # ---------------------- Private ---------------------------------------- # - def __init__( - self, - df: Optional[pd.DataFrame] = None, - index_column: Optional[str] = None, - errors: Optional[List[str]] = None, - ): - # TODO Django dependent - """ - - Parameters - df : - index_column : - errors : - - Raises - ______ - - """ - self._df: pd.DataFrame = pd.DataFrame() if df is None else df - self._index_column = index_column or None - self._errors = None if errors is None else list(errors) - - def __repr__(self): - # TODO Django dependent - """ - - Returns - _______ - - """ - return ( - f"<" - f"{self.__class__.__name__} " - f"columns={self.columns} " - f"index={self.index_column} " - f"valid={self.is_valid}" - f">" - ) - - @property - def _column_order(self) -> Dict[str, int]: - # TODO Django dependent - """ - - Returns - _______ - Dict[str, int] - """ - return defaultdict( - lambda: 100, - { - self.HGVSColumns.NUCLEOTIDE: 0, - self.HGVSColumns.TRANSCRIPT: 1, - self.HGVSColumns.PROTEIN: 2, - **{ - c: (2 + i) - for (i, c) in enumerate(self.AdditionalColumns.options(), start=1) - }, - }, - ) - - def _validate_columns(self) -> "MaveDataset": - # TODO Django dependent - """ - - Returns - _______ - `MaveDataset` - - Raises - ______ - - """ - if self._errors: - return self - - # Pandas will automatically name blank columns using the pattern below - unnamed = re.compile(r"^Unnamed: \d+$", flags=re.IGNORECASE) - columns = self.columns - if any(is_null(h) or unnamed.match(h) for h in columns): - self._errors.append( - f"Column names in your {self.label} file cannot values " - f"considered null such as the following: " - f"{', 
'.join(readable_null_values_list)}" - ) - - columns = [c for c in columns if not is_null(c)] - if len(columns) < 1: - self._errors.append( - f"No columns could not be parsed from your {self.label} file. " - "Make sure columns are comma delimited. Column names with " - "commas must be escaped by enclosing them in double quotes" - ) - - required = {self.HGVSColumns.NUCLEOTIDE, self.HGVSColumns.PROTEIN} - if not (set(columns) & required): - self._errors.append( - f"Your {self.label} file must define either a nucleotide " - f"hgvs column '({self.HGVSColumns.NUCLEOTIDE})' " - f"or a protein hgvs column '({self.HGVSColumns.PROTEIN})'. " - f"Columns are case-sensitive and must be comma delimited" - ) - - if not (set(columns) - set(self.HGVSColumns.options())): - self._errors.append( - f"Your {self.label} file must define at least one additional " - f"column different from '{self.HGVSColumns.NUCLEOTIDE}', " - f"'{self.HGVSColumns.TRANSCRIPT}' and " - f"'{self.HGVSColumns.PROTEIN}'" - ) - - return self - - def _normalize_data(self) -> "MaveDataset": - # TODO Django dependent - """ - - Returns - _______ - `MaveDataset` - """ - if self._errors: - return self - - # Initialize missing hgvs columns as empty. 
- for c in self.HGVSColumns.options(): - if c not in self.columns: - self._df[c] = np.NaN - - column_order = self._column_order - sorted_columns = list(sorted(self.columns, key=lambda x: column_order[x])) - - self._df = self._df[sorted_columns] - return self - - def _validate_genomic_variants( - self, targetseq: Optional[str] = None, relaxed_ordering: bool = False - ) -> "MaveDataset": - # TODO Django dependent - """ - - Parameters - __________ - targetseq : - relaxed_ordering : - - Returns - _______ - `MaveDataset` - - Raises - ______ - - """ - if self._column_is_null(self.HGVSColumns.NUCLEOTIDE): - return self - - defines_transcript_variants = not self._column_is_null( - self.HGVSColumns.TRANSCRIPT - ) - validated_variants, prefixes, errors = self._validate_variants( - column=self.HGVSColumns.NUCLEOTIDE, - splice_defined=defines_transcript_variants, - targetseq=targetseq, - relaxed_ordering=relaxed_ordering, - ) - - if ("c" in prefixes or "n" in prefixes) and "g" in prefixes: - self._errors.append( - f"{self.HGVSColumns.NUCLEOTIDE}: Genomic variants " - f"(prefix 'g.') cannot be mixed with transcript variants " - f"(prefix 'c.' or 'n.')" - ) - - if prefixes == {"g"} and not defines_transcript_variants: - self._errors.append( - f"Transcript variants ('{self.HGVSColumns.TRANSCRIPT}' column) " - f"are required when specifying genomic variants " - f"(prefix 'g.' 
in the 'hgvs_nt' column)" - ) - - self._errors += errors - - if not self._errors: - self._df[self.HGVSColumns.NUCLEOTIDE] = validated_variants - - self._index_column = self.HGVSColumns.NUCLEOTIDE - return self - - def _validate_transcript_variants( - self, targetseq: Optional[str] = None, relaxed_ordering: bool = False - ) -> "MaveDataset": - # TODO Django dependent - """ - - Parameters - __________ - targetseq : - relaxed_ordering : - - Returns - _______ - `MaveDataset` - - Raises - ______ - - """ - defines_nt = not self._column_is_null(self.HGVSColumns.NUCLEOTIDE) - defines_tx = not self._column_is_null(self.HGVSColumns.TRANSCRIPT) - - if defines_tx and (not defines_nt): - self._errors.append( - f"Genomic variants ('{self.HGVSColumns.NUCLEOTIDE}' column) " - f"must be defined when specifying transcript " - f"variants ('{self.HGVSColumns.TRANSCRIPT}' column)" - ) - - if not defines_tx: - return self - - # Don't validate transcript variants against sequence. Might come - # back to this later with research into implementing gene models. 
- validated_variants, _, errors = self._validate_variants( - column=self.HGVSColumns.TRANSCRIPT, - targetseq=None, - relaxed_ordering=relaxed_ordering, - ) - - self._errors += errors - - if not self._errors: - self._df[self.HGVSColumns.TRANSCRIPT] = validated_variants - - return self - - def _validate_protein_variants( - self, targetseq: Optional[str] = None, relaxed_ordering: bool = False - ) -> "MaveDataset": - # TODO Django dependent - """ - - Parameters - __________ - targetseq : - relaxed_ordering : - - Returns - _______ - `MaveDataset` - - Raises - ______ - - """ - if self._column_is_null(self.HGVSColumns.PROTEIN): - return self - - defines_nt = not self._column_is_null(self.HGVSColumns.NUCLEOTIDE) - defines_splice = not self._column_is_null(self.HGVSColumns.TRANSCRIPT) - - if defines_splice: - protein_seq = None - else: - protein_seq = targetseq - if targetseq and "dna" in infer_sequence_type(targetseq).lower(): - protein_seq, remainder = translate_dna(targetseq) - if remainder: - self._errors.insert( - 0, - "Protein variants could not be validated because the " - "length of your target sequence is not a multiple of 3", - ) - - validated_variants, _, errors = self._validate_variants( - column=self.HGVSColumns.PROTEIN, - targetseq=protein_seq, - relaxed_ordering=relaxed_ordering, - ) - - self._errors += errors - - if not self._errors: - self._df[self.HGVSColumns.PROTEIN] = validated_variants - - if not defines_nt: - self._index_column = self.HGVSColumns.PROTEIN - - return self - - def _validate_index_column(self, allow_duplicates: bool = False) -> "MaveDataset": - # TODO Django dependent - """ - - Parameters - __________ - allow_duplicates : bool - - Returns - _______ - `MaveDataset` - - Raises - ______ - - """ - if self._errors: - return self - - if self._index_column is None: - self._index_column = self.HGVSColumns.NUCLEOTIDE - - if self._column_is_partially_null(self._index_column): - self._errors.append( - f"Primary column (inferred as 
'{self._index_column}') " - f"cannot contain any null values from " - f"{', '.join(readable_null_values_list)} (case-insensitive)" - ) - - if not allow_duplicates: - dupes = self._df[self._index_column].duplicated(keep=False) - if np.any(dupes): - dup_list = zip( - self._df.loc[dupes, self._index_column], dupes.index[dupes] - ) - dupes_str = ", ".join( - f"{v}: {[(g[1] + 1) for g in groups]}" # get row numbers - for (v, groups) in groupby(dup_list, key=itemgetter(0)) - ) - self._errors.append( - f"Primary column (inferred as '{self._index_column}') " - f"contains duplicate HGVS variants: {dupes_str}" - ) - - return self - - def _validate_variants( - self, - column: str, - splice_defined: Optional[bool] = None, - targetseq: Optional[str] = None, - relaxed_ordering: bool = False, - ) -> Tuple[pd.Series, Set[str], List[str]]: - # TODO Django dependent - """ - - Parameters - __________ - column : str - splice_defined : Optional[bool] - targetseq : Optional[str] - relaxed_ordering : bool - - Returns - _______ - Tuple[`pd.Series`, Set[str], List[str]] - - Raises - ______ - - """ - - prefixes = set() - errors = [] - - def validate_variant(variant: str): - # TODO Django dependent - # TODO: logic mirrors that in validate_hgvs_string, which is kept - # as a standalone function for backwards compatibility with - # django's model validator field. Merge at some point. 
- - if is_null(variant): - return np.NaN - else: - try: - if variant.lower() == "_sy": - errors.append( - "'_sy' is no longer supported and should be " - "replaced by 'p.(=)'" - ) - return variant - elif variant.lower() == "_wt": - errors.append( - "'_wt' is no longer supported and should be " - "replaced by one of 'g.=', 'c.=' or 'n.='" - ) - return variant - - validated = Variant( - variant, targetseq=targetseq, relaxed_ordering=relaxed_ordering - ) - prefix = validated.prefix.lower() - prefixes.add(prefix) - - prefix_error = self._validate_variant_prefix_for_column( - variant=validated, - prefix=validated.prefix, - column=column, - splice_defined=splice_defined, - ) - if prefix_error: - errors.append(prefix_error) - - return str(validated) - - except MaveHgvsParseError as error: - errors.append(f"{variant}: {str(error)}") - return np.NaN - - validated_variants = self._df[column].apply(validate_variant) - - return validated_variants, prefixes, errors - - def _column_is_null(self, column) -> bool: - # TODO Django dependent - """ - - Parameters - __________ - column : - - Returns - _______ - bool - """ - return len(self._df[self._df[column].isna()]) == len(self._df) - - def _column_is_partially_null(self, column) -> bool: - # TODO Django dependent - """ - - Parameters - __________ - column : - - Returns - _______ - bool - """ - return 0 < len(self._df[self._df[column].isna()]) < len(self._df) - - def _column_is_fully_specified(self, column) -> bool: - # TODO Django dependent - """ - - Parameters - __________ - column : - - Returns - _______ - bool - """ - return len(self._df[self._df[column].isna()]) == 0 - - def _validate_variant_prefix_for_column( - self, variant: Variant, prefix: str, column: str, splice_defined: bool - ) -> Optional[str]: - # TODO Django dependent - """ - - Parameters - __________ - variant : Variant - prefix : str - column : str - splice_defined : bool - - Returns - _______ - Optional[str] - - Raises - ______ - ValueError - If there is an 
unknown column as column argument. - """ - prefix = prefix.lower() - - if column == self.HGVSColumns.NUCLEOTIDE: - if splice_defined: - if prefix not in "g": - return ( - f"{column}: " - f"'{variant}' is not a genomic variant " - f"(prefix 'g.'). Nucleotide variants must " - f"be genomic if transcript variants are " - f"also present" - ) - else: - if prefix not in "cn": - return ( - f"{column}: " - f"'{variant}' is not a transcript variant. " - f"The accepted transcript variant prefixes " - f"are 'c.' or 'n.'" - ) - elif column == self.HGVSColumns.TRANSCRIPT: - if prefix not in "cn": - return ( - f"{column}: " - f"'{variant}' is not a transcript variant. The " - f"accepted transcript variant prefixes are " - f"'c.' or 'n.'" - ) - elif column == self.HGVSColumns.PROTEIN: - if prefix not in "p": - return ( - f"{column}: " - f"'{variant}' is not a protein variant. " - f"The accepted protein variant prefix is 'p.'" - ) - else: - raise ValueError( - f"Unknown column '{column}'. Expected one " - f"of {', '.join(self.HGVSColumns.options())}" - ) - - return None - - -class MaveScoresDataset(MaveDataset): - # TODO - """ """ - class AdditionalColumns: - # TODO - """ """ - SCORES = required_score_column - - @classmethod - def options(cls) -> List[str]: - # TODO Django dependent - """ - - Returns - _______ - List[str] - """ - return [cls.SCORES] - - @property - def label(self) -> str: - # TODO Django dependent - """ - - Returns - _______ - str - """ - return "scores" - - def _validate_columns(self) -> "MaveDataset": - # TODO Django dependent - """ - - Returns - _______ - `MaveDataset` - - Raises - ______ - - """ - super()._validate_columns() - - if self.AdditionalColumns.SCORES not in self.columns: - self._errors.append( - f"Your scores dataset is missing the " - f"'{self.AdditionalColumns.SCORES}' column. 
" - f"Columns are case-sensitive and must be comma delimited" - ) - - return self - - def _normalize_data(self) -> "MaveDataset": - # TODO Django dependent - """ - - Returns - _______ - `MaveDataset` - - Raises - ______ - ValueError - - """ - super()._normalize_data() - - should_be_numeric = [self.AdditionalColumns.SCORES] - for c in should_be_numeric: - if c in self.columns: - try: - self._df[c] = self._df[c].astype(dtype=float, errors="raise") - except ValueError as e: - self._errors.append(f"{c}: {str(e)}") - - return self - - -class MaveCountsDataset(MaveDataset): - # TODO - """ """ - @property - def label(self) -> str: - # TODO Django dependent - """ - - Returns - _______ - str - """ - return "counts" diff --git a/mavecore/validation/variant_validators/hgvs.py b/mavecore/validation/variant_validators/hgvs.py index 4ef39e3..b423ecf 100644 --- a/mavecore/validation/variant_validators/hgvs.py +++ b/mavecore/validation/variant_validators/hgvs.py @@ -22,17 +22,19 @@ def validate_hgvs_string( relaxed_ordering: bool = False, ) -> Optional[str]: """ + Validates hgvs string. Parameters __________ value : Union[str, bytes] column : Optional[str] = None - splice_present : - targetseq : - relaxed_ordering : + splice_present : bool = False + targetseq : Optional[str] = None + relaxed_ordering : bool = False Returns _______ + Optional[str] Raises ______ diff --git a/mavecore/validation/variant_validators/variant.py b/mavecore/validation/variant_validators/variant.py index bf00e71..140fed4 100644 --- a/mavecore/validation/variant_validators/variant.py +++ b/mavecore/validation/variant_validators/variant.py @@ -9,8 +9,6 @@ def validate_columns_match(variant, scoreset) -> None: - # TODO - # document errors correctly, note key error """ Validate that a child matches parents defined columns to keep data in sync. @@ -26,6 +24,8 @@ def validate_columns_match(variant, scoreset) -> None: If variant score columns do not match scoreset score columns. 
ValidationError If variant count columns do not match scoreset count columns. + ValidationError + If try fails within try except block. """ try: if variant.score_columns != scoreset.score_columns: diff --git a/setup.py b/setup.py index 286779f..058998f 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ setuptools.setup( name="mavecore", - version="0.1.4", + version="0.1.7", author="MaveDB Developers", author_email="alan.rubin@wehi.edu.au", description=( diff --git a/tests/test_validation/test_dataset_validators.py b/tests/test_validation/test_dataset_validators.py index 6b4895a..9e53b31 100644 --- a/tests/test_validation/test_dataset_validators.py +++ b/tests/test_validation/test_dataset_validators.py @@ -15,25 +15,9 @@ read_header_from_io, validate_scoreset_json, validate_datasets_define_same_variants, - WordLimitValidator, ) -class TestWordLimitValidator(TestCase): - def test_validation_error_more_than_word_limit(self): - with self.assertRaises(ValueError): - n = 5 - WordLimitValidator(n)("Word " * (n + 1)) - - def test_passes_equal_to_word_limit(self): - n = 5 - WordLimitValidator(n)("Word " * n) - - def test_passes_less_than_word_limit(self): - n = 5 - WordLimitValidator(n)("Word " * (n - 1)) - - class TestHeaderFromIO(TestCase): """ Tests to ensure that a file in bytes or string mode can be read and then @@ -42,14 +26,18 @@ class TestHeaderFromIO(TestCase): """ def test_can_read_header_from_bytes(self): - file = BytesIO("{},score,count\n".format(constants.hgvs_nt_column).encode()) + file = BytesIO( + "{},score,count\n".format(constants.hgvs_nt_column).encode() + ) header = read_header_from_io(file) expected = [constants.hgvs_nt_column, "score", "count"] self.assertEqual(expected, header) def test_removes_quotes_from_header(self): file = BytesIO( - '"{}","score","count,nt"\n'.format(constants.hgvs_nt_column).encode() + '"{}","score","count,nt"\n'.format( + constants.hgvs_nt_column + ).encode() ) header = read_header_from_io(file) expected = 
[constants.hgvs_nt_column, "score", "count,nt"] @@ -62,16 +50,21 @@ def test_can_read_header_from_string(self): self.assertEqual(expected, header) def test_strips_whitespace(self): - file = StringIO(" {} , score , count\n".format(constants.hgvs_nt_column)) + file = StringIO( + " {} , score , count\n".format(constants.hgvs_nt_column) + ) header = read_header_from_io(file) expected = [constants.hgvs_nt_column, "score", "count"] self.assertEqual(expected, header) def test_returns_file_position_to_begining(self): - file = BytesIO("{},score,count\n".format(constants.hgvs_nt_column).encode()) + file = BytesIO( + "{},score,count\n".format(constants.hgvs_nt_column).encode() + ) read_header_from_io(file) self.assertEqual( - file.read(), "{},score,count\n".format(constants.hgvs_nt_column).encode() + file.read(), + "{},score,count\n".format(constants.hgvs_nt_column).encode(), ) @@ -84,7 +77,9 @@ class TestNoNullInColumnsValidator(TestCase): def test_raises_valuerror_when_null_values_in_column(self): for value in constants.null_values_list: file = BytesIO( - "{},score,{}\n".format(constants.hgvs_nt_column, value).encode() + "{},score,{}\n".format( + constants.hgvs_nt_column, value + ).encode() ) with self.assertRaises(ValueError): header = read_header_from_io(file) @@ -110,7 +105,9 @@ def test_raises_valuerror_when_less_than_2_values_in_column(self): validate_at_least_one_additional_column(header) def test_does_not_raise_valuerror_2_or_more_values_in_column(self): - file = BytesIO("{},score,count\n".format(constants.hgvs_nt_column).encode()) + file = BytesIO( + "{},score,count\n".format(constants.hgvs_nt_column).encode() + ) header = read_header_from_io(file) validate_at_least_one_additional_column(header) # Should pass @@ -132,18 +129,24 @@ def test_raises_valuerror_when_neither_hgvs_col_in_column(self): def test_hgvs_must_be_lowercase(self): file = BytesIO( - "{},score,count\n".format(constants.hgvs_nt_column.upper()).encode() + "{},score,count\n".format( + 
constants.hgvs_nt_column.upper() + ).encode() ) with self.assertRaises(ValueError): header = read_header_from_io(file) validate_has_hgvs_in_header(header) def test_does_not_raise_valuerror_when_either_hgvs_in_column(self): - file = BytesIO("{},score,count\n".format(constants.hgvs_nt_column).encode()) + file = BytesIO( + "{},score,count\n".format(constants.hgvs_nt_column).encode() + ) header = read_header_from_io(file) validate_has_hgvs_in_header(header) # Should pass - file = BytesIO("{},score,count\n".format(constants.hgvs_pro_column).encode()) + file = BytesIO( + "{},score,count\n".format(constants.hgvs_pro_column).encode() + ) header = read_header_from_io(file) validate_has_hgvs_in_header(header) # Should pass @@ -245,7 +248,9 @@ def test_raises_valuerror_no_numeric_column(self): def test_raises_valuerror_when_null_values_in_column(self): for value in constants.null_values_list: file = BytesIO( - "{},score,{}\n".format(constants.hgvs_nt_column, value).encode() + "{},score,{}\n".format( + constants.hgvs_nt_column, value + ).encode() ) with self.assertRaises(ValueError): validate_scoreset_count_data_input(file) @@ -270,7 +275,9 @@ def test_raises_valuerror_no_numeric_column(self): def test_raises_valuerror_when_null_values_in_column(self): for value in constants.null_values_list: file = BytesIO( - "{},score,{}\n".format(constants.hgvs_nt_column, value).encode() + "{},score,{}\n".format( + constants.hgvs_nt_column, value + ).encode() ) with self.assertRaises(ValueError): validate_scoreset_score_data_input(file) @@ -296,12 +303,18 @@ def test_valueerror_unexptected_columns(self): validate_scoreset_json(field) def test_valueerror_values_not_lists(self): - field = {constants.score_columns: ["score"], constants.count_columns: {}} + field = { + constants.score_columns: ["score"], + constants.count_columns: {}, + } with self.assertRaises(ValueError): validate_scoreset_json(field) def test_valueerror_list_values_not_strings(self): - field = {constants.score_columns: 
[b"score"], constants.count_columns: []} + field = { + constants.score_columns: [b"score"], + constants.count_columns: [], + } with self.assertRaises(ValueError): validate_scoreset_json(field) @@ -323,6 +336,9 @@ def test_valueerror_missing_dict_columns(self): def test_valueerror_missing_header_columns(self): # constants.score_columns columns missing 'score' - field = {constants.score_columns: ["hgvs"], constants.count_columns: []} + field = { + constants.score_columns: ["hgvs"], + constants.count_columns: [], + } with self.assertRaises(ValueError): validate_scoreset_json(field) diff --git a/tests/test_validation/test_genome_validators.py b/tests/test_validation/test_genome_validators.py deleted file mode 100644 index 6f36283..0000000 --- a/tests/test_validation/test_genome_validators.py +++ /dev/null @@ -1,137 +0,0 @@ -from unittest import TestCase - -from mavecore.validation.genome_validators import WildTypeSequence - -# from mavetools.validation.genome_factories import ( -# ReferenceMapFactory, -# ReferenceGenomeFactory, -# GenomicIntervalFactory, -# ) - - -from mavecore.validation.genome_validators import ( - validate_wildtype_sequence, - validate_gene_name, - validate_genome_short_name, - validate_organism_name, - sequence_is_protein, - sequence_is_dna, -) -from mavecore.validation.exceptions import ValidationError - -from mavecore.validation.constants import null_values_list - - -class TestWildTypeSequenceValidators(TestCase): - """ - Tests validation associated with :class:`WildTypeSequence`. 
Tests: - - - validate_wildtype_sequence - """ - - def test_ve_not_a_sequence_of_nucleotides_or_aa(self): - with self.assertRaises(ValidationError): - validate_wildtype_sequence("2823d") - - def test_ve_null(self): - for v in null_values_list: - with self.assertRaises(ValidationError): - validate_wildtype_sequence(v) - - def test_passes_lowercase_nucleotides(self): - validate_wildtype_sequence("atcg") - - def test_passes_uppercase_nucleotides(self): - validate_wildtype_sequence("ATCG") - - def test_passes_lowercase_aa(self): - validate_wildtype_sequence("MDLSALRVEE") - - def test_passes_uppercase_aa(self): - validate_wildtype_sequence("MDLSALRVEE".lower()) - - def test_pass_validate_dna_sequence(self): - validate_wildtype_sequence("ATCG", as_type=WildTypeSequence.SequenceType.DNA) - - def test_pass_validate_protein_sequence(self): - validate_wildtype_sequence( - "MDLS", as_type=WildTypeSequence.SequenceType.PROTEIN - ) - - def test_fails_validate_as_type_dna_but_seq_is_protein(self): - validate_wildtype_sequence( - "MDLS", as_type=WildTypeSequence.SequenceType.PROTEIN - ) - with self.assertRaises(ValidationError): - validate_wildtype_sequence( - "MDLS", as_type=WildTypeSequence.SequenceType.DNA - ) - - def test_fail_validate_as_type_protein_when_sequence_is_invalid(self): - with self.assertRaises(ValidationError): - validate_wildtype_sequence( - "ABC", as_type=WildTypeSequence.SequenceType.PROTEIN - ) - - -class TestIsProteinSequence(TestCase): - def test_false_null(self): - for v in null_values_list: - self.assertFalse(sequence_is_protein(v)) - - def test_false_dna_sequence(self): - # Favor dna sequences when only ATCG - self.assertFalse(sequence_is_protein("ATCG")) - self.assertFalse(sequence_is_protein("atc")) - - def test_true_aa_sequence(self): - self.assertTrue(sequence_is_protein("MDLSALRVEEATC")) - self.assertTrue(sequence_is_protein("MDLSALRVEEATC".lower())) - - -class TestIsDNASequence(TestCase): - def test_false_null(self): - for v in null_values_list: - 
self.assertFalse(sequence_is_protein(v)) - - def test_true_dna_sequence(self): - self.assertTrue(sequence_is_dna("ATCG")) - self.assertTrue(sequence_is_dna("atc")) - - def test_false_aa_sequence(self): - self.assertFalse(sequence_is_dna("MDLSALRVEEATC")) - self.assertFalse(sequence_is_dna("MDLSALRVEEATC".lower())) - - -class TestReferenceGenomeValidators(TestCase): - """ - Tests validation associated with :class:`ReferenceGenome`: - - - validate_reference_genome_has_one_external_identifier - - validate_organism_name - - validate_genome_short_name - """ - - def test_ve_null_organism_name(self): - for v in null_values_list: - with self.assertRaises(ValidationError): - validate_organism_name(v) - - def test_ve_null_genome_short_name(self): - for v in null_values_list: - with self.assertRaises(ValidationError): - validate_genome_short_name(v) - - -class TestTargetGeneValidators(TestCase): - """ - Tests validation asscociated with :class:`TargetGene`: - - - validate_gene_name - - validate_target_has_one_primary_reference_map - """ - - def test_ve_null_gene_name(self): - for v in null_values_list: - with self.assertRaises(ValidationError): - validate_gene_name(v) diff --git a/tests/test_validation/test_variant_validators/test_validators.py b/tests/test_validation/test_variant_validators/test_validators.py index 495ca3e..4e0a4af 100644 --- a/tests/test_validation/test_variant_validators/test_validators.py +++ b/tests/test_validation/test_variant_validators/test_validators.py @@ -12,7 +12,6 @@ # from ..factories import generate_hgvs, VariantFactory from mavecore.validation.variant_validators import ( - MaveDataset, validate_variant_json, validate_hgvs_string, ) @@ -122,616 +121,12 @@ def test_validation_error_contains_unexpected_keys(self): validate_variant_json(data) def test_validation_error_values_not_dict(self): - data = {constants.variant_score_data: {}, constants.variant_count_data: {}} + data = { + constants.variant_score_data: {}, + constants.variant_count_data: 
{}, + } for key in data.keys(): data[key] = [] with self.assertRaises(ValidationError): validate_variant_json(data) data[key] = {} - - -class TestMaveDataset(TestCase): - """ - Tests the validator :func:`validate_variant_rows` to check if the correct - errors are thrown when invalid rows are encountered in a - scores/counts/meta data input file. Checks for: - - Invalid HGVS string in a row - - Row HGVS is defined in more than one row - - Row values are not int/float for a count/score file - - Tests also check to see if the correct header and hgvs data information - is parsed and returned. - """ - - SCORE_COL = constants.required_score_column - HGVS_NT_COL = constants.hgvs_nt_column - HGVS_SPLICE_COL = constants.hgvs_splice_column - HGVS_PRO_COL = constants.hgvs_pro_column - - @staticmethod - def mock_return_value(data, index=None): - df = pd.read_csv(StringIO(data), sep=",", na_values=["None", None]) - if index: - df.index = pd.Index(df[index]) - return df - - def test_invalid_row_hgvs_is_not_a_string(self): - data = "{},{}\n1.0,1.0".format(self.HGVS_NT_COL, self.SCORE_COL) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_invalid_missing_hgvs_columns(self): - data = "{},{}\n{},1.0".format("not_hgvs", self.SCORE_COL, generate_hgvs()) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_replaces_null_with_none_in_secondary_hgvs_column(self): - hgvs_nt = generate_hgvs(prefix="c") - for c in constants.null_values_list: - with self.subTest(msg=f"'{c}'"): - data = "{},{},{}\n{},{},1.0 ".format( - self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL, hgvs_nt, c - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - 
self.assertListEqual( - list(dataset.data(serializable=True)[self.HGVS_PRO_COL]), [None] - ) - - def test_replaces_null_with_none_in_numeric_columns(self): - hgvs_nt = generate_hgvs(prefix="c") - for c in constants.null_values_list: - with self.subTest(msg=f"'{c}'"): - data = "{},{}\n{},{}".format( - self.HGVS_NT_COL, self.SCORE_COL, hgvs_nt, c - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - self.assertListEqual( - list(dataset.data(serializable=True)[self.SCORE_COL]), [None] - ) - - def test_invalid_null_values_in_header(self): - for value in constants.null_values_list: - with self.subTest(msg=f"'{value}'"): - data = "{},{},{}\n{},1.0,1.0".format( - self.HGVS_NT_COL, self.SCORE_COL, value, generate_hgvs() - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_invalid_no_additional_columns_outside_hgvs_ones(self): - data = "{},{},{}\n{},{},{}".format( - self.HGVS_NT_COL, - self.HGVS_SPLICE_COL, - self.HGVS_PRO_COL, - generate_hgvs(prefix="g"), - generate_hgvs(prefix="c"), - generate_hgvs(prefix="p"), - ) - - dataset = MaveDataset.for_counts(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_scores_missing_scores_column(self): - data = "{},{}\n{},{}".format( - self.HGVS_NT_COL, "scores_rna", generate_hgvs(prefix="g"), 1.0 - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_invalid_missing_either_required_hgvs_column(self): - data = "{},{}\n{},{}".format( - self.HGVS_SPLICE_COL, self.SCORE_COL, generate_hgvs(prefix="c"), 1.0 - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - 
self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_empty_no_variants_parsed(self): - data = "{},{}\n".format(self.HGVS_NT_COL, self.SCORE_COL) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_empty) - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_error_non_numeric_values_in_score_column(self): - data = "{},{}\n{},{}".format( - self.HGVS_NT_COL, - self.SCORE_COL, - generate_hgvs(prefix="c"), - "I am not a number", - ) - - with self.assertRaises(ValueError): - MaveDataset.for_scores(StringIO(data)) - - def test_invalid_same_hgvs_nt_defined_in_two_rows(self): - hgvs = generate_hgvs(prefix="c") - data = "{},{}\n{},1.0\n{},1.0".format( - self.HGVS_NT_COL, self.SCORE_COL, hgvs, hgvs - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_invalid_same_variant_defined_in_two_rows_in_hgvs_pro(self): - hgvs = generate_hgvs(prefix="p") - data = "{},{}\n{},1.0\n{},1.0".format(self.HGVS_PRO_COL, "count", hgvs, hgvs) - - dataset = MaveDataset.for_counts(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_data_method_converts_null_values_to_None(self): - hgvs = generate_hgvs() - for value in constants.null_values_list: - with self.subTest(msg=value): - data = "{},{}\n{},{}".format( - self.HGVS_NT_COL, self.SCORE_COL, hgvs, value - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - - df = dataset.data(serializable=True) - self.assertIsNotNone(df[self.HGVS_NT_COL].values[0]) - self.assertIsNone(df[self.SCORE_COL].values[0]) - - def test_sorts_header(self): - hgvs_nt = 
generate_hgvs(prefix="g") - hgvs_pro = generate_hgvs(prefix="p") - hgvs_splice = generate_hgvs(prefix="c") - data = "{},{},{},{},{}\n{},{},{},{},{}".format( - self.HGVS_PRO_COL, - self.HGVS_NT_COL, - "colA", - self.SCORE_COL, - self.HGVS_SPLICE_COL, - hgvs_pro, - hgvs_nt, - "hello", - 1.0, - hgvs_splice, - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - self.assertListEqual( - dataset.columns, - [ - self.HGVS_NT_COL, - self.HGVS_SPLICE_COL, - self.HGVS_PRO_COL, - self.SCORE_COL, - "colA", - ], - ) - - def test_does_not_allow_wt_and_sy(self): - wt = "_wt" - sy = "_sy" - data = "{},{},{},{}\n{},{},{},1.0".format( - self.HGVS_NT_COL, - self.HGVS_SPLICE_COL, - self.HGVS_PRO_COL, - self.SCORE_COL, - wt, - wt, - sy, - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 3) - print(dataset.errors) - - def test_parses_numeric_column_values_into_float(self): - hgvs = generate_hgvs(prefix="c") - data = "{},{}\n{},1.0".format(self.HGVS_NT_COL, self.SCORE_COL, hgvs) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - value = dataset.data()[self.SCORE_COL].values[0] - self.assertIsInstance(value, float) - - def test_does_not_split_double_quoted_variants(self): - hgvs = "c.[123A>G;124A>G]" - data = '{},{}\n"{}",1.0'.format(self.HGVS_NT_COL, self.SCORE_COL, hgvs) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - self.assertIn(hgvs, dataset.data()[self.HGVS_NT_COL]) - - # def test_invalid_non_double_quoted_multi_variant_row(self): - # hgvs = "{},{}".format(generate_hgvs(), generate_hgvs()) - # data = "{},{}\n'{}',1.0".format( - # constants.hgvs_nt_column, required_score_column, hgvs - # ) - # with self.assertRaises(ValidationError): - # _ = validate_variant_rows(BytesIO(data.encode())) - - 
def test_primary_column_is_pro_when_nt_is_not_defined(self): - hgvs_pro = generate_hgvs(prefix="p") - data = "{},{}\n{},1.0".format(self.HGVS_PRO_COL, self.SCORE_COL, hgvs_pro) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - self.assertEqual(dataset.index_column, self.HGVS_PRO_COL) - - def test_primary_column_is_nt_by_default(self): - hgvs_nt = generate_hgvs(prefix="c") - hgvs_pro = generate_hgvs(prefix="p") - data = "{},{},{}\n{},{},1.0".format( - self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL, hgvs_nt, hgvs_pro - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - self.assertEqual(dataset.index_column, self.HGVS_NT_COL) - - def test_error_missing_value_in_nt_column_when_nt_is_primary(self): - for v in constants.null_values_list: - with self.subTest(msg=v): - data = ( - "{},{},{}\n" - "{},{},1.0\n" - "{},{},1.0".format( - self.HGVS_NT_COL, - self.HGVS_PRO_COL, - self.SCORE_COL, - generate_hgvs(prefix="c"), - generate_hgvs(prefix="p"), - v, - generate_hgvs(prefix="p"), - ) - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_error_missing_value_in_pro_column_when_pro_is_primary(self): - for v in constants.null_values_list: - with self.subTest(msg=v): - data = "{},{}\n{},1.0\n{},1.0".format( - self.HGVS_PRO_COL, self.SCORE_COL, generate_hgvs(prefix="p"), v - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_df_indexed_by_primary_column(self): - data = "{},{},{}\n{},{},1.0".format( - self.HGVS_NT_COL, - self.HGVS_PRO_COL, - self.SCORE_COL, - generate_hgvs(prefix="c"), - generate_hgvs(prefix="p"), - ) - - dataset = 
MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - assert_index_equal(dataset.data().index, dataset.index) - - def test_invalid_duplicates_in_index(self): - hgvs = generate_hgvs(prefix="c") - data = "{},{},{}\n{},{},1.0\n{},{},2.0".format( - self.HGVS_NT_COL, - self.HGVS_PRO_COL, - self.SCORE_COL, - hgvs, - generate_hgvs(prefix="p"), - hgvs, - generate_hgvs(prefix="p"), - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_invalid_hgvs_in_column(self): - tests = [ - (self.HGVS_PRO_COL, generate_hgvs(prefix="c")), - (self.HGVS_SPLICE_COL, generate_hgvs(prefix="g")), - (self.HGVS_NT_COL, generate_hgvs(prefix="p")), - ] - for (column, variant) in tests: - with self.subTest(msg=f"{column}: {variant}"): - if column == self.HGVS_SPLICE_COL: - data = "{},{},{}\n{},{},1.0".format( - self.HGVS_NT_COL, - column, - self.SCORE_COL, - generate_hgvs(prefix="g"), - variant, - ) - else: - data = "{},{}\n{},1.0".format(column, self.SCORE_COL, variant) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_invalid_genomic_and_transcript_mixed_in_nt_column(self): - data = "{},{}\n{},1.0\n{},2.0".format( - self.HGVS_NT_COL, - self.SCORE_COL, - generate_hgvs(prefix="g"), - generate_hgvs(prefix="c"), - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 2) - print(dataset.errors) - - def test_invalid_nt_not_genomic_when_splice_present(self): - data = "{},{},{}\n{},{},1.0".format( - self.HGVS_NT_COL, - self.HGVS_SPLICE_COL, - self.SCORE_COL, - generate_hgvs(prefix="c"), - generate_hgvs(prefix="c"), - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - 
dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_invalid_splice_defined_when_nt_is_not(self): - data = "{},{},{}\n,{},1.0".format( - self.HGVS_NT_COL, - self.HGVS_SPLICE_COL, - self.SCORE_COL, - generate_hgvs(prefix="c"), - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_invalid_splice_not_defined_when_nt_is_genomic(self): - data = "{},{}\n{},1.0".format( - self.HGVS_NT_COL, self.SCORE_COL, generate_hgvs(prefix="g") - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 2) - print(dataset.errors) - - def test_invalid_zero_is_not_parsed_as_none(self): - hgvs = generate_hgvs(prefix="c") - data = "{},{}\n{},0.0".format(self.HGVS_NT_COL, self.SCORE_COL, hgvs) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - df = dataset.data() - self.assertEqual(df[self.SCORE_COL].values[0], 0) - - def test_invalid_close_to_zero_is_not_parsed_as_none(self): - hgvs = generate_hgvs(prefix="c") - data = "{},{}\n{},5.6e-15".format(self.HGVS_NT_COL, self.SCORE_COL, hgvs) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - df = dataset.data() - self.assertEqual(df[self.SCORE_COL].values[0], 5.6e-15) - - def test_defines_same_variants(self): - tests = [ - ( - "{},{}\nc.1A>G,0.0".format(self.HGVS_NT_COL, self.SCORE_COL), - "{},count\nc.1A>G,0.0".format(self.HGVS_NT_COL), - True, - ), - ( - "{},{}\nc.1A>G,0.0".format(self.HGVS_NT_COL, self.SCORE_COL), - "{},count\nc.2A>G,0.0".format(self.HGVS_NT_COL), - False, - ), - ( - "{},{},{}\nc.1A>G,p.Ile1Val,0.0".format( - self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL - ), - 
"{},{},count\nc.1A>G,p.Ile1Val,0.0".format( - self.HGVS_NT_COL, self.HGVS_PRO_COL - ), - True, - ), - ( - "{},{},{}\nc.1A>G,p.Ile1Val,0.0".format( - self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL - ), - "{},{},count\nc.1A>G,p.Ile1Phe,0.0".format( - self.HGVS_NT_COL, self.HGVS_PRO_COL - ), - False, - ), - # Check returns None if either dataset invalid - ( - "wrong_columns,{}\nc.1A>G,0.0".format(self.SCORE_COL), - "{},count\nc.1A>G,0.0".format(self.HGVS_NT_COL), - None, - ), - ( - "{},{}\nc.1A>G,0.0".format(self.HGVS_NT_COL, self.SCORE_COL), - "wrong_column,count\nc.1A>G,0.0".format(), - None, - ), - ] - - for (scores, counts, expected) in tests: - with self.subTest(msg=(scores, counts, expected)): - scores_dataset = MaveDataset.for_scores(StringIO(scores)) - scores_dataset.validate() - - counts_dataset = MaveDataset.for_counts(StringIO(counts)) - counts_dataset.validate() - - self.assertEqual(scores_dataset.match_other(counts_dataset), expected) - - def test_to_dict(self): - hgvs_1 = generate_hgvs(prefix="c") - hgvs_2 = generate_hgvs(prefix="c") - data = "{},{},{},{}\n{},,,\n{},,,1.0".format( - self.HGVS_NT_COL, - self.HGVS_PRO_COL, - self.HGVS_SPLICE_COL, - self.SCORE_COL, - hgvs_1, - hgvs_2, - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - self.assertDictEqual( - dataset.to_dict(), - { - hgvs_1: { - self.HGVS_NT_COL: hgvs_1, - self.HGVS_SPLICE_COL: None, - self.HGVS_PRO_COL: None, - self.SCORE_COL: None, - }, - hgvs_2: { - self.HGVS_NT_COL: hgvs_2, - self.HGVS_SPLICE_COL: None, - self.HGVS_PRO_COL: None, - self.SCORE_COL: 1.0, - }, - }, - ) - - def test_valid_targetseq_validation_fails(self): - data = "{},{},{}\nc.1A>G,p.Ile1Val,0.5".format( - self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate(targetseq="ATC") - - self.assertTrue(dataset.is_valid) - - def test_invalid_targetseq_validation_fails(self): - data = 
"{},{},{}\nc.1A>G,p.Val1Phe,0.5".format( - self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate(targetseq="ATC") - - self.assertFalse(dataset.is_valid) - print(dataset.errors) - - self.assertEqual(dataset.n_errors, 1) - self.assertIn("p.Val1Phe", dataset.errors[0]) - - def test_invalid_target_sequence_not_a_multiple_of_3(self): - data = "{},{},{}\nc.1A>G,p.Ile1Val,0.5".format( - self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate(targetseq="ATCG") - - self.assertFalse(dataset.is_valid) - print(dataset.errors) - - self.assertEqual(dataset.n_errors, 1) - self.assertIn("multiple of 3", dataset.errors[0]) - - @unittest.expectedFailure - def test_invalid_relaxed_ordering_check_fails(self): - self.fail("Test is pending")