diff --git a/openvariant/annotation/annotation.py b/openvariant/annotation/annotation.py index 3b9359e..b35dd80 100644 --- a/openvariant/annotation/annotation.py +++ b/openvariant/annotation/annotation.py @@ -5,14 +5,14 @@ """ import logging import re +import codecs from typing import List from yaml import safe_load, YAMLError from openvariant.utils.utils import import_class_from_module from openvariant.annotation.config_annotation import (AnnotationGeneralKeys, AnnotationKeys, AnnotationTypes, - ExcludesKeys, DEFAULT_FORMAT, DEFAULT_DELIMITER, - AnnotationFormat, AnnotationDelimiter) + ExcludesKeys, DEFAULT_FORMAT, AnnotationFormat) def _check_general_keys(annot: dict) -> None: @@ -35,9 +35,7 @@ def _check_general_keys(annot: dict) -> None: raise KeyError(f"'{AnnotationGeneralKeys.FORMAT.value}' key is not a string.") # Delimiter key - if AnnotationGeneralKeys.DELIMITER.value in annot and \ - (not isinstance(annot[AnnotationGeneralKeys.DELIMITER.value], str) or - annot[AnnotationGeneralKeys.DELIMITER.value].upper() not in [e.name for e in AnnotationDelimiter]): + if AnnotationGeneralKeys.DELIMITER.value in annot and not isinstance(annot[AnnotationGeneralKeys.DELIMITER.value], str): raise KeyError(f"'{AnnotationGeneralKeys.DELIMITER.value}' key is not valid or is not a string.") # Columns key @@ -151,7 +149,8 @@ def __init__(self, annotation_path: str) -> None: patterns = raw_annotation[AnnotationGeneralKeys.PATTERN.value] self._patterns = patterns if isinstance(patterns, List) else [patterns] self._recursive = raw_annotation.get(AnnotationGeneralKeys.RECURSIVE.value, True) - self._delimiter = raw_annotation.get(AnnotationGeneralKeys.DELIMITER.value, DEFAULT_DELIMITER).upper() + delimiter = raw_annotation.get(AnnotationGeneralKeys.DELIMITER.value, None) + self._delimiter = None if delimiter is None else codecs.decode(delimiter, 'unicode_escape') self._format = raw_annotation.get(AnnotationGeneralKeys.FORMAT.value, DEFAULT_FORMAT).replace('.', '') self._excludes: dict = {} diff --git a/openvariant/annotation/config_annotation.py b/openvariant/annotation/config_annotation.py index 9cbc52f..beb748c 100644 --- a/openvariant/annotation/config_annotation.py +++ b/openvariant/annotation/config_annotation.py @@ -9,7 +9,6 @@ DEFAULT_FORMAT = 'TSV' DEFAULT_COLUMNS = [] DEFAULT_RECURSIVE = False -DEFAULT_DELIMITER = 'T' class AnnotationGeneralKeys(Enum): @@ -49,11 +48,6 @@ class AnnotationTypes(Enum): MAPPING = 'mapping' -class AnnotationDelimiter(Enum): - T = "\t" - C = "," - - class AnnotationFormat(Enum): TSV = "\t" CSV = "," diff --git a/openvariant/variant/variant.py b/openvariant/variant/variant.py index e31e270..ea790ed 100644 --- a/openvariant/variant/variant.py +++ b/openvariant/variant/variant.py @@ -18,14 +18,13 @@ from openvariant.annotation.annotation import Annotation from openvariant.annotation.builder import MappingBuilder -from openvariant.annotation.config_annotation import AnnotationFormat, AnnotationTypes, AnnotationDelimiter +from openvariant.annotation.config_annotation import AnnotationFormat, AnnotationTypes from openvariant.utils.utils import check_extension, import_class_from_module from openvariant.variant.where import skip, parse_where def _open_file(file_path: str, mode='r+b'): """Open raw files or compressed files""" - if file_path.endswith('xz'): open_method = lzma.open file = open_method(file_path, mode) @@ -37,6 +36,17 @@ def _open_file(file_path: str, mode='r+b'): return mm, file +def _detect_delimiter(line: str): + """Detects the dominant delimiter in a line""" + sniffer = csv.Sniffer() + try: + dialect = sniffer.sniff(line, delimiters='\t,;') + return dialect.delimiter + except csv.Error as e: + if "Could not determine delimiter" in str(e): + return '\t' + else: + raise e def _base_parser(mm_obj: mmap, file_path: str, delimiter: str, skip_files: bool) -> Generator[int, str, None]: """Cleaning comments and irrelevant data""" @@ -49,15 +59,19 @@ def _base_parser(mm_obj: mmap, file_path: str, delimiter: str, skip_files: bool) try: for l_num, line in enumerate(iter(mm_obj.readline, b'')): line = line.decode('utf-8') - row_line = line.split(AnnotationDelimiter[delimiter].value) - row_line = list(map(lambda w: w.rstrip("\r\n"), row_line)) - if len(row_line) == 0: + # Skip comments + if (line.startswith('#') or line.startswith('##') or line.startswith('browser') or + line.startswith('track')) and not line.startswith('#CHROM'): continue - # Skip comments - if (row_line[0].startswith('#') or row_line[0].startswith('##') or row_line[0].startswith('browser') or - row_line[0].startswith('track')) and not row_line[0].startswith('#CHROM'): + if delimiter is None: + delimiter = _detect_delimiter(line) + + row_line = re.split(delimiter, line) + row_line = list(map(lambda w: w.rstrip("\r\n"), row_line)) + + if len(row_line) == 0: continue yield l_num, row_line diff --git a/tests/data/annotation/annotation.yaml b/tests/data/annotation/annotation.yaml index b935f91..35fd40e 100644 --- a/tests/data/annotation/annotation.yaml +++ b/tests/data/annotation/annotation.yaml @@ -3,7 +3,7 @@ pattern: - '*.vcf.gz' recursive: false format: 'CSV' -delimiter: 'C' +delimiter: \t columns: - 'PLATFORM' diff --git a/tests/data/annotation/invalid_delimiter.yaml b/tests/data/annotation/invalid_delimiter.yaml deleted file mode 100644 index ac0c331..0000000 --- a/tests/data/annotation/invalid_delimiter.yaml +++ /dev/null @@ -1,36 +0,0 @@ -pattern: - - '*.maf' - - '*.vcf.gz' -recursive: false -format: 'CSV' -delimiter: 'INVALID' - -annotation: - - type: 'static' - field: 'PLATFORM' - value: 'WSG' - - - type: 'internal' - field: 'POSITION' - fieldSource: - - 'Position' - - 'Start_Position' - - - type: 'filename' - field: 'DATASET' - function: 'lambda x: "{}".format(x.lower()[:-4])' - - - type: 'dirname' - field: 'PROJECT' - function: 'lambda x: "{}".format(x.lower())' - - - type: 'plugin' - plugin: 'alteration_type' - field: 'ALT_TYPE' - -exclude: - - field: 'MUTATION_REF' - value: 1234 - - - field: 'DATASET' - value: 'ucs' diff --git a/tests/test_annotation/test_annotation.py b/tests/test_annotation/test_annotation.py index 6690a7a..c5fc93c 100644 --- a/tests/test_annotation/test_annotation.py +++ b/tests/test_annotation/test_annotation.py @@ -1,8 +1,9 @@ import unittest from os import getcwd +import re from openvariant.annotation.annotation import Annotation -from openvariant.annotation.config_annotation import DEFAULT_FORMAT, DEFAULT_DELIMITER +from openvariant.annotation.config_annotation import DEFAULT_FORMAT class TestAnnotation(unittest.TestCase): @@ -51,15 +52,8 @@ def test_annotation_invalid_format(self): def test_annotation_delimiter(self): annotation = Annotation(f'{getcwd()}/tests/data/annotation/annotation.yaml') - self.assertEqual(annotation.delimiter, 'C') - - def test_annotation_no_exist_delimiter(self): - annotation = Annotation(f'{getcwd()}/tests/data/annotation/no_exist_delimiter.yaml') - self.assertEqual(annotation.delimiter, DEFAULT_DELIMITER) - - def test_annotation_invalid_delimiter(self): - with self.assertRaises(KeyError): - Annotation(f'{getcwd()}/tests/data/annotation/invalid_delimiter.yaml') + print(annotation.delimiter, '\t') + self.assertEqual(annotation.delimiter, '\t') def test_annotation_columns(self): res_expect = {'PLATFORM', 'DATASET'}