Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 5 additions & 6 deletions openvariant/annotation/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,14 @@
"""
import logging
import re
import codecs

from typing import List
from yaml import safe_load, YAMLError

from openvariant.utils.utils import import_class_from_module
from openvariant.annotation.config_annotation import (AnnotationGeneralKeys, AnnotationKeys, AnnotationTypes,
ExcludesKeys, DEFAULT_FORMAT, DEFAULT_DELIMITER,
AnnotationFormat, AnnotationDelimiter)
ExcludesKeys, DEFAULT_FORMAT, AnnotationFormat)


def _check_general_keys(annot: dict) -> None:
Expand All @@ -35,9 +35,7 @@ def _check_general_keys(annot: dict) -> None:
raise KeyError(f"'{AnnotationGeneralKeys.FORMAT.value}' key is not a string.")

# Delimiter key
if AnnotationGeneralKeys.DELIMITER.value in annot and \
(not isinstance(annot[AnnotationGeneralKeys.DELIMITER.value], str) or
annot[AnnotationGeneralKeys.DELIMITER.value].upper() not in [e.name for e in AnnotationDelimiter]):
if AnnotationGeneralKeys.DELIMITER.value in annot and not isinstance(annot[AnnotationGeneralKeys.DELIMITER.value], str):
raise KeyError(f"'{AnnotationGeneralKeys.DELIMITER.value}' key is not valid or is not a string.")

# Columns key
Expand Down Expand Up @@ -151,7 +149,8 @@ def __init__(self, annotation_path: str) -> None:
patterns = raw_annotation[AnnotationGeneralKeys.PATTERN.value]
self._patterns = patterns if isinstance(patterns, List) else [patterns]
self._recursive = raw_annotation.get(AnnotationGeneralKeys.RECURSIVE.value, True)
self._delimiter = raw_annotation.get(AnnotationGeneralKeys.DELIMITER.value, DEFAULT_DELIMITER).upper()
delimiter = raw_annotation.get(AnnotationGeneralKeys.DELIMITER.value, None)
self._delimiter = None if delimiter is None else codecs.decode(delimiter, 'unicode_escape')
self._format = raw_annotation.get(AnnotationGeneralKeys.FORMAT.value, DEFAULT_FORMAT).replace('.', '')

self._excludes: dict = {}
Expand Down
6 changes: 0 additions & 6 deletions openvariant/annotation/config_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
DEFAULT_FORMAT = 'TSV'
DEFAULT_COLUMNS = []
DEFAULT_RECURSIVE = False
DEFAULT_DELIMITER = 'T'


class AnnotationGeneralKeys(Enum):
Expand Down Expand Up @@ -49,11 +48,6 @@ class AnnotationTypes(Enum):
MAPPING = 'mapping'


class AnnotationDelimiter(Enum):
T = "\t"
C = ","


class AnnotationFormat(Enum):
TSV = "\t"
CSV = ","
30 changes: 22 additions & 8 deletions openvariant/variant/variant.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,13 @@

from openvariant.annotation.annotation import Annotation
from openvariant.annotation.builder import MappingBuilder
from openvariant.annotation.config_annotation import AnnotationFormat, AnnotationTypes, AnnotationDelimiter
from openvariant.annotation.config_annotation import AnnotationFormat, AnnotationTypes
from openvariant.utils.utils import check_extension, import_class_from_module
from openvariant.variant.where import skip, parse_where


def _open_file(file_path: str, mode='r+b'):
"""Open raw files or compressed files"""

if file_path.endswith('xz'):
open_method = lzma.open
file = open_method(file_path, mode)
Expand All @@ -37,6 +36,17 @@ def _open_file(file_path: str, mode='r+b'):

return mm, file

def _detect_delimiter(line: str):
"""Detects the dominant delimiter in a line"""
sniffer = csv.Sniffer()
try:
dialect = sniffer.sniff(line, delimiters='\t,;')
return dialect.delimiter
except csv.Error as e:
if "Could not determine delimiter" in str(e):
return '\t'
else:
raise e

def _base_parser(mm_obj: mmap, file_path: str, delimiter: str, skip_files: bool) -> Generator[int, str, None]:
"""Cleaning comments and irrelevant data"""
Expand All @@ -49,15 +59,19 @@ def _base_parser(mm_obj: mmap, file_path: str, delimiter: str, skip_files: bool)
try:
for l_num, line in enumerate(iter(mm_obj.readline, b'')):
line = line.decode('utf-8')
row_line = line.split(AnnotationDelimiter[delimiter].value)
row_line = list(map(lambda w: w.rstrip("\r\n"), row_line))

if len(row_line) == 0:
# Skip comments
if (line.startswith('#') or line.startswith('##') or line.startswith('browser') or
line.startswith('track')) and not line.startswith('#CHROM'):
continue

# Skip comments
if (row_line[0].startswith('#') or row_line[0].startswith('##') or row_line[0].startswith('browser') or
row_line[0].startswith('track')) and not row_line[0].startswith('#CHROM'):
if delimiter is None:
delimiter = _detect_delimiter(line)

row_line = re.split(delimiter, line)
row_line = list(map(lambda w: w.rstrip("\r\n"), row_line))

if len(row_line) == 0:
continue

yield l_num, row_line
Expand Down
2 changes: 1 addition & 1 deletion tests/data/annotation/annotation.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ pattern:
- '*.vcf.gz'
recursive: false
format: 'CSV'
delimiter: 'C'
delimiter: \t

columns:
- 'PLATFORM'
Expand Down
36 changes: 0 additions & 36 deletions tests/data/annotation/invalid_delimiter.yaml

This file was deleted.

14 changes: 4 additions & 10 deletions tests/test_annotation/test_annotation.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import unittest
from os import getcwd
import re

from openvariant.annotation.annotation import Annotation
from openvariant.annotation.config_annotation import DEFAULT_FORMAT, DEFAULT_DELIMITER
from openvariant.annotation.config_annotation import DEFAULT_FORMAT


class TestAnnotation(unittest.TestCase):
Expand Down Expand Up @@ -51,15 +52,8 @@ def test_annotation_invalid_format(self):

def test_annotation_delimiter(self):
annotation = Annotation(f'{getcwd()}/tests/data/annotation/annotation.yaml')
self.assertEqual(annotation.delimiter, 'C')

def test_annotation_no_exist_delimiter(self):
annotation = Annotation(f'{getcwd()}/tests/data/annotation/no_exist_delimiter.yaml')
self.assertEqual(annotation.delimiter, DEFAULT_DELIMITER)

def test_annotation_invalid_delimiter(self):
with self.assertRaises(KeyError):
Annotation(f'{getcwd()}/tests/data/annotation/invalid_delimiter.yaml')
print(annotation.delimiter, '\t')
self.assertEqual(annotation.delimiter, '\t')

def test_annotation_columns(self):
res_expect = {'PLATFORM', 'DATASET'}
Expand Down