Skip to content

Commit 8e68a46

Browse files
authored
Merge pull request #52 from bbglab/47-openvariant-bug-windows-input-files-not-parsed-properly
OpenVariant parse any delimiter
2 parents 9631f08 + ca427ed commit 8e68a46

File tree

6 files changed: 32 additions and 67 deletions.

openvariant/annotation/annotation.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,14 @@
55
"""
66
import logging
77
import re
8+
import codecs
89

910
from typing import List
1011
from yaml import safe_load, YAMLError
1112

1213
from openvariant.utils.utils import import_class_from_module
1314
from openvariant.annotation.config_annotation import (AnnotationGeneralKeys, AnnotationKeys, AnnotationTypes,
14-
ExcludesKeys, DEFAULT_FORMAT, DEFAULT_DELIMITER,
15-
AnnotationFormat, AnnotationDelimiter)
15+
ExcludesKeys, DEFAULT_FORMAT, AnnotationFormat)
1616

1717

1818
def _check_general_keys(annot: dict) -> None:
@@ -35,9 +35,7 @@ def _check_general_keys(annot: dict) -> None:
3535
raise KeyError(f"'{AnnotationGeneralKeys.FORMAT.value}' key is not a string.")
3636

3737
# Delimiter key
38-
if AnnotationGeneralKeys.DELIMITER.value in annot and \
39-
(not isinstance(annot[AnnotationGeneralKeys.DELIMITER.value], str) or
40-
annot[AnnotationGeneralKeys.DELIMITER.value].upper() not in [e.name for e in AnnotationDelimiter]):
38+
if AnnotationGeneralKeys.DELIMITER.value in annot and not isinstance(annot[AnnotationGeneralKeys.DELIMITER.value], str):
4139
raise KeyError(f"'{AnnotationGeneralKeys.DELIMITER.value}' key is not valid or is not a string.")
4240

4341
# Columns key
@@ -151,7 +149,8 @@ def __init__(self, annotation_path: str) -> None:
151149
patterns = raw_annotation[AnnotationGeneralKeys.PATTERN.value]
152150
self._patterns = patterns if isinstance(patterns, List) else [patterns]
153151
self._recursive = raw_annotation.get(AnnotationGeneralKeys.RECURSIVE.value, True)
154-
self._delimiter = raw_annotation.get(AnnotationGeneralKeys.DELIMITER.value, DEFAULT_DELIMITER).upper()
152+
delimiter = raw_annotation.get(AnnotationGeneralKeys.DELIMITER.value, None)
153+
self._delimiter = None if delimiter is None else codecs.decode(delimiter, 'unicode_escape')
155154
self._format = raw_annotation.get(AnnotationGeneralKeys.FORMAT.value, DEFAULT_FORMAT).replace('.', '')
156155

157156
self._excludes: dict = {}

openvariant/annotation/config_annotation.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
DEFAULT_FORMAT = 'TSV'
1010
DEFAULT_COLUMNS = []
1111
DEFAULT_RECURSIVE = False
12-
DEFAULT_DELIMITER = 'T'
1312

1413

1514
class AnnotationGeneralKeys(Enum):
@@ -49,11 +48,6 @@ class AnnotationTypes(Enum):
4948
MAPPING = 'mapping'
5049

5150

52-
class AnnotationDelimiter(Enum):
53-
T = "\t"
54-
C = ","
55-
56-
5751
class AnnotationFormat(Enum):
5852
TSV = "\t"
5953
CSV = ","

openvariant/variant/variant.py

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,13 @@
1818

1919
from openvariant.annotation.annotation import Annotation
2020
from openvariant.annotation.builder import MappingBuilder
21-
from openvariant.annotation.config_annotation import AnnotationFormat, AnnotationTypes, AnnotationDelimiter
21+
from openvariant.annotation.config_annotation import AnnotationFormat, AnnotationTypes
2222
from openvariant.utils.utils import check_extension, import_class_from_module
2323
from openvariant.variant.where import skip, parse_where
2424

2525

2626
def _open_file(file_path: str, mode='r+b'):
2727
"""Open raw files or compressed files"""
28-
2928
if file_path.endswith('xz'):
3029
open_method = lzma.open
3130
file = open_method(file_path, mode)
@@ -37,6 +36,17 @@ def _open_file(file_path: str, mode='r+b'):
3736

3837
return mm, file
3938

39+
def _detect_delimiter(line: str):
40+
"""Detects the dominant delimiter in a line"""
41+
sniffer = csv.Sniffer()
42+
try:
43+
dialect = sniffer.sniff(line, delimiters='\t,;')
44+
return dialect.delimiter
45+
except csv.Error as e:
46+
if "Could not determine delimiter" in str(e):
47+
return '\t'
48+
else:
49+
raise e
4050

4151
def _base_parser(mm_obj: mmap, file_path: str, delimiter: str, skip_files: bool) -> Generator[int, str, None]:
4252
"""Cleaning comments and irrelevant data"""
@@ -49,15 +59,19 @@ def _base_parser(mm_obj: mmap, file_path: str, delimiter: str, skip_files: bool)
4959
try:
5060
for l_num, line in enumerate(iter(mm_obj.readline, b'')):
5161
line = line.decode('utf-8')
52-
row_line = line.split(AnnotationDelimiter[delimiter].value)
53-
row_line = list(map(lambda w: w.rstrip("\r\n"), row_line))
5462

55-
if len(row_line) == 0:
63+
# Skip comments
64+
if (line.startswith('#') or line.startswith('##') or line.startswith('browser') or
65+
line.startswith('track')) and not line.startswith('#CHROM'):
5666
continue
5767

58-
# Skip comments
59-
if (row_line[0].startswith('#') or row_line[0].startswith('##') or row_line[0].startswith('browser') or
60-
row_line[0].startswith('track')) and not row_line[0].startswith('#CHROM'):
68+
if delimiter is None:
69+
delimiter = _detect_delimiter(line)
70+
71+
row_line = re.split(delimiter, line)
72+
row_line = list(map(lambda w: w.rstrip("\r\n"), row_line))
73+
74+
if len(row_line) == 0:
6175
continue
6276

6377
yield l_num, row_line

tests/data/annotation/annotation.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ pattern:
33
- '*.vcf.gz'
44
recursive: false
55
format: 'CSV'
6-
delimiter: 'C'
6+
delimiter: \t
77

88
columns:
99
- 'PLATFORM'

tests/data/annotation/invalid_delimiter.yaml

Lines changed: 0 additions & 36 deletions
This file was deleted.

tests/test_annotation/test_annotation.py

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
import unittest
22
from os import getcwd
3+
import re
34

45
from openvariant.annotation.annotation import Annotation
5-
from openvariant.annotation.config_annotation import DEFAULT_FORMAT, DEFAULT_DELIMITER
6+
from openvariant.annotation.config_annotation import DEFAULT_FORMAT
67

78

89
class TestAnnotation(unittest.TestCase):
@@ -51,15 +52,8 @@ def test_annotation_invalid_format(self):
5152

5253
def test_annotation_delimiter(self):
5354
annotation = Annotation(f'{getcwd()}/tests/data/annotation/annotation.yaml')
54-
self.assertEqual(annotation.delimiter, 'C')
55-
56-
def test_annotation_no_exist_delimiter(self):
57-
annotation = Annotation(f'{getcwd()}/tests/data/annotation/no_exist_delimiter.yaml')
58-
self.assertEqual(annotation.delimiter, DEFAULT_DELIMITER)
59-
60-
def test_annotation_invalid_delimiter(self):
61-
with self.assertRaises(KeyError):
62-
Annotation(f'{getcwd()}/tests/data/annotation/invalid_delimiter.yaml')
55+
print(annotation.delimiter, '\t')
56+
self.assertEqual(annotation.delimiter, '\t')
6357

6458
def test_annotation_columns(self):
6559
res_expect = {'PLATFORM', 'DATASET'}

0 commit comments

Comments
 (0)