From 9727fc9321ec238eca7522922ff087724e641121 Mon Sep 17 00:00:00 2001 From: David Martinez Millan Date: Tue, 20 May 2025 15:18:35 +0200 Subject: [PATCH 1/6] add: be able to parse any delimiter --- openvariant/annotation/annotation.py | 11 +++--- openvariant/annotation/config_annotation.py | 6 ---- openvariant/variant/variant.py | 14 ++++++-- tests/data/annotation/annotation.yaml | 2 +- tests/data/annotation/invalid_delimiter.yaml | 36 -------------------- tests/test_annotation/test_annotation.py | 14 +++----- 6 files changed, 22 insertions(+), 61 deletions(-) delete mode 100644 tests/data/annotation/invalid_delimiter.yaml diff --git a/openvariant/annotation/annotation.py b/openvariant/annotation/annotation.py index 3b9359e..b35dd80 100644 --- a/openvariant/annotation/annotation.py +++ b/openvariant/annotation/annotation.py @@ -5,14 +5,14 @@ """ import logging import re +import codecs from typing import List from yaml import safe_load, YAMLError from openvariant.utils.utils import import_class_from_module from openvariant.annotation.config_annotation import (AnnotationGeneralKeys, AnnotationKeys, AnnotationTypes, - ExcludesKeys, DEFAULT_FORMAT, DEFAULT_DELIMITER, - AnnotationFormat, AnnotationDelimiter) + ExcludesKeys, DEFAULT_FORMAT, AnnotationFormat) def _check_general_keys(annot: dict) -> None: @@ -35,9 +35,7 @@ def _check_general_keys(annot: dict) -> None: raise KeyError(f"'{AnnotationGeneralKeys.FORMAT.value}' key is not a string.") # Delimiter key - if AnnotationGeneralKeys.DELIMITER.value in annot and \ - (not isinstance(annot[AnnotationGeneralKeys.DELIMITER.value], str) or - annot[AnnotationGeneralKeys.DELIMITER.value].upper() not in [e.name for e in AnnotationDelimiter]): + if AnnotationGeneralKeys.DELIMITER.value in annot and not isinstance(annot[AnnotationGeneralKeys.DELIMITER.value], str): raise KeyError(f"'{AnnotationGeneralKeys.DELIMITER.value}' key is not valid or is not a string.") # Columns key @@ -151,7 +149,8 @@ def __init__(self, annotation_path: str) -> None: patterns = raw_annotation[AnnotationGeneralKeys.PATTERN.value] self._patterns = patterns if isinstance(patterns, List) else [patterns] self._recursive = raw_annotation.get(AnnotationGeneralKeys.RECURSIVE.value, True) - self._delimiter = raw_annotation.get(AnnotationGeneralKeys.DELIMITER.value, DEFAULT_DELIMITER).upper() + delimiter = raw_annotation.get(AnnotationGeneralKeys.DELIMITER.value, None) + self._delimiter = None if delimiter is None else codecs.decode(delimiter, 'unicode_escape') self._format = raw_annotation.get(AnnotationGeneralKeys.FORMAT.value, DEFAULT_FORMAT).replace('.', '') self._excludes: dict = {} diff --git a/openvariant/annotation/config_annotation.py b/openvariant/annotation/config_annotation.py index 9cbc52f..beb748c 100644 --- a/openvariant/annotation/config_annotation.py +++ b/openvariant/annotation/config_annotation.py @@ -9,7 +9,6 @@ DEFAULT_FORMAT = 'TSV' DEFAULT_COLUMNS = [] DEFAULT_RECURSIVE = False -DEFAULT_DELIMITER = 'T' class AnnotationGeneralKeys(Enum): @@ -49,11 +48,6 @@ class AnnotationTypes(Enum): MAPPING = 'mapping' -class AnnotationDelimiter(Enum): - T = "\t" - C = "," - - class AnnotationFormat(Enum): TSV = "\t" CSV = "," diff --git a/openvariant/variant/variant.py b/openvariant/variant/variant.py index e31e270..fee9048 100644 --- a/openvariant/variant/variant.py +++ b/openvariant/variant/variant.py @@ -18,7 +18,7 @@ from openvariant.annotation.annotation import Annotation from openvariant.annotation.builder import MappingBuilder -from openvariant.annotation.config_annotation import AnnotationFormat, AnnotationTypes, AnnotationDelimiter +from openvariant.annotation.config_annotation import AnnotationFormat, AnnotationTypes from openvariant.utils.utils import check_extension, import_class_from_module from openvariant.variant.where import skip, parse_where @@ -37,6 +37,14 @@ def _open_file(file_path: str, mode='r+b'): return mm, file +def _detect_delimiter(line: str): + """Detects the dominant delimiter in a line""" + counts = { + '\t': line.count('\t'), + ',': line.count(','), + ';': line.count(';') + } + return max(counts, key=counts.get) def _base_parser(mm_obj: mmap, file_path: str, delimiter: str, skip_files: bool) -> Generator[int, str, None]: """Cleaning comments and irrelevant data""" @@ -49,7 +57,9 @@ def _base_parser(mm_obj: mmap, file_path: str, delimiter: str, skip_files: bool) try: for l_num, line in enumerate(iter(mm_obj.readline, b'')): line = line.decode('utf-8') - row_line = line.split(AnnotationDelimiter[delimiter].value) + delimiter = _detect_delimiter(line) if l_num == 0 else delimiter + + row_line = re.split(delimiter, line) row_line = list(map(lambda w: w.rstrip("\r\n"), row_line)) if len(row_line) == 0: diff --git a/tests/data/annotation/annotation.yaml b/tests/data/annotation/annotation.yaml index b935f91..35fd40e 100644 --- a/tests/data/annotation/annotation.yaml +++ b/tests/data/annotation/annotation.yaml @@ -3,7 +3,7 @@ pattern: - '*.vcf.gz' recursive: false format: 'CSV' -delimiter: 'C' +delimiter: \t columns: - 'PLATFORM' diff --git a/tests/data/annotation/invalid_delimiter.yaml b/tests/data/annotation/invalid_delimiter.yaml deleted file mode 100644 index ac0c331..0000000 --- a/tests/data/annotation/invalid_delimiter.yaml +++ /dev/null @@ -1,36 +0,0 @@ -pattern: - - '*.maf' - - '*.vcf.gz' -recursive: false -format: 'CSV' -delimiter: 'INVALID' - -annotation: - - type: 'static' - field: 'PLATFORM' - value: 'WSG' - - - type: 'internal' - field: 'POSITION' - fieldSource: - - 'Position' - - 'Start_Position' - - - type: 'filename' - field: 'DATASET' - function: 'lambda x: "{}".format(x.lower()[:-4])' - - - type: 'dirname' - field: 'PROJECT' - function: 'lambda x: "{}".format(x.lower())' - - - type: 'plugin' - plugin: 'alteration_type' - field: 'ALT_TYPE' - -exclude: - - field: 'MUTATION_REF' - value: 1234 - - - field: 'DATASET' - value: 'ucs' diff --git a/tests/test_annotation/test_annotation.py b/tests/test_annotation/test_annotation.py index 6690a7a..c5fc93c 100644 --- a/tests/test_annotation/test_annotation.py +++ b/tests/test_annotation/test_annotation.py @@ -1,8 +1,9 @@ import unittest from os import getcwd +import re from openvariant.annotation.annotation import Annotation -from openvariant.annotation.config_annotation import DEFAULT_FORMAT, DEFAULT_DELIMITER +from openvariant.annotation.config_annotation import DEFAULT_FORMAT class TestAnnotation(unittest.TestCase): @@ -51,15 +52,8 @@ def test_annotation_invalid_format(self): def test_annotation_delimiter(self): annotation = Annotation(f'{getcwd()}/tests/data/annotation/annotation.yaml') - self.assertEqual(annotation.delimiter, 'C') - - def test_annotation_no_exist_delimiter(self): - annotation = Annotation(f'{getcwd()}/tests/data/annotation/no_exist_delimiter.yaml') - self.assertEqual(annotation.delimiter, DEFAULT_DELIMITER) - - def test_annotation_invalid_delimiter(self): - with self.assertRaises(KeyError): - Annotation(f'{getcwd()}/tests/data/annotation/invalid_delimiter.yaml') + print(annotation.delimiter, '\t') + self.assertEqual(annotation.delimiter, '\t') def test_annotation_columns(self): res_expect = {'PLATFORM', 'DATASET'} From 7a78ba0f219094e425df47435101eb53184b7672 Mon Sep 17 00:00:00 2001 From: David Martinez Millan Date: Tue, 20 May 2025 17:01:13 +0200 Subject: [PATCH 2/6] fix: delimiter check --- openvariant/variant/variant.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openvariant/variant/variant.py b/openvariant/variant/variant.py index fee9048..23962de 100644 --- a/openvariant/variant/variant.py +++ b/openvariant/variant/variant.py @@ -57,7 +57,8 @@ def _base_parser(mm_obj: mmap, file_path: str, delimiter: str, skip_files: bool) try: for l_num, line in enumerate(iter(mm_obj.readline, b'')): line = line.decode('utf-8') - delimiter = _detect_delimiter(line) if l_num == 0 else delimiter + if delimiter is None: + delimiter = _detect_delimiter(line) if l_num == 0 else delimiter row_line = re.split(delimiter, line) row_line = list(map(lambda w: w.rstrip("\r\n"), row_line)) @@ -69,7 +70,6 @@ def _base_parser(mm_obj: mmap, file_path: str, delimiter: str, skip_files: bool) if (row_line[0].startswith('#') or row_line[0].startswith('##') or row_line[0].startswith('browser') or row_line[0].startswith('track')) and not row_line[0].startswith('#CHROM'): continue - yield l_num, row_line except Exception as e: if skip_files: From 32f2b3bd9db958f4bf162c7263d979f117481382 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20Mart=C3=ADnez=20Mill=C3=A1n?= <10314744+dmartmillan@users.noreply.github.com> Date: Thu, 22 May 2025 14:24:50 +0200 Subject: [PATCH 3/6] fix: update detect delimiter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos López-Elorduy <107858804+CarlosLopezElorduy@users.noreply.github.com> --- openvariant/variant/variant.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/openvariant/variant/variant.py b/openvariant/variant/variant.py index 23962de..c9062b4 100644 --- a/openvariant/variant/variant.py +++ b/openvariant/variant/variant.py @@ -38,13 +38,9 @@ def _open_file(file_path: str, mode='r+b'): return mm, file def _detect_delimiter(line: str): - """Detects the dominant delimiter in a line""" - counts = { - '\t': line.count('\t'), - ',': line.count(','), - ';': line.count(';') - } - return max(counts, key=counts.get) + sniffer = csv.Sniffer() + dialect = sniffer.sniff(line, delimiters='\t,;') + return dialect.delimiter def _base_parser(mm_obj: mmap, file_path: str, delimiter: str, skip_files: bool) -> Generator[int, str, None]: """Cleaning comments and irrelevant data""" From 7139ff842af69251312bd66e840eeead40f46c1f Mon Sep 17 00:00:00 2001 From: David Martinez Millan Date: Thu, 22 May 2025 14:25:22 +0200 Subject: [PATCH 4/6] fix: skip comments order --- openvariant/variant/variant.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/openvariant/variant/variant.py b/openvariant/variant/variant.py index 23962de..b42b1a4 100644 --- a/openvariant/variant/variant.py +++ b/openvariant/variant/variant.py @@ -57,8 +57,14 @@ def _base_parser(mm_obj: mmap, file_path: str, delimiter: str, skip_files: bool) try: for l_num, line in enumerate(iter(mm_obj.readline, b'')): line = line.decode('utf-8') + + # Skip comments + if (line.startswith('#') or line.startswith('##') or line.startswith('browser') or + line.startswith('track')) and not line.startswith('#CHROM'): + continue + if delimiter is None: - delimiter = _detect_delimiter(line) if l_num == 0 else delimiter + delimiter = _detect_delimiter(line) row_line = re.split(delimiter, line) row_line = list(map(lambda w: w.rstrip("\r\n"), row_line)) @@ -66,10 +72,6 @@ def _base_parser(mm_obj: mmap, file_path: str, delimiter: str, skip_files: bool) if len(row_line) == 0: continue - # Skip comments - if (row_line[0].startswith('#') or row_line[0].startswith('##') or row_line[0].startswith('browser') or - row_line[0].startswith('track')) and not row_line[0].startswith('#CHROM'): - continue yield l_num, row_line except Exception as e: if skip_files: From a7312462b049ee4436c57bc6c9f4895e3353406c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20Mart=C3=ADnez=20Mill=C3=A1n?= <10314744+dmartmillan@users.noreply.github.com> Date: Fri, 23 May 2025 12:49:52 +0200 Subject: [PATCH 5/6] fix: update detect delimiter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos López-Elorduy <107858804+CarlosLopezElorduy@users.noreply.github.com> --- openvariant/variant/variant.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/openvariant/variant/variant.py b/openvariant/variant/variant.py index 78e38a2..47fb06b 100644 --- a/openvariant/variant/variant.py +++ b/openvariant/variant/variant.py @@ -39,8 +39,14 @@ def _open_file(file_path: str, mode='r+b'): def _detect_delimiter(line: str): sniffer = csv.Sniffer() - dialect = sniffer.sniff(line, delimiters='\t,;') - return dialect.delimiter + try: + dialect = sniffer.sniff(line, delimiters='\t,;') + return dialect.delimiter + except csv.Error as e: + if "Could not determine delimiter" in str(e): + return '\t' + else: + raise e def _base_parser(mm_obj: mmap, file_path: str, delimiter: str, skip_files: bool) -> Generator[int, str, None]: """Cleaning comments and irrelevant data""" From ca427ed10c42b622e5f9f71933ed118db8c57698 Mon Sep 17 00:00:00 2001 From: David Martinez Millan Date: Fri, 23 May 2025 13:14:28 +0200 Subject: [PATCH 6/6] fix: added comment --- openvariant/variant/variant.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openvariant/variant/variant.py b/openvariant/variant/variant.py index 47fb06b..ea790ed 100644 --- a/openvariant/variant/variant.py +++ b/openvariant/variant/variant.py @@ -25,7 +25,6 @@ def _open_file(file_path: str, mode='r+b'): """Open raw files or compressed files""" - if file_path.endswith('xz'): open_method = lzma.open file = open_method(file_path, mode) @@ -38,6 +37,7 @@ def _open_file(file_path: str, mode='r+b'): return mm, file def _detect_delimiter(line: str): + """Detects the dominant delimiter in a line""" sniffer = csv.Sniffer() try: dialect = sniffer.sniff(line, delimiters='\t,;')