From 9727fc9321ec238eca7522922ff087724e641121 Mon Sep 17 00:00:00 2001
From: David Martinez Millan <dmartinezmillan@hotmail.com>
Date: Tue, 20 May 2025 15:18:35 +0200
Subject: [PATCH 1/6] add: be able to parse any delimiter

---
 openvariant/annotation/annotation.py         | 11 +++---
 openvariant/annotation/config_annotation.py  |  6 ----
 openvariant/variant/variant.py               | 14 ++++++--
 tests/data/annotation/annotation.yaml        |  2 +-
 tests/data/annotation/invalid_delimiter.yaml | 36 --------------------
 tests/test_annotation/test_annotation.py     | 14 +++-----
 6 files changed, 22 insertions(+), 61 deletions(-)
 delete mode 100644 tests/data/annotation/invalid_delimiter.yaml

diff --git a/openvariant/annotation/annotation.py b/openvariant/annotation/annotation.py
index 3b9359e..b35dd80 100644
--- a/openvariant/annotation/annotation.py
+++ b/openvariant/annotation/annotation.py
@@ -5,14 +5,14 @@
 """
 import logging
 import re
+import codecs
 
 from typing import List
 from yaml import safe_load, YAMLError
 
 from openvariant.utils.utils import import_class_from_module
 from openvariant.annotation.config_annotation import (AnnotationGeneralKeys, AnnotationKeys, AnnotationTypes,
-                                                      ExcludesKeys, DEFAULT_FORMAT, DEFAULT_DELIMITER,
-                                                      AnnotationFormat, AnnotationDelimiter)
+                                                      ExcludesKeys, DEFAULT_FORMAT, AnnotationFormat)
 
 
 def _check_general_keys(annot: dict) -> None:
@@ -35,9 +35,7 @@ def _check_general_keys(annot: dict) -> None:
         raise KeyError(f"'{AnnotationGeneralKeys.FORMAT.value}' key is not a string.")
 
     # Delimiter key
-    if AnnotationGeneralKeys.DELIMITER.value in annot and \
-            (not isinstance(annot[AnnotationGeneralKeys.DELIMITER.value], str) or
-             annot[AnnotationGeneralKeys.DELIMITER.value].upper() not in [e.name for e in AnnotationDelimiter]):
+    if AnnotationGeneralKeys.DELIMITER.value in annot and not isinstance(annot[AnnotationGeneralKeys.DELIMITER.value], str):
         raise KeyError(f"'{AnnotationGeneralKeys.DELIMITER.value}' key is not valid or is not a string.")
 
     # Columns key
@@ -151,7 +149,8 @@ def __init__(self, annotation_path: str) -> None:
         patterns = raw_annotation[AnnotationGeneralKeys.PATTERN.value]
         self._patterns = patterns if isinstance(patterns, List) else [patterns]
         self._recursive = raw_annotation.get(AnnotationGeneralKeys.RECURSIVE.value, True)
-        self._delimiter = raw_annotation.get(AnnotationGeneralKeys.DELIMITER.value, DEFAULT_DELIMITER).upper()
+        delimiter = raw_annotation.get(AnnotationGeneralKeys.DELIMITER.value, None)
+        self._delimiter = None if delimiter is None else codecs.decode(delimiter, 'unicode_escape')
         self._format = raw_annotation.get(AnnotationGeneralKeys.FORMAT.value, DEFAULT_FORMAT).replace('.', '')
 
         self._excludes: dict = {}
diff --git a/openvariant/annotation/config_annotation.py b/openvariant/annotation/config_annotation.py
index 9cbc52f..beb748c 100644
--- a/openvariant/annotation/config_annotation.py
+++ b/openvariant/annotation/config_annotation.py
@@ -9,7 +9,6 @@
 DEFAULT_FORMAT = 'TSV'
 DEFAULT_COLUMNS = []
 DEFAULT_RECURSIVE = False
-DEFAULT_DELIMITER = 'T'
 
 
 class AnnotationGeneralKeys(Enum):
@@ -49,11 +48,6 @@ class AnnotationTypes(Enum):
     MAPPING = 'mapping'
 
 
-class AnnotationDelimiter(Enum):
-    T = "\t"
-    C = ","
-
-
 class AnnotationFormat(Enum):
     TSV = "\t"
     CSV = ","
diff --git a/openvariant/variant/variant.py b/openvariant/variant/variant.py
index e31e270..fee9048 100644
--- a/openvariant/variant/variant.py
+++ b/openvariant/variant/variant.py
@@ -18,7 +18,7 @@
 
 from openvariant.annotation.annotation import Annotation
 from openvariant.annotation.builder import MappingBuilder
-from openvariant.annotation.config_annotation import AnnotationFormat, AnnotationTypes, AnnotationDelimiter
+from openvariant.annotation.config_annotation import AnnotationFormat, AnnotationTypes
 from openvariant.utils.utils import check_extension, import_class_from_module
 from openvariant.variant.where import skip, parse_where
 
@@ -37,6 +37,14 @@ def _open_file(file_path: str, mode='r+b'):
 
     return mm, file
 
+def _detect_delimiter(line: str):
+    """Detects the dominant delimiter in a line"""
+    counts = {
+        '\t': line.count('\t'),
+        ',': line.count(','),
+        ';': line.count(';')
+    }
+    return max(counts, key=counts.get)
 
 def _base_parser(mm_obj: mmap, file_path: str, delimiter: str, skip_files: bool) -> Generator[int, str, None]:
     """Cleaning comments and irrelevant data"""
@@ -49,7 +57,9 @@ def _base_parser(mm_obj: mmap, file_path: str, delimiter: str, skip_files: bool)
     try:
         for l_num, line in enumerate(iter(mm_obj.readline, b'')):
             line = line.decode('utf-8')
-            row_line = line.split(AnnotationDelimiter[delimiter].value)
+            delimiter = _detect_delimiter(line) if l_num == 0 else delimiter
+
+            row_line = re.split(delimiter, line)
             row_line = list(map(lambda w: w.rstrip("\r\n"), row_line))
 
             if len(row_line) == 0:
diff --git a/tests/data/annotation/annotation.yaml b/tests/data/annotation/annotation.yaml
index b935f91..35fd40e 100644
--- a/tests/data/annotation/annotation.yaml
+++ b/tests/data/annotation/annotation.yaml
@@ -3,7 +3,7 @@ pattern:
   - '*.vcf.gz'
 recursive: false
 format: 'CSV'
-delimiter: 'C'
+delimiter: \t
 
 columns:
   - 'PLATFORM'
diff --git a/tests/data/annotation/invalid_delimiter.yaml b/tests/data/annotation/invalid_delimiter.yaml
deleted file mode 100644
index ac0c331..0000000
--- a/tests/data/annotation/invalid_delimiter.yaml
+++ /dev/null
@@ -1,36 +0,0 @@
-pattern:
-  - '*.maf'
-  - '*.vcf.gz'
-recursive: false
-format: 'CSV'
-delimiter: 'INVALID'
-
-annotation:
-  - type: 'static'
-    field: 'PLATFORM'
-    value: 'WSG'
-
-  - type: 'internal'
-    field: 'POSITION'
-    fieldSource:
-      - 'Position'
-      - 'Start_Position'
-
-  - type: 'filename'
-    field: 'DATASET'
-    function: 'lambda x: "{}".format(x.lower()[:-4])'
-
-  - type: 'dirname'
-    field: 'PROJECT'
-    function: 'lambda x: "{}".format(x.lower())'
-
-  - type: 'plugin'
-    plugin: 'alteration_type'
-    field: 'ALT_TYPE'
-
-exclude:
-  - field: 'MUTATION_REF'
-    value: 1234
-
-  - field: 'DATASET'
-    value: 'ucs'
diff --git a/tests/test_annotation/test_annotation.py b/tests/test_annotation/test_annotation.py
index 6690a7a..c5fc93c 100644
--- a/tests/test_annotation/test_annotation.py
+++ b/tests/test_annotation/test_annotation.py
@@ -1,8 +1,9 @@
 import unittest
 from os import getcwd
+import re
 
 from openvariant.annotation.annotation import Annotation
-from openvariant.annotation.config_annotation import DEFAULT_FORMAT, DEFAULT_DELIMITER
+from openvariant.annotation.config_annotation import DEFAULT_FORMAT
 
 
 class TestAnnotation(unittest.TestCase):
@@ -51,15 +52,8 @@ def test_annotation_invalid_format(self):
 
     def test_annotation_delimiter(self):
         annotation = Annotation(f'{getcwd()}/tests/data/annotation/annotation.yaml')
-        self.assertEqual(annotation.delimiter, 'C')
-
-    def test_annotation_no_exist_delimiter(self):
-        annotation = Annotation(f'{getcwd()}/tests/data/annotation/no_exist_delimiter.yaml')
-        self.assertEqual(annotation.delimiter, DEFAULT_DELIMITER)
-
-    def test_annotation_invalid_delimiter(self):
-        with self.assertRaises(KeyError):
-            Annotation(f'{getcwd()}/tests/data/annotation/invalid_delimiter.yaml')
+        print(annotation.delimiter, '\t')
+        self.assertEqual(annotation.delimiter, '\t')
 
     def test_annotation_columns(self):
         res_expect = {'PLATFORM', 'DATASET'}

From 7a78ba0f219094e425df47435101eb53184b7672 Mon Sep 17 00:00:00 2001
From: David Martinez Millan <dmartinezmillan@hotmail.com>
Date: Tue, 20 May 2025 17:01:13 +0200
Subject: [PATCH 2/6] fix: delimiter check

---
 openvariant/variant/variant.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/openvariant/variant/variant.py b/openvariant/variant/variant.py
index fee9048..23962de 100644
--- a/openvariant/variant/variant.py
+++ b/openvariant/variant/variant.py
@@ -57,7 +57,8 @@ def _base_parser(mm_obj: mmap, file_path: str, delimiter: str, skip_files: bool)
     try:
         for l_num, line in enumerate(iter(mm_obj.readline, b'')):
             line = line.decode('utf-8')
-            delimiter = _detect_delimiter(line) if l_num == 0 else delimiter
+            if delimiter is None:
+                delimiter = _detect_delimiter(line) if l_num == 0 else delimiter
 
             row_line = re.split(delimiter, line)
             row_line = list(map(lambda w: w.rstrip("\r\n"), row_line))
@@ -69,7 +70,6 @@ def _base_parser(mm_obj: mmap, file_path: str, delimiter: str, skip_files: bool)
             if (row_line[0].startswith('#') or row_line[0].startswith('##') or row_line[0].startswith('browser') or
                 row_line[0].startswith('track')) and not row_line[0].startswith('#CHROM'):
                 continue
-
             yield l_num, row_line
     except Exception as e:
         if skip_files:

From 32f2b3bd9db958f4bf162c7263d979f117481382 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?David=20Mart=C3=ADnez=20Mill=C3=A1n?=
 <10314744+dmartmillan@users.noreply.github.com>
Date: Thu, 22 May 2025 14:24:50 +0200
Subject: [PATCH 3/6] fix: detect delimiter with csv.Sniffer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Carlos López-Elorduy <107858804+CarlosLopezElorduy@users.noreply.github.com>
---
 openvariant/variant/variant.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/openvariant/variant/variant.py b/openvariant/variant/variant.py
index 23962de..c9062b4 100644
--- a/openvariant/variant/variant.py
+++ b/openvariant/variant/variant.py
@@ -38,13 +38,9 @@ def _open_file(file_path: str, mode='r+b'):
     return mm, file
 
 def _detect_delimiter(line: str):
-    """Detects the dominant delimiter in a line"""
-    counts = {
-        '\t': line.count('\t'),
-        ',': line.count(','),
-        ';': line.count(';')
-    }
-    return max(counts, key=counts.get)
+    sniffer = csv.Sniffer()
+    dialect = sniffer.sniff(line, delimiters='\t,;')
+    return dialect.delimiter
 
 def _base_parser(mm_obj: mmap, file_path: str, delimiter: str, skip_files: bool) -> Generator[int, str, None]:
     """Cleaning comments and irrelevant data"""

From 7139ff842af69251312bd66e840eeead40f46c1f Mon Sep 17 00:00:00 2001
From: David Martinez Millan <dmartinezmillan@hotmail.com>
Date: Thu, 22 May 2025 14:25:22 +0200
Subject: [PATCH 4/6] fix: skip comments order

---
 openvariant/variant/variant.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/openvariant/variant/variant.py b/openvariant/variant/variant.py
index 23962de..b42b1a4 100644
--- a/openvariant/variant/variant.py
+++ b/openvariant/variant/variant.py
@@ -57,8 +57,14 @@ def _base_parser(mm_obj: mmap, file_path: str, delimiter: str, skip_files: bool)
     try:
         for l_num, line in enumerate(iter(mm_obj.readline, b'')):
             line = line.decode('utf-8')
+
+            # Skip comments
+            if (line.startswith('#') or line.startswith('##') or line.startswith('browser') or
+                line.startswith('track')) and not line.startswith('#CHROM'):
+                continue
+
             if delimiter is None:
-                delimiter = _detect_delimiter(line) if l_num == 0 else delimiter
+                delimiter = _detect_delimiter(line)
 
             row_line = re.split(delimiter, line)
             row_line = list(map(lambda w: w.rstrip("\r\n"), row_line))
@@ -66,10 +72,6 @@ def _base_parser(mm_obj: mmap, file_path: str, delimiter: str, skip_files: bool)
             if len(row_line) == 0:
                 continue
 
-            # Skip comments
-            if (row_line[0].startswith('#') or row_line[0].startswith('##') or row_line[0].startswith('browser') or
-                row_line[0].startswith('track')) and not row_line[0].startswith('#CHROM'):
-                continue
             yield l_num, row_line
     except Exception as e:
         if skip_files:

From a7312462b049ee4436c57bc6c9f4895e3353406c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?David=20Mart=C3=ADnez=20Mill=C3=A1n?=
 <10314744+dmartmillan@users.noreply.github.com>
Date: Fri, 23 May 2025 12:49:52 +0200
Subject: [PATCH 5/6] fix: default to tab when delimiter cannot be sniffed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Carlos López-Elorduy <107858804+CarlosLopezElorduy@users.noreply.github.com>
---
 openvariant/variant/variant.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/openvariant/variant/variant.py b/openvariant/variant/variant.py
index 78e38a2..47fb06b 100644
--- a/openvariant/variant/variant.py
+++ b/openvariant/variant/variant.py
@@ -39,8 +39,14 @@ def _open_file(file_path: str, mode='r+b'):
 
 def _detect_delimiter(line: str):
     sniffer = csv.Sniffer()
-    dialect = sniffer.sniff(line, delimiters='\t,;')
-    return dialect.delimiter
+    try:
+        dialect = sniffer.sniff(line, delimiters='\t,;')
+        return dialect.delimiter
+    except csv.Error as e:
+        if "Could not determine delimiter" in str(e):
+            return '\t'
+        else:
+            raise e
 
 def _base_parser(mm_obj: mmap, file_path: str, delimiter: str, skip_files: bool) -> Generator[int, str, None]:
     """Cleaning comments and irrelevant data"""

From ca427ed10c42b622e5f9f71933ed118db8c57698 Mon Sep 17 00:00:00 2001
From: David Martinez Millan <dmartinezmillan@hotmail.com>
Date: Fri, 23 May 2025 13:14:28 +0200
Subject: [PATCH 6/6] fix: add docstring to _detect_delimiter

---
 openvariant/variant/variant.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openvariant/variant/variant.py b/openvariant/variant/variant.py
index 47fb06b..ea790ed 100644
--- a/openvariant/variant/variant.py
+++ b/openvariant/variant/variant.py
@@ -25,7 +25,6 @@
 
 def _open_file(file_path: str, mode='r+b'):
     """Open raw files or compressed files"""
-
     if file_path.endswith('xz'):
         open_method = lzma.open
         file = open_method(file_path, mode)
@@ -38,6 +37,7 @@ def _open_file(file_path: str, mode='r+b'):
     return mm, file
 
 def _detect_delimiter(line: str):
+    """Detects the dominant delimiter in a line"""
     sniffer = csv.Sniffer()
     try:
         dialect = sniffer.sniff(line, delimiters='\t,;')