Skip to content

Commit 91004b1

Browse files
authored
Be more flexible on attribute values in GTFs
1 parent 9ea05fc commit 91004b1

File tree

1 file changed

+9
-4
lines changed

1 file changed

+9
-4
lines changed

bin/tx2gene.py

Lines changed: 9 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@
66
import argparse
77
import glob
88
import os
9+
import re
910
from collections import Counter, defaultdict, OrderedDict
1011
from collections.abc import Set
1112
from typing import Dict
@@ -50,14 +51,18 @@ def discover_transcript_attribute(gtf_file: str, transcripts: Set[str]) -> str:
5051
Returns:
5152
str: The attribute name that corresponds to transcripts in the GTF file.
5253
"""
54+
5355
votes = Counter()
5456
with open(gtf_file) as inh:
55-
# Read GTF file, skipping header lines
57+
# Read GTF file, skipping header lines
5658
for line in filter(lambda x: not x.startswith("#"), inh):
5759
cols = line.split("\t")
58-
# Parse attribute column and update votes for each attribute found
59-
attributes = dict(item.strip().split(" ", 1) for item in cols[8].split(";") if item.strip())
60-
votes.update(key for key, value in attributes.items() if value.strip('"') in transcripts)
60+
61+
# Use regular expression to correctly split the attributes string
62+
attributes_str = cols[8]
63+
attributes = dict(re.findall(r'(\S+) "(.*?)(?<!\\)";', attributes_str))
64+
65+
votes.update(key for key, value in attributes.items())
6166

6267
if not votes:
6368
# Log a warning if no matching attribute is found

0 commit comments

Comments
 (0)