Skip to content

Commit 18238f4

Browse files
authored
Merge pull request #93 from miRTop/fix_mirgenedb_parsing_name_error
fix mirgenedb parser
2 parents 3ac54c4 + 28aa657 commit 18238f4

File tree

5 files changed

+24
-11
lines changed

5 files changed

+24
-11
lines changed

data/examples/annotate/mirtop.db

52 KB
Binary file not shown.

environment.yml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,4 +4,5 @@ dependencies:
44
- bioconda::pybedtools
55
- bioconda::samtools=1.21
66
- conda-forge::pandas
7+
- conda-forge::sqlite
78
- conda-forge::biopython=1.83

mirtop/gff/__init__.py

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,7 @@ def reader(args):
2828
args.database = database
2929
precursors = fasta.read_precursor(args.hairpin, args.sps)
3030
args.precursors = precursors
31-
matures = mapper.read_gtf_to_precursor(args.gtf,database)
31+
matures = mapper.read_gtf_to_precursor(args.gtf, database)
3232
args.matures = matures
3333
# TODO check numbers of miRNA and precursors read
3434
# TODO print message if numbers mismatch
@@ -78,13 +78,14 @@ def reader(args):
7878
def _write(lines, header, fn, args = None):
7979
out_handle = open(fn, 'w')
8080
print(header, file=out_handle)
81-
mapper = read_gtf_to_mirna(args.gtf)
81+
database = mapper.guess_database(args)
82+
dbmapper = read_gtf_to_mirna(args.gtf, database)
8283
for m in lines:
8384
for s in sorted(lines[m].keys()):
8485
for hit in lines[m][s]:
8586
# TODO: convert to genomic if args.out_genomic
8687
if args and args.out_genomic:
87-
lifted = body.lift_to_genome(hit[4], mapper)
88+
lifted = body.lift_to_genome(hit[4], dbmapper)
8889
print(lifted, file=out_handle)
8990
else:
9091
print(hit[4], file=out_handle)

mirtop/mirna/mapper.py

Lines changed: 18 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
"""Read database information"""
22

33
from collections import defaultdict
4+
import re
45

56
import mirtop.libs.logger as mylog
67

@@ -26,6 +27,8 @@ def guess_database(args):
2627

2728

2829
def _guess_database_file(gff, database=None):
30+
if database:
31+
return database
2932
with open(gff) as in_handle:
3033
for line in in_handle:
3134
if not line.startswith("#"):
@@ -56,7 +59,7 @@ def get_primary_transcript(database):
5659
raise ValueError("Only miRBase is supported for this action.")
5760

5861

59-
def read_gtf_to_mirna(gtf):
62+
def read_gtf_to_mirna(gtf, database=None):
6063
"""
6164
Load GTF file with precursor positions on genome.
6265
@@ -70,9 +73,11 @@ def read_gtf_to_mirna(gtf):
7073
"""
7174
if not gtf:
7275
return gtf
73-
if _guess_database_file(gtf).find("miRBase") > -1:
76+
if not database:
77+
database = _guess_database_file(gtf)
78+
if database.find("miRBase") > -1:
7479
mapped = read_gtf_to_precursor_mirbase(gtf, format="genomic")
75-
elif _guess_database_file(gtf).find("MirGeneDB") > -1:
80+
elif database.find("MirGeneDB") > -1:
7681
mapped = read_gtf_to_precursor_mirgenedb(gtf, format="genomic")
7782
else:
7883
logger.info("Database different than miRBase or MirGeneDB")
@@ -145,7 +150,7 @@ def read_gtf_chr2mirna2(gtf): # to remove
145150
return db_mir
146151

147152

148-
def read_gtf_to_precursor(gtf,database):
153+
def read_gtf_to_precursor(gtf, database=None):
149154
"""
150155
Load GTF file with precursor positions on genome
151156
Return dict with key being precursor name and
@@ -163,9 +168,11 @@ def read_gtf_to_precursor(gtf,database):
163168
"""
164169
if not gtf:
165170
return gtf
166-
if _guess_database_file(gtf,database).find("miRBase") > -1:
171+
if not database:
172+
database = _guess_database_file(gtf)
173+
if database.find("miRBase") > -1:
167174
mapped = read_gtf_to_precursor_mirbase(gtf)
168-
elif _guess_database_file(gtf,database).find("MirGeneDB") > -1:
175+
elif database.find("MirGeneDB") > -1:
169176
mapped = read_gtf_to_precursor_mirgenedb(gtf)
170177
else:
171178
logger.info("Database different than miRBase or MirGeneDB")
@@ -284,6 +291,7 @@ def read_gtf_to_precursor_mirgenedb(gtf, format="precursor"):
284291
db = defaultdict(list)
285292
db_mir = defaultdict(list)
286293
id_dict = dict()
294+
pattern = r'(_3p\*?|_5p\*?)'
287295
with open(gtf) as in_handle:
288296
for line in in_handle:
289297
if line.startswith("#"):
@@ -299,7 +307,10 @@ def read_gtf_to_precursor_mirgenedb(gtf, format="precursor"):
299307
if cols[2] == "miRNA":
300308
idname_mi = [n.split("=")[1] for n in cols[-1].split(";")
301309
if n.startswith("ID")][0]
302-
parent = "%s_pre" % idname_mi.split("_")[0]
310+
# parent = "%s_pre" % idname_mi.replace("_3p.*", "").replace("_5p.*", "")
311+
parent = re.sub(pattern, '', idname_mi)
312+
parent = "%s_pre" % parent
313+
# import pdb; pdb.set_trace()
303314
db_mir[(parent, name)] = [chrom,
304315
int(start), int(end),
305316
strand, parent]

setup.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@
33
import os
44
from setuptools import setup, find_packages
55

6-
version = '0.4.29'
6+
version = '0.4.30'
77
url = 'http://github.com/mirtop/mirtop'
88

99

0 commit comments

Comments
 (0)