11"""Read database information"""
22
33from collections import defaultdict
4+ import re
45
56import mirtop .libs .logger as mylog
67
@@ -26,6 +27,8 @@ def guess_database(args):
2627
2728
2829def _guess_database_file (gff , database = None ):
30+ if database :
31+ return database
2932 with open (gff ) as in_handle :
3033 for line in in_handle :
3134 if not line .startswith ("#" ):
@@ -56,7 +59,7 @@ def get_primary_transcript(database):
5659 raise ValueError ("Only miRBase is supported for this action." )
5760
5861
59- def read_gtf_to_mirna (gtf ):
62+ def read_gtf_to_mirna (gtf , database = None ):
6063 """
6164 Load GTF file with precursor positions on genome.
6265
@@ -70,9 +73,11 @@ def read_gtf_to_mirna(gtf):
7073 """
7174 if not gtf :
7275 return gtf
73- if _guess_database_file (gtf ).find ("miRBase" ) > - 1 :
76+ if not database :
77+ database = _guess_database_file (gtf )
78+ if database .find ("miRBase" ) > - 1 :
7479 mapped = read_gtf_to_precursor_mirbase (gtf , format = "genomic" )
75- elif _guess_database_file ( gtf ) .find ("MirGeneDB" ) > - 1 :
80+ elif database .find ("MirGeneDB" ) > - 1 :
7681 mapped = read_gtf_to_precursor_mirgenedb (gtf , format = "genomic" )
7782 else :
7883 logger .info ("Database different than miRBase or MirGeneDB" )
@@ -145,7 +150,7 @@ def read_gtf_chr2mirna2(gtf): # to remove
145150 return db_mir
146151
147152
148- def read_gtf_to_precursor (gtf ,database ):
153+ def read_gtf_to_precursor (gtf , database = None ):
149154 """
150155 Load GTF file with precursor positions on genome
151156 Return dict with key being precursor name and
@@ -163,9 +168,11 @@ def read_gtf_to_precursor(gtf,database):
163168 """
164169 if not gtf :
165170 return gtf
166- if _guess_database_file (gtf ,database ).find ("miRBase" ) > - 1 :
171+ if not database :
172+ database = _guess_database_file (gtf )
173+ if database .find ("miRBase" ) > - 1 :
167174 mapped = read_gtf_to_precursor_mirbase (gtf )
168- elif _guess_database_file ( gtf , database ) .find ("MirGeneDB" ) > - 1 :
175+ elif database .find ("MirGeneDB" ) > - 1 :
169176 mapped = read_gtf_to_precursor_mirgenedb (gtf )
170177 else :
171178 logger .info ("Database different than miRBase or MirGeneDB" )
@@ -284,6 +291,7 @@ def read_gtf_to_precursor_mirgenedb(gtf, format="precursor"):
284291 db = defaultdict (list )
285292 db_mir = defaultdict (list )
286293 id_dict = dict ()
294+ pattern = r'(_3p\*?|_5p\*?)'
287295 with open (gtf ) as in_handle :
288296 for line in in_handle :
289297 if line .startswith ("#" ):
@@ -299,7 +307,10 @@ def read_gtf_to_precursor_mirgenedb(gtf, format="precursor"):
299307 if cols [2 ] == "miRNA" :
300308 idname_mi = [n .split ("=" )[1 ] for n in cols [- 1 ].split (";" )
301309 if n .startswith ("ID" )][0 ]
302- parent = "%s_pre" % idname_mi .split ("_" )[0 ]
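310+ # Build the precursor ID by removing the arm suffix (_3p/_5p, optionally followed by '*') with a regex; str.replace("_3p.*", "") was tried but matches literally, not as a pattern.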
311+ parent = re .sub (pattern , '' , idname_mi )
312+ parent = "%s_pre" % parent
313+ # import pdb; pdb.set_trace()
303314 db_mir [(parent , name )] = [chrom ,
304315 int (start ), int (end ),
305316 strand , parent ]
0 commit comments