11from kipoiseq .extractors import SingleVariantProteinVCFSeqExtractor , TranscriptSeqExtractor
22from kipoiseq .extractors import GenericMultiIntervalSeqExtractor , GenericSingleVariantMultiIntervalVCFSeqExtractor , \
33 FastaStringExtractor , UTRFetcher , MultiSampleVCF , SingleVariantMatcher
4- from kipoi .data import SampleIterator
4+
5+ from kipoi .data import SampleIterator , kipoi_dataloader
6+ from kipoi_conda .dependencies import Dependencies
7+ from kipoi .specs import Author
58
69__all__ = [
710 'SingleVariantProteinDataLoader' ,
811 'SingleVariantUTRDataLoader' ,
912]
13+ deps = Dependencies (
14+ conda = [
15+ 'bioconda::pybedtools' ,
16+ 'bioconda::pyfaidx' ,
17+ 'bioconda::pyranges' ,
18+ 'bioconda::biopython' ,
19+ 'numpy' ,
20+ 'pandas' ,
21+ ],
22+ pip = ['kipoiseq' ]
23+ )
24+ package_authors = [
25+ Author (name = 'Florian R. Hölzlwimmer' , github = 'hoeze' ),
26+ Author (name = 'Kalin Nonchev' , github = 'KalinNonchev' )
27+ ]
1028
1129
1230class SingleVariantProteinDataLoader (SampleIterator ):
31+ """
32+ info:
33+ doc: >
34+ Dataloader for protein sequence models. With inputs as gtf annotation file, fasta file and VCF file,
35+ each output is the reference protein sequence of a transcript together with the alternative
36+ protein sequence carrying a single variant. Returned sequences are of the type np.array([str])
37+ type: SampleIterator
38+ args:
39+ gtf_file:
40+ doc: file path; Genome annotation GTF file
41+ example:
42+ url: https://github.com/kipoi/kipoiseq/blob/ddeb4eefc15ebf8a9b88fca4ce99d9b315d54f34/tests/data/chr22_ENST00000319363.gtf?raw=true
43+ md5: 8a1f158e17379773fcab21628fc3910f
44+ fasta_file:
45+ doc: Reference Genome sequence in fasta format
46+ example:
47+ url: https://github.com/kipoi/kipoiseq/blob/ddeb4eefc15ebf8a9b88fca4ce99d9b315d54f34/tests/data/chr22.fa.gz?raw=true
48+ md5: 5ebe034256ecc5689989a96387c5a65e
49+ vcf_file:
50+ doc: Genomic variants to evaluate in VCF format
51+ example:
52+ url: https://github.com/kipoi/kipoiseq/blob/ddeb4eefc15ebf8a9b88fca4ce99d9b315d54f34/tests/data/chr22_ENST00000319363.vcf.gz?raw=true
53+ md5: c45e75fb75326c2be514d2dcea52e585
54+ output_schema:
55+ inputs:
56+ ref_seq:
57+ name: ref_seq
58+ shape: ()
59+ special_type: DNAStringSeq
60+ doc: reference protein sequence
61+ associated_metadata: ranges
62+ alt_seq:
63+ name: alt_seq
64+ doc: alternative protein sequence carrying the variant
65+ shape: ()
66+ special_type: DNAStringSeq
67+ associated_metadata: ranges, variants
68+ metadata:
69+ transcript_id:
70+ type: str
71+ doc: transcript id
72+ variant:
73+ chrom:
74+ type: str
75+ doc: chromosome of variant
76+ pos:
77+ type: int
78+ doc: variant position
79+ ref:
80+ type: str
81+ doc: variant reference
82+ alt:
83+ type: str
84+ doc: variant alternative string
85+ str:
86+ type: str
87+ doc: string representation of the variant
88+ """
1389
1490 def __init__ (self , gtf_file , fasta_file , vcf_file ):
1591 self .protein_vcf_extractor = SingleVariantProteinVCFSeqExtractor (
1692 gtf_file , fasta_file , vcf_file )
17- cds = self .protein_vcf_extractor .cds
18- # only needed metadata
19- self .metadatas = (
20- (
21- cds .loc [~ cds .index .duplicated (keep = 'first' )]
22- ).drop (columns = ['Start' , 'End' ])
23- )
93+
94+ # # only needed metadata
95+ # cds = self.protein_vcf_extractor.cds
96+ # self.metadatas = (
97+ # (
98+ # cds.loc[~cds.index.duplicated(keep='first')]
99+ # ).drop(columns=['Start', 'End'])
100+ # )
101+
24102 # generator for all sequences with variants
25103 self .sequences = self ._extractor ()
26104
@@ -35,7 +113,7 @@ def _extractor(self):
35113 return ref_seq, alt_seq, metadata for all
36114 transcript_ids with variants
37115 Returns: {
38- 'input ': {
116+ 'inputs ': {
39117 'ref_seq': ref_seq,
40118 'alt_seq': alt_seq,
41119 },
@@ -46,34 +124,145 @@ def _extractor(self):
46124 for transcript_id , (ref_seq , alt_seqs ) in self .protein_vcf_extractor .extract_all ():
47125 for (alt_seq , variant ) in alt_seqs :
48126 yield {
49- 'input ' : {
127+ 'inputs ' : {
50128 'ref_seq' : ref_seq ,
51129 'alt_seq' : alt_seq ,
52130 },
53131 'metadata' : self .get_metadata (transcript_id , variant )
54132 }
55133
134+ # def get_metadata(self, transcript_id: str, variant: dict):
135+ # """
136+ # get metadata for given transcript_id
137+ # """
138+ # row = self.metadatas.loc[transcript_id]
139+ # metadata = self.metadatas.loc[transcript_id].to_dict()
140+ # metadata['transcript_id'] = row.name
141+ # metadata['variants'] = variant
142+ # return metadata
143+
56144 def get_metadata (self , transcript_id : str , variant : dict ):
57145 """
58146 get metadata for given transcript_id
59147 """
60- row = self .metadatas .loc [transcript_id ]
61- metadata = self .metadatas .loc [transcript_id ].to_dict ()
62- metadata ['transcript_id' ] = row .name
63- metadata ['variants' ] = variant
148+ metadata = dict ()
149+ metadata ['transcript_id' ] = transcript_id
150+ variant_str_repr = ":" .join ([
151+ variant ["chrom" ],
152+ str (variant ["pos" ]),
153+ variant ["ref" ],
154+ variant ["alt" ],
155+ ])
156+ metadata ['variant' ] = {
157+ "chrom" : variant ["chrom" ],
158+ "pos" : variant ["pos" ],
159+ "ref" : variant ["ref" ],
160+ "alt" : variant ["alt" ],
161+ "id" : variant ['id' ] if "id" in variant else variant_str_repr ,
162+ "str" : variant_str_repr
163+ }
64164 return metadata
65165
66166
167+ @kipoi_dataloader (override = {"dependencies" : deps , 'info.authors' : package_authors })
67168class SingleVariantUTRDataLoader (SampleIterator ):
169+ """
170+ info:
171+ doc: >
172+ Dataloader for UTR sequence models. With inputs as gtf annotation file, fasta file and VCF file,
173+ each output is the reference 5' or 3' UTR sequence of a transcript together with the alternative
174+ sequence carrying a single variant. Returned sequences are of the type np.array([str])
175+ type: SampleIterator
176+ args:
177+ gtf_file:
178+ doc: file path; Genome annotation GTF file
179+ example:
180+ url: https://github.com/kipoi/kipoiseq/blob/ddeb4eefc15ebf8a9b88fca4ce99d9b315d54f34/tests/data/chr22_ENST00000319363.gtf?raw=true
181+ md5: 8a1f158e17379773fcab21628fc3910f
182+ name: gtf_file.gtf
183+ fasta_file:
184+ doc: Reference Genome sequence in fasta format
185+ example:
186+ url: https://github.com/kipoi/kipoiseq/blob/ddeb4eefc15ebf8a9b88fca4ce99d9b315d54f34/tests/data/chr22.fa.gz?raw=true
187+ md5: 5ebe034256ecc5689989a96387c5a65e
188+ name: fasta_file.fa.gz
189+ vcf_file:
190+ doc: Genomic variants to evaluate in VCF format
191+ example:
192+ url: https://github.com/kipoi/kipoiseq/blob/ddeb4eefc15ebf8a9b88fca4ce99d9b315d54f34/tests/data/chr22_ENST00000319363.vcf.gz?raw=true
193+ md5: c45e75fb75326c2be514d2dcea52e585
194+ name: vcf_file.vcf.gz
195+ vcf_file_tbi:
196+ doc: tabix index of vcf (just to make kipoi tests work - leave as None in normal usage)
197+ example:
198+ url: https://github.com/kipoi/kipoiseq/blob/ddeb4eefc15ebf8a9b88fca4ce99d9b315d54f34/tests/data/chr22_ENST00000319363.vcf.gz.tbi?raw=true
199+ md5: 9aebc88287a3d6b8517ace9e0fc427af
200+ name: vcf_file.vcf.gz.tbi
201+ feature_type:
202+ doc: Either 5UTR or 3UTR
203+ example: 5UTR
204+ type: str
205+ infer_from_cds:
206+ doc: infer UTR regions from coding sequence
207+ optional: True
208+ default: False
209+ example: False
210+ type: bool
211+ on_error_warn:
212+ doc: print warning instead of throwing an error on malformed input
213+ optional: True
214+ default: True
215+ example: True
216+ type: bool
217+ output_schema:
218+ inputs:
219+ ref_seq:
220+ name: ref_seq
221+ shape: ()
222+ special_type: DNAStringSeq
223+ doc: reference sequence of UTR
224+ associated_metadata: ranges
225+ alt_seq:
226+ name: alt_seq
227+ doc: alternative sequence of UTR
228+ shape: ()
229+ special_type: DNAStringSeq
230+ associated_metadata: ranges, variants
231+ metadata:
232+ transcript_id:
233+ type: str
234+ doc: transcript id
235+ variant:
236+ chrom:
237+ type: str
238+ doc: chromosome of variant
239+ pos:
240+ type: int
241+ doc: variant position
242+ ref:
243+ type: str
244+ doc: variant reference
245+ alt:
246+ type: str
247+ doc: variant alternative string
248+ id:
249+ type: str
250+ doc: variant id
251+ str:
252+ type: str
253+ doc: string representation of the variant
254+ """
68255
69256 def __init__ (
70257 self ,
71258 gtf_file ,
72259 fasta_file ,
73260 vcf_file ,
74- feature_type = "5UTR" ,
261+ feature_type ,
75262 infer_from_cds = False ,
76263 on_error_warn = True ,
264+ vcf_file_tbi = None ,
265+ ** kwargs
77266 ):
78267 self .gtf_file = gtf_file
79268 self .fasta_file = fasta_file
@@ -106,12 +295,12 @@ def __init__(
106295 multi_sample_VCF = self .multi_sample_VCF ,
107296 )
108297
109- # only needed metadata
110- self .metadatas = (
111- (
112- df .loc [~ df .index .duplicated (keep = 'first' )]
113- ).drop (columns = ['Start' , 'End' ])
114- )
298+ # # only needed metadata
299+ # self.metadatas = (
300+ # (
301+ # df.loc[~df.index.duplicated(keep='first')]
302+ # ).drop(columns=['Start', 'End'])
303+ # )
115304 # generator for all sequences with variants
116305 self .sequences = self ._extractor ()
117306
@@ -137,7 +326,7 @@ def _extractor(self):
137326 for transcript_id , (ref_seq , alt_seqs ) in self .extractor .items ():
138327 for (alt_seq , variant ) in alt_seqs :
139328 yield {
140- 'input ' : {
329+ 'inputs ' : {
141330 'ref_seq' : ref_seq ,
142331 'alt_seq' : alt_seq ,
143332 },
@@ -148,8 +337,20 @@ def get_metadata(self, transcript_id: str, variant: dict):
148337 """
149338 get metadata for given transcript_id
150339 """
151- row = self .metadatas .loc [transcript_id ]
152- metadata = self .metadatas .loc [transcript_id ].to_dict ()
153- metadata ['transcript_id' ] = row .name
154- metadata ['variants' ] = variant
340+ metadata = dict ()
341+ metadata ['transcript_id' ] = transcript_id
342+ variant_str_repr = ":" .join ([
343+ variant ["chrom" ],
344+ str (variant ["pos" ]),
345+ variant ["ref" ],
346+ variant ["alt" ],
347+ ])
348+ metadata ['variant' ] = {
349+ "chrom" : variant ["chrom" ],
350+ "pos" : variant ["pos" ],
351+ "ref" : variant ["ref" ],
352+ "alt" : variant ["alt" ],
353+ "id" : variant ['id' ] if "id" in variant else variant_str_repr ,
354+ "str" : variant_str_repr
355+ }
155356 return metadata
0 commit comments