Skip to content

Commit 4f774c5

Browse files
committed
Add string annotation to UTR dataloader; Fix some wrong metadata
1 parent 54dbc20 commit 4f774c5

File tree

2 files changed

+277
-76
lines changed

2 files changed

+277
-76
lines changed

kipoiseq/dataloaders/protein.py

Lines changed: 227 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,104 @@
11
from kipoiseq.extractors import SingleVariantProteinVCFSeqExtractor, TranscriptSeqExtractor
22
from kipoiseq.extractors import GenericMultiIntervalSeqExtractor, GenericSingleVariantMultiIntervalVCFSeqExtractor, \
33
FastaStringExtractor, UTRFetcher, MultiSampleVCF, SingleVariantMatcher
4-
from kipoi.data import SampleIterator
4+
5+
from kipoi.data import SampleIterator, kipoi_dataloader
6+
from kipoi_conda.dependencies import Dependencies
7+
from kipoi.specs import Author
58

69
__all__ = [
710
'SingleVariantProteinDataLoader',
811
'SingleVariantUTRDataLoader',
912
]
13+
deps = Dependencies(
14+
conda=[
15+
'bioconda::pybedtools',
16+
'bioconda::pyfaidx',
17+
'bioconda::pyranges',
18+
'bioconda::biopython',
19+
'numpy',
20+
'pandas',
21+
],
22+
pip=['kipoiseq']
23+
)
24+
package_authors = [
25+
Author(name='Florian R. Hölzlwimmer', github='hoeze'),
26+
Author(name='Kalin Nonchev', github='KalinNonchev')
27+
]
1028

1129

1230
class SingleVariantProteinDataLoader(SampleIterator):
31+
"""
32+
info:
33+
doc: >
34+
Dataloader for protein sequence models. With inputs as gtf annotation file, fasta file and vcf file,
35+
each output is the reference protein sequence of a transcript together with one alternative
36+
sequence carrying a single variant. Returned sequences are of the type np.array([str])
37+
type: SampleIterator
38+
args:
39+
gtf_file:
40+
doc: file path; Genome annotation GTF file
41+
example:
42+
url: https://github.com/kipoi/kipoiseq/blob/ddeb4eefc15ebf8a9b88fca4ce99d9b315d54f34/tests/data/chr22_ENST00000319363.gtf?raw=true
43+
md5: 8a1f158e17379773fcab21628fc3910f
44+
fasta_file:
45+
doc: Reference Genome sequence in fasta format
46+
example:
47+
url: https://github.com/kipoi/kipoiseq/blob/ddeb4eefc15ebf8a9b88fca4ce99d9b315d54f34/tests/data/chr22.fa.gz?raw=true
48+
md5: 5ebe034256ecc5689989a96387c5a65e
49+
vcf_file:
50+
doc: Genomic variants to evaluate in VCF format
51+
example:
52+
url: https://github.com/kipoi/kipoiseq/blob/ddeb4eefc15ebf8a9b88fca4ce99d9b315d54f34/tests/data/chr22_ENST00000319363.vcf.gz?raw=true
53+
md5: c45e75fb75326c2be514d2dcea52e585
54+
output_schema:
55+
inputs:
56+
ref_seq:
57+
name: ref_seq
58+
shape: ()
59+
special_type: DNAStringSeq
60+
doc: reference protein sequence
61+
associated_metadata: ranges
62+
alt_seq:
63+
name: alt_seq
64+
doc: alternative protein sequence
65+
shape: ()
66+
special_type: DNAStringSeq
67+
associated_metadata: ranges, variants
68+
metadata:
69+
transcript_id:
70+
type: str
71+
doc: transcript id
72+
variant:
73+
CHROM:
74+
type: str
75+
doc: chromosome of variant
76+
POS:
77+
type: int
78+
doc: variant position
79+
REF:
80+
type: str
81+
doc: variant reference
82+
ALT:
83+
type: str
84+
doc: variant alternative string
85+
STR:
86+
type: str
87+
doc: string representation of the variant
88+
"""
1389

1490
def __init__(self, gtf_file, fasta_file, vcf_file):
1591
self.protein_vcf_extractor = SingleVariantProteinVCFSeqExtractor(
1692
gtf_file, fasta_file, vcf_file)
17-
cds = self.protein_vcf_extractor.cds
18-
# only needed metadata
19-
self.metadatas = (
20-
(
21-
cds.loc[~cds.index.duplicated(keep='first')]
22-
).drop(columns=['Start', 'End'])
23-
)
93+
94+
# # only needed metadata
95+
# cds = self.protein_vcf_extractor.cds
96+
# self.metadatas = (
97+
# (
98+
# cds.loc[~cds.index.duplicated(keep='first')]
99+
# ).drop(columns=['Start', 'End'])
100+
# )
101+
24102
# generator for all sequences with variants
25103
self.sequences = self._extractor()
26104

@@ -35,7 +113,7 @@ def _extractor(self):
35113
return ref_seq, alt_seq, metadata for all
36114
transcript_ids with variants
37115
Returns: {
38-
'input': {
116+
'inputs': {
39117
'ref_seq': ref_seq,
40118
'alt_seq': alt_seq,
41119
},
@@ -46,34 +124,145 @@ def _extractor(self):
46124
for transcript_id, (ref_seq, alt_seqs) in self.protein_vcf_extractor.extract_all():
47125
for (alt_seq, variant) in alt_seqs:
48126
yield {
49-
'input': {
127+
'inputs': {
50128
'ref_seq': ref_seq,
51129
'alt_seq': alt_seq,
52130
},
53131
'metadata': self.get_metadata(transcript_id, variant)
54132
}
55133

134+
# def get_metadata(self, transcript_id: str, variant: dict):
135+
# """
136+
# get metadata for given transcript_id
137+
# """
138+
# row = self.metadatas.loc[transcript_id]
139+
# metadata = self.metadatas.loc[transcript_id].to_dict()
140+
# metadata['transcript_id'] = row.name
141+
# metadata['variants'] = variant
142+
# return metadata
143+
56144
def get_metadata(self, transcript_id: str, variant: dict):
57145
"""
58146
get metadata for given transcript_id
59147
"""
60-
row = self.metadatas.loc[transcript_id]
61-
metadata = self.metadatas.loc[transcript_id].to_dict()
62-
metadata['transcript_id'] = row.name
63-
metadata['variants'] = variant
148+
metadata = dict()
149+
metadata['transcript_id'] = transcript_id
150+
variant_str_repr = ":".join([
151+
variant["chrom"],
152+
str(variant["pos"]),
153+
variant["ref"],
154+
variant["alt"],
155+
])
156+
metadata['variant'] = {
157+
"chrom": variant["chrom"],
158+
"pos": variant["pos"],
159+
"ref": variant["ref"],
160+
"alt": variant["alt"],
161+
"id": variant['id'] if "id" in variant else variant_str_repr,
162+
"str": variant_str_repr
163+
}
64164
return metadata
65165

66166

167+
@kipoi_dataloader(override={"dependencies": deps, 'info.authors': package_authors})
67168
class SingleVariantUTRDataLoader(SampleIterator):
169+
"""
170+
info:
171+
doc: >
172+
Dataloader for UTR sequence models. With inputs as gtf annotation file, fasta file and vcf file,
172+
each output is the reference UTR sequence (5' or 3', selected by feature_type) of a transcript
173+
together with one alternative sequence carrying a single variant. Returned sequences are of the type np.array([str])
175+
type: SampleIterator
176+
args:
177+
gtf_file:
178+
doc: file path; Genome annotation GTF file
179+
example:
180+
url: https://github.com/kipoi/kipoiseq/blob/ddeb4eefc15ebf8a9b88fca4ce99d9b315d54f34/tests/data/chr22_ENST00000319363.gtf?raw=true
181+
md5: 8a1f158e17379773fcab21628fc3910f
182+
name: gtf_file.gtf
183+
fasta_file:
184+
doc: Reference Genome sequence in fasta format
185+
example:
186+
url: https://github.com/kipoi/kipoiseq/blob/ddeb4eefc15ebf8a9b88fca4ce99d9b315d54f34/tests/data/chr22.fa.gz?raw=true
187+
md5: 5ebe034256ecc5689989a96387c5a65e
188+
name: fasta_file.fa.gz
189+
vcf_file:
190+
doc: Genomic variants to evaluate in VCF format
191+
example:
192+
url: https://github.com/kipoi/kipoiseq/blob/ddeb4eefc15ebf8a9b88fca4ce99d9b315d54f34/tests/data/chr22_ENST00000319363.vcf.gz?raw=true
193+
md5: c45e75fb75326c2be514d2dcea52e585
194+
name: vcf_file.vcf.gz
195+
vcf_file_tbi:
196+
doc: tabix index of vcf (just to make kipoi tests work - leave as None in normal usage)
197+
example:
198+
url: https://github.com/kipoi/kipoiseq/blob/ddeb4eefc15ebf8a9b88fca4ce99d9b315d54f34/tests/data/chr22_ENST00000319363.vcf.gz.tbi?raw=true
199+
md5: 9aebc88287a3d6b8517ace9e0fc427af
200+
name: vcf_file.vcf.gz.tbi
201+
feature_type:
202+
doc: Either 5UTR or 3UTR
203+
example: 5UTR
204+
type: str
205+
infer_from_cds:
206+
doc: infer UTR regions from coding sequence
207+
optional: True
208+
default: False
209+
example: False
210+
type: bool
211+
on_error_warn:
212+
doc: print warning instead of throwing an error on malformed input
213+
optional: True
214+
default: True
215+
example: True
216+
type: bool
217+
output_schema:
218+
inputs:
219+
ref_seq:
220+
name: ref_seq
221+
shape: ()
222+
special_type: DNAStringSeq
223+
doc: reference sequence of UTR
224+
associated_metadata: ranges
225+
alt_seq:
226+
name: alt_seq
227+
doc: alternative sequence of UTR
228+
shape: ()
229+
special_type: DNAStringSeq
230+
associated_metadata: ranges, variants
231+
metadata:
232+
transcript_id:
233+
type: str
234+
doc: transcript id
235+
variant:
236+
chrom:
237+
type: str
238+
doc: chromosome of variant
239+
pos:
240+
type: int
241+
doc: variant position
242+
ref:
243+
type: str
244+
doc: variant reference
245+
alt:
246+
type: str
247+
doc: variant alternative string
248+
id:
249+
type: str
250+
doc: variant id
251+
str:
252+
type: str
253+
doc: string representation of the variant
254+
"""
68255

69256
def __init__(
70257
self,
71258
gtf_file,
72259
fasta_file,
73260
vcf_file,
74-
feature_type="5UTR",
261+
feature_type,
75262
infer_from_cds=False,
76263
on_error_warn=True,
264+
vcf_file_tbi=None,
265+
**kwargs
77266
):
78267
self.gtf_file = gtf_file
79268
self.fasta_file = fasta_file
@@ -106,12 +295,12 @@ def __init__(
106295
multi_sample_VCF=self.multi_sample_VCF,
107296
)
108297

109-
# only needed metadata
110-
self.metadatas = (
111-
(
112-
df.loc[~df.index.duplicated(keep='first')]
113-
).drop(columns=['Start', 'End'])
114-
)
298+
# # only needed metadata
299+
# self.metadatas = (
300+
# (
301+
# df.loc[~df.index.duplicated(keep='first')]
302+
# ).drop(columns=['Start', 'End'])
303+
# )
115304
# generator for all sequences with variants
116305
self.sequences = self._extractor()
117306

@@ -137,7 +326,7 @@ def _extractor(self):
137326
for transcript_id, (ref_seq, alt_seqs) in self.extractor.items():
138327
for (alt_seq, variant) in alt_seqs:
139328
yield {
140-
'input': {
329+
'inputs': {
141330
'ref_seq': ref_seq,
142331
'alt_seq': alt_seq,
143332
},
@@ -148,8 +337,20 @@ def get_metadata(self, transcript_id: str, variant: dict):
148337
"""
149338
get metadata for given transcript_id
150339
"""
151-
row = self.metadatas.loc[transcript_id]
152-
metadata = self.metadatas.loc[transcript_id].to_dict()
153-
metadata['transcript_id'] = row.name
154-
metadata['variants'] = variant
340+
metadata = dict()
341+
metadata['transcript_id'] = transcript_id
342+
variant_str_repr = ":".join([
343+
variant["chrom"],
344+
str(variant["pos"]),
345+
variant["ref"],
346+
variant["alt"],
347+
])
348+
metadata['variant'] = {
349+
"chrom": variant["chrom"],
350+
"pos": variant["pos"],
351+
"ref": variant["ref"],
352+
"alt": variant["alt"],
353+
"id": variant['id'] if "id" in variant else variant_str_repr,
354+
"str": variant_str_repr
355+
}
155356
return metadata

0 commit comments

Comments
 (0)