Skip to content

Commit e26b3a2

Browse files
IBCHgenomics
final release.
1 parent c1d1d4a commit e26b3a2

File tree

6 files changed

+690
-152
lines changed

6 files changed

+690
-152
lines changed

README.md

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,42 @@ Options:
2929
-V, --version Print version
3030
```
3131

32+
- Annotate only the ref variant with A
33+
```
34+
./target/debug/varlinker variant-trefanno ./sample-files/sample.vcf A
35+
```
36+
37+
- Annotate only the alt variant with A
38+
```
39+
./target/debug/varlinker variant-taltanno ./sample-files/sample.vcf T
40+
```
41+
- Annotate all the variants in the vcf
42+
```
43+
./target/debug/varlinker variant-linker ./sample-files/sample.vcf
44+
```
45+
46+
- it will produce three output files classifying variant annotation to gene, exon and transcript level.
47+
48+
```
49+
==> annotationfile-exon.txt <==
50+
19 111 . A C exon ENSG00000284900.2 KBTBD4
51+
19 111 . A C exon ENSG00000284900.2 KBTBD4
52+
19 111 . A C exon ENSG00000284900.2 KBTBD4
53+
19 111 . A C exon ENSG00000284900.2 KBTBD4
54+
55+
==> annotationfile-gene.txt <==
56+
19 111 . A C gene ENSG00000284900.2 KBTBD4
57+
19 111 . A C gene ENSG00000278550.4 SLC43A2
58+
19 111 . A C gene ENSG00000264450.1 ENSG00000264450
59+
19 111 . A C gene ENSG00000273929.1 U6
60+
61+
==> annotationfile-transcript.txt <==
62+
19 111 . A C transcript ENSG00000284900.2 KBTBD4
63+
19 111 . A C transcript ENSG00000284900.2 KBTBD4
64+
19 111 . A C transcript ENSG00000284900.2 KBTBD4
65+
19 111 . A C transcript ENSG00000284900.2 KBTBD4
66+
```
67+
3268
- Acknowledgements: MOSAIC platform, developed as part of the ECBiG-MOSAIC project (POIR.04.02.00-00-D017/20), co-financed by the European Regional Development Fund (ERDF) under the Smart Growth Operational Programme 2014-2020, Measure 4.2 for the development of modern research infrastructure in the science sector.
3369
- Project PI and Informal queries: Prof. Luiza Handschuh: [email protected].
3470
- Code related queries: Dr. Gaurav Sablok: [email protected].

sample-files/sample.vcf

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,5 +28,3 @@
2828
20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:.:56,60 0|0:48:4:51,51 0/0:61:2:.,.
2929
20 1234567 microsat1 G GA,GAC 50 PASS NS=3;DP=9;AA=G;AN=6;AC=3,1 GT:GQ:DP 0/1:.:4 0/2:17:2 1/1:40:3
3030
20 1235237 . T . . . . GT 0/0 0|0 ./.
31-
X 10 rsTest AC A,ATG 10 PASS . GT 0 0/1 0|2
32-

src/store.rs

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,21 +10,43 @@ pub struct VCF {
1010
}
1111

1212
#[derive(Debug, Clone, PartialOrd, PartialEq)]
13+
pub struct GenCodeGene {
14+
pub chrom: String,
15+
pub typeannotate: String,
16+
pub start: usize,
17+
pub stop: usize,
18+
pub geneid: String,
19+
pub genename: String,
20+
}
21+
22+
#[derive(Debug, Clone, PartialOrd, PartialEq)]
23+
pub struct GenCodeExon {
24+
pub chrom: String,
25+
pub typeannotate: String,
26+
pub start: usize,
27+
pub stop: usize,
28+
pub geneid: String,
29+
pub genename: String,
30+
}
1331

14-
pub struct GenCode {
32+
#[derive(Debug, Clone, PartialOrd, PartialEq)]
33+
pub struct GenCodeTranscript {
1534
pub chrom: String,
1635
pub typeannotate: String,
1736
pub start: usize,
1837
pub stop: usize,
38+
pub geneid: String,
1939
pub genename: String,
2040
}
2141

42+
#[derive(Debug, Clone, PartialOrd, PartialEq)]
2243
pub struct OUTPUT {
2344
pub chrom: String,
2445
pub pos: String,
2546
pub id: String,
2647
pub refnuc: String,
2748
pub altnuc: String,
2849
pub typeannotate: String,
50+
pub geneid: String,
2951
pub genename: String,
3052
}

src/varaltannot.rs

Lines changed: 209 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1+
use crate::store::OUTPUT;
12
use crate::store::VCF;
2-
use crate::store::{GenCode, OUTPUT};
3+
use crate::store::{GenCodeExon, GenCodeGene, GenCodeTranscript};
34
use async_std::task;
45
use std::error::Error;
56
use std::fs::File;
@@ -16,7 +17,7 @@ use std::process::Command;
1617
*/
1718

1819
pub async fn varaltanno(pathfile: &str, variant: &str) -> Result<String, Box<dyn Error>> {
19-
let _ = Command::new("wegt").
20+
let _ = Command::new("wget").
2021
arg("https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_48/gencode.v48.chr_patch_hapl_scaff.annotation.gtf.gz")
2122
.output()
2223
.expect("command failed");
@@ -26,82 +27,240 @@ pub async fn varaltanno(pathfile: &str, variant: &str) -> Result<String, Box<dyn
2627
.expect("command failed");
2728
let fileopen = File::open(pathfile).expect("file not present");
2829
let fileread = BufReader::new(fileopen);
29-
let gtfresults: Vec<GenCode> =
30-
task::block_on(gtfread("gencode.v48.chr_patch_hapl_scaff.annotation.gtf")).unwrap();
30+
let gtfresults_gene: Vec<GenCodeGene> = task::block_on(gtfread_gene_annotation(
31+
"gencode.v48.chr_patch_hapl_scaff.annotation.gtf",
32+
))
33+
.unwrap();
34+
let gtfresults_exon: Vec<GenCodeExon> = task::block_on(gtfread_exon_annotation(
35+
"gencode.v48.chr_patch_hapl_scaff.annotation.gtf",
36+
))
37+
.unwrap();
38+
let gtfresults_transcript: Vec<GenCodeTranscript> = task::block_on(
39+
gtfread_transcript_annotation("gencode.v48.chr_patch_hapl_scaff.annotation.gtf"),
40+
)
41+
.unwrap();
3142
let mut vcstring_file: Vec<VCF> = Vec::new();
3243
for i in fileread.lines() {
3344
let linevcf = i.expect("file not present");
34-
let linevec: Vec<String> = linevcf
35-
.split("\t")
36-
.map(|x| x.to_string())
37-
.collect::<Vec<_>>();
38-
vcstring_file.push(VCF {
39-
chrom: linevec[0].to_string(),
40-
pos: linevec[1].parse::<usize>().unwrap(),
41-
id: linevec[2].to_string(),
42-
refnuc: linevec[3].to_string(),
43-
altnuc: linevec[4].to_string(),
44-
qual: linevec[5].to_string(),
45-
});
45+
if !linevcf.starts_with("#") {
46+
let linevec: Vec<String> = linevcf
47+
.split("\t")
48+
.map(|x| x.to_string())
49+
.collect::<Vec<_>>();
50+
vcstring_file.push(VCF {
51+
chrom: linevec[0].to_string(),
52+
pos: linevec[1].parse::<usize>().unwrap(),
53+
id: linevec[2].to_string(),
54+
refnuc: linevec[3].to_string(),
55+
altnuc: linevec[4].to_string(),
56+
qual: linevec[5].to_string(),
57+
});
58+
}
4659
}
4760

48-
let mut output: Vec<OUTPUT> = Vec::new();
61+
let mut output_gene: Vec<OUTPUT> = Vec::new();
62+
let mut output_exon: Vec<OUTPUT> = Vec::new();
63+
let mut output_transcript: Vec<OUTPUT> = Vec::new();
64+
4965
for i in vcstring_file.iter() {
50-
for j in gtfresults.iter() {
51-
if i.pos > j.start && i.pos <= j.stop && j.typeannotate == variant {
52-
output.push(OUTPUT {
66+
for j in gtfresults_gene.iter() {
67+
if i.pos > j.start && i.pos <= j.stop && i.altnuc == variant {
68+
output_gene.push(OUTPUT {
5369
chrom: i.chrom.clone(),
5470
pos: i.pos.clone().to_string(),
5571
id: i.id.clone(),
56-
refnuc: i.id.clone(),
72+
refnuc: i.refnuc.clone(),
5773
altnuc: i.altnuc.clone(),
5874
typeannotate: j.typeannotate.clone(),
75+
geneid: j.geneid.clone(),
5976
genename: j.genename.clone(),
6077
});
6178
}
6279
}
6380
}
6481

65-
let mut mutwrite = File::create("annotationfile.txt").expect("file not present");
66-
writeln!(
67-
mutwrite,
68-
"{}\t{}\t{}\t{}\t{}\t{}\t{}",
69-
"chrom", "pos", "id", "refnuc", "altnuc", "typeannotate", "genename"
70-
)
71-
.expect("file not found");
72-
for i in output.iter() {
82+
for i in vcstring_file.iter() {
83+
for j in gtfresults_exon.iter() {
84+
if i.pos > j.start && i.pos <= j.stop && i.altnuc == variant {
85+
output_exon.push(OUTPUT {
86+
chrom: i.chrom.clone(),
87+
pos: i.pos.clone().to_string(),
88+
id: i.id.clone(),
89+
refnuc: i.refnuc.clone(),
90+
altnuc: i.altnuc.clone(),
91+
typeannotate: j.typeannotate.clone(),
92+
geneid: j.geneid.clone(),
93+
genename: j.genename.clone(),
94+
});
95+
}
96+
}
97+
}
98+
99+
for i in vcstring_file.iter() {
100+
for j in gtfresults_transcript.iter() {
101+
if i.pos > j.start && i.pos <= j.stop && i.altnuc == variant {
102+
output_transcript.push(OUTPUT {
103+
chrom: i.chrom.clone(),
104+
pos: i.pos.clone().to_string(),
105+
id: i.id.clone(),
106+
refnuc: i.refnuc.clone(),
107+
altnuc: i.altnuc.clone(),
108+
typeannotate: j.typeannotate.clone(),
109+
geneid: j.geneid.clone(),
110+
genename: j.genename.clone(),
111+
});
112+
}
113+
}
114+
}
115+
116+
let mut mutwrite_gene = File::create("annotationfile-gene.txt").expect("file not present");
117+
let mut mutwrite_exon = File::create("annotationfile-exon.txt").expect("file not present");
118+
let mut mutwrite_transcript =
119+
File::create("annotationfile-transcript.txt").expect("file not present");
120+
121+
for i in output_gene.iter() {
73122
writeln!(
74-
mutwrite,
75-
"{}\t{}\t{}\t{}\t{}\t{}\t{}",
76-
i.chrom, i.pos, i.id, i.refnuc, i.altnuc, i.typeannotate, i.genename
123+
mutwrite_gene,
124+
"{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}",
125+
i.chrom, i.pos, i.id, i.refnuc, i.altnuc, i.typeannotate, i.geneid, i.genename
77126
)
78127
.expect("line not found");
79128
}
129+
130+
for i in output_exon.iter() {
131+
writeln!(
132+
mutwrite_exon,
133+
"{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}",
134+
i.chrom, i.pos, i.id, i.refnuc, i.altnuc, i.typeannotate, i.geneid, i.genename
135+
)
136+
.expect("line not found");
137+
}
138+
139+
for i in output_transcript.iter() {
140+
writeln!(
141+
mutwrite_transcript,
142+
"{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}",
143+
i.chrom, i.pos, i.id, i.refnuc, i.altnuc, i.typeannotate, i.geneid, i.genename
144+
)
145+
.expect("line not found");
146+
}
147+
80148
Ok("The regions have been annotated".to_string())
81149
}
82150

83-
pub async fn gtfread(gtffile: &str) -> Result<Vec<GenCode>, Box<dyn Error>> {
151+
pub async fn gtfread_gene_annotation(gtffile: &str) -> Result<Vec<GenCodeGene>, Box<dyn Error>> {
84152
let fileopen = File::open(gtffile).expect("file not found");
85153
let fileread = BufReader::new(fileopen);
86-
let mut gtf_vector: Vec<GenCode> = Vec::new();
154+
let mut gtf_vector: Vec<GenCodeGene> = Vec::new();
87155
for i in fileread.lines() {
88156
let linegtf = i.expect("line not found");
89-
let linevec: Vec<String> = linegtf
90-
.split("\t")
91-
.map(|x| x.to_string())
92-
.collect::<Vec<String>>();
93-
let linecollect: String = linevec[9].split(";").collect::<Vec<_>>()[2]
94-
.replace(" ", "-")
95-
.split("-")
96-
.collect::<Vec<_>>()[1]
97-
.to_string();
98-
gtf_vector.push(GenCode {
99-
chrom: linevec[0].clone(),
100-
typeannotate: linevec[2].clone(),
101-
start: linevec[3].parse::<usize>().unwrap(),
102-
stop: linevec[4].parse::<usize>().unwrap(),
103-
genename: linecollect,
104-
})
157+
if !linegtf.starts_with("#") {
158+
let linevec: Vec<String> = linegtf
159+
.split("\t")
160+
.map(|x| x.to_string())
161+
.collect::<Vec<String>>();
162+
if linevec[2] == "gene" {
163+
let linecollect: String = linevec[8].split(";").collect::<Vec<_>>()[2]
164+
.replace(" ", "-")
165+
.split("-")
166+
.collect::<Vec<_>>()[2]
167+
.to_string()
168+
.replace("\"", "");
169+
let genecollect: String = linevec[8].split(";").collect::<Vec<_>>()[0]
170+
.replace(" ", "-")
171+
.split("-")
172+
.collect::<Vec<_>>()[1]
173+
.to_string()
174+
.replace("\"", "");
175+
gtf_vector.push(GenCodeGene {
176+
chrom: linevec[0].clone(),
177+
typeannotate: linevec[2].clone(),
178+
start: linevec[3].parse::<usize>().unwrap(),
179+
stop: linevec[4].parse::<usize>().unwrap(),
180+
geneid: genecollect,
181+
genename: linecollect,
182+
})
183+
}
184+
}
185+
}
186+
Ok(gtf_vector)
187+
}
188+
189+
pub async fn gtfread_exon_annotation(gtffile: &str) -> Result<Vec<GenCodeExon>, Box<dyn Error>> {
190+
let fileopen = File::open(gtffile).expect("file not found");
191+
let fileread = BufReader::new(fileopen);
192+
let mut gtf_vector: Vec<GenCodeExon> = Vec::new();
193+
for i in fileread.lines() {
194+
let linegtf = i.expect("line not found");
195+
if !linegtf.starts_with("#") {
196+
let linevec: Vec<String> = linegtf
197+
.split("\t")
198+
.map(|x| x.to_string())
199+
.collect::<Vec<String>>();
200+
if linevec[2] == "exon" {
201+
let linecollect: String = linevec[8].split(";").collect::<Vec<_>>()[3]
202+
.replace(" ", "-")
203+
.split("-")
204+
.collect::<Vec<_>>()[2]
205+
.to_string()
206+
.replace("\"", "");
207+
let genecollect: String = linevec[8].split(";").collect::<Vec<_>>()[0]
208+
.replace(" ", "-")
209+
.split("-")
210+
.collect::<Vec<_>>()[1]
211+
.to_string()
212+
.replace("\"", "");
213+
214+
gtf_vector.push(GenCodeExon {
215+
chrom: linevec[0].clone(),
216+
typeannotate: linevec[2].clone(),
217+
start: linevec[3].parse::<usize>().unwrap(),
218+
stop: linevec[4].parse::<usize>().unwrap(),
219+
geneid: genecollect,
220+
genename: linecollect,
221+
})
222+
}
223+
}
224+
}
225+
Ok(gtf_vector)
226+
}
227+
228+
pub async fn gtfread_transcript_annotation(
229+
gtffile: &str,
230+
) -> Result<Vec<GenCodeTranscript>, Box<dyn Error>> {
231+
let fileopen = File::open(gtffile).expect("file not found");
232+
let fileread = BufReader::new(fileopen);
233+
let mut gtf_vector: Vec<GenCodeTranscript> = Vec::new();
234+
for i in fileread.lines() {
235+
let linegtf = i.expect("line not found");
236+
if !linegtf.starts_with("#") {
237+
let linevec: Vec<String> = linegtf
238+
.split("\t")
239+
.map(|x| x.to_string())
240+
.collect::<Vec<String>>();
241+
if linevec[2] == "transcript" {
242+
let linecollect: String = linevec[8].split(";").collect::<Vec<_>>()[3]
243+
.replace(" ", "-")
244+
.split("-")
245+
.collect::<Vec<_>>()[2]
246+
.to_string()
247+
.replace("\"", "");
248+
let genecollect: String = linevec[8].split(";").collect::<Vec<_>>()[0]
249+
.replace(" ", "-")
250+
.split("-")
251+
.collect::<Vec<_>>()[1]
252+
.to_string()
253+
.replace("\"", "");
254+
gtf_vector.push(GenCodeTranscript {
255+
chrom: linevec[0].clone(),
256+
typeannotate: linevec[2].clone(),
257+
start: linevec[3].parse::<usize>().unwrap(),
258+
stop: linevec[4].parse::<usize>().unwrap(),
259+
geneid: genecollect,
260+
genename: linecollect,
261+
})
262+
}
263+
}
105264
}
106265
Ok(gtf_vector)
107266
}

0 commit comments

Comments
 (0)