|
1 | | -# microBioRust |
| 1 | + <img src="docs/assets/MICROBIO B.svg" width="200" alt="microBioRust logo" /> |
| 2 | + |
| 3 | +[microBioRust documentation](https://lcrossman.github.io/microBioRust/) |
| 4 | + |
| 5 | + |
| 6 | + |
| 7 | + |
| 8 | + |
2 | 9 |
|
3 | 10 | ## A Rust bioinformatics crate aimed at Microbial genomics<br> |
4 | 11 |
|
5 | 12 | The aim of this crate is to provide Microbiology friendly Rust functions for bioinformatics.<br> |
6 | 13 |
|
7 | | -Very much under construction!<br> |
| 14 | +> Very much under construction!<br> |
8 | 15 |
|
9 | 16 | Some concepts with many thanks to Rust-bio<br> |
10 | | -Please see the Roadmap for further details [here](https://github.com/LCrossman/microBioRust/issues/18) |
| 17 | +Please see the Roadmap for further details [here](ROADMAP.md) |
11 | 18 |
|
12 | 19 | To install Rust, please see the official [Rust install](https://www.rust-lang.org/tools/install) page, or install it with Conda<br> |
13 | 20 | If you would like to contribute please follow the [Rust code of conduct](https://www.rust-lang.org/policies/code-of-conduct) |
14 | 21 |
|
15 | 22 | Questions and comments - please join the Discord server :) [here](https://discord.gg/xP2ngwTttz) |
16 | 23 |
|
17 | | -Currently there is functionality for:<br> |
18 | | - |
19 | | -# 1. A Genbank to GFF parser |
20 | | - |
21 | | -# 2. An Embl to GFF and GBK parser |
22 | | - |
23 | | -# 3. Calculate sequence metrics <i>e.g.</i> hydrophobicity, distance measures |
24 | | - |
25 | | -# 4. A Heatmap plot with wasm and d3.js |
26 | | - |
27 | | -To use a specific workspace (at the moment microSeqIO or heatmap) clone the project, cd into the specific directory required and build the project from there |
28 | 24 |
|
29 | | -for more background please see <https://LCrossman.github.io/microBioRust_details> |
30 | | - |
31 | | -In microBioRust: |
32 | | - |
33 | | -You can parse genbank files and save as a GFF (gff3) format as well as extracting DNA sequences, gene DNA sequences (ffn) and protein fasta sequences (faa) |
34 | | -Super simple way: |
35 | | - |
36 | | -```rust |
37 | | -pub fn genbank_to_faa() -> Result<(), anyhow::Error> { |
38 | | - let args = Arguments::parse(); |
39 | | - let records = genbank!(&args.filename); |
40 | | - for record in records.iter() { |
41 | | - for (k, v) in &record.cds.attributes { |
42 | | - if let Some(seq) = record.seq_features.get_sequence_faa(k) { |
43 | | - println!(">{}|{}\n{}", &record.id, &k, seq); |
44 | | - } |
45 | | - } |
46 | | - } |
47 | | - return Ok(()); |
48 | | -} |
49 | | - |
50 | | -``` |
51 | | - |
52 | | -Better for Debugging: |
53 | | - |
54 | | -```rust |
55 | | -pub fn genbank_to_faa() -> Result<(), anyhow::Error> { |
56 | | - let args: Vec<String> = env::args().collect(); |
57 | | - let config = Config::new(&args).unwrap_or_else(|err| { |
58 | | - println!("Problem with parsing file arguments: {}", err); |
59 | | - process::exit(1); |
60 | | - }); |
61 | | - let file_gbk = fs::File::open(config.filename)?; |
62 | | - let mut reader = Reader::new(file_gbk); |
63 | | - let mut records = reader.records(); |
64 | | - let mut cds_counter: u32 = 0; |
65 | | - loop { |
66 | | - //collect from each record advancing on a next record basis, count cds records |
67 | | - match records.next() { |
68 | | - Some(Ok(mut record)) => { |
69 | | - for (k, v) in &record.cds.attributes { |
70 | | - match record.seq_features.get_sequence_faa(&k) { |
71 | | - Some(value) => { |
72 | | - let seq_faa = value.to_string(); |
73 | | - println!(">{}|{}\n{}", &record.id, &k, seq_faa); |
74 | | - } |
75 | | - _ => (), |
76 | | - }; |
77 | | - } |
78 | | - cds_counter += 1; |
79 | | - } |
80 | | - Some(Err(e)) => { |
81 | | - println!("Error encountered - an err {:?}", e); |
82 | | - } |
83 | | - None => { |
84 | | - println!("finished iteration"); |
85 | | - break; |
86 | | - } |
87 | | - } |
88 | | - } |
89 | | - println!("Total records processed: {}", read_counter); |
90 | | - return Ok(()); |
91 | | -} |
92 | | -``` |
93 | | - |
94 | | -Example to save a provided multi- or single genbank file as a GFF file (by joining any multi-genbank) |
95 | | - |
96 | | -```rust |
97 | | -pub fn genbank_to_gff() -> io::Result<()> { |
98 | | - let args: Vec<String> = env::args().collect(); |
99 | | - let config = Config::new(&args).unwrap_or_else(|err| { |
100 | | - println!("Problem with parsing file arguments: {}", err); |
101 | | - process::exit(1); |
102 | | - }); |
103 | | - let file_gbk = fs::File::open(&config.filename)?; |
104 | | - let prev_start: u32 = 0; |
105 | | - let mut prev_end: u32 = 0; |
106 | | - let mut reader = Reader::new(file_gbk); |
107 | | - let mut records = reader.records(); |
108 | | - let mut read_counter: u32 = 0; |
109 | | - let mut seq_region: BTreeMap<String, (u32,u32)> = BTreeMap::new(); |
110 | | - let mut record_vec: Vec<Record> = Vec::new(); |
111 | | - loop { |
112 | | - match records.next() { |
113 | | - Some(Ok(mut record)) => { |
114 | | - //println!("next record"); |
115 | | - //println!("Record id: {:?}", record.id); |
116 | | - let source = record.source_map.source_name.clone().expect("issue collecting source name"); |
117 | | - let beginning = match record.source_map.get_start(&source) { |
118 | | - Some(value) => value.get_value(), |
119 | | - _ => 0, |
120 | | - }; |
121 | | - let ending = match record.source_map.get_stop(&source) { |
122 | | - Some(value) => value.get_value(), |
123 | | - _ => 0, |
124 | | - }; |
125 | | - if ending + prev_end < beginning + prev_end { |
126 | | - } |
127 | | - seq_region.insert(source, (beginning + prev_end, ending + prev_end)); |
128 | | - record_vec.push(record); |
129 | | - // Add additional fields to print if needed |
130 | | - read_counter+=1; |
131 | | - prev_end+=ending; // create the joined record if there are multiple |
132 | | - }, |
133 | | - Some(Err(e)) => { println!("theres an err {:?}", e); }, |
134 | | - None => { |
135 | | - println!("finished iteration"); |
136 | | - break; }, |
137 | | - } |
138 | | - } |
139 | | - let output_file = format!("{}.gff", &config.filename); |
140 | | - gff_write(seq_region.clone(), record_vec, &output_file, true); |
141 | | - println!("Total records processed: {}", read_counter); |
142 | | - return Ok(()); |
143 | | -} |
144 | | -``` |
145 | | - |
146 | | -Example to create a completely new record, use of setters or set_ functionality |
| 25 | +Currently there is functionality for:<br> |
| 26 | +```` |
| 27 | + 1. A Genbank to GFF parser |
147 | 28 |
|
148 | | -To write into GFF format requires gff_write(seq_region, record_vec, filename, true or false) |
| 29 | + 2. An Embl to GFF and GBK parser |
149 | 30 |
|
150 | | -The seq_region is the region of interest to save with name and DNA coordinates such as `seqregion.entry("source_1".to_string(), (1,897))` |
| 31 | + 3. Calculate sequence metrics e.g. hydrophobicity, distance measures |
151 | 32 |
|
152 | | -This makes it possible to save the whole file or to subset it |
| 33 | + 4. A Heatmap plot with wasm and d3.js |
153 | 34 |
|
154 | | -record_vec is a list of the records. If there is only one record, include this as a vec using `vec![record]` |
| 35 | +```` |
155 | 36 |
|
156 | | -The boolean true/false describes whether the DNA sequence should be included in the GFF3 file |
| 37 | +To learn more about how to use the crate, have a look at the usage guide [here](docs/usage.md) |
157 | 38 |
|
158 | | -To write into genbank format requires gbk_write(seq_region, record_vec, filename), no true or false since genbank format will include the DNA sequence |
| 39 | +To use a specific workspace (at the moment microSeqIO or heatmap), clone the project, `cd` into the required directory, and build the project from there. |
159 | 40 |
|
160 | | - ```rust |
161 | | -pub fn create_new_record() -> Result<(), anyhow::Error> { |
162 | | - let filename = format!("new_record.gff"); |
163 | | - let mut record = Record::new(); |
164 | | - let mut seq_region: BTreeMap<String, (u32, u32)> = BTreeMap::new(); |
165 | | - //example from E.coli K12 |
166 | | - seq_region.insert("source_1".to_string(), (1, 897)); |
167 | | - //Add the source into SourceAttributes |
168 | | - record |
169 | | - .source_map |
170 | | - .set_counter("source_1".to_string()) |
171 | | - .set_start(RangeValue::Exact(1)) |
172 | | - .set_stop(RangeValue::Exact(897)) |
173 | | - .set_organism("Escherichia coli".to_string()) |
174 | | - .set_mol_type("DNA".to_string()) |
175 | | - .set_strain("K-12 substr. MG1655".to_string()) |
176 | | - .set_type_material("type strain of Escherichia coli K12".to_string()) |
177 | | - .set_db_xref("PRJNA57779".to_string()); |
178 | | - //Add the features into FeatureAttributes, here we are setting two features, i.e. coding sequences or genes |
179 | | - record |
180 | | - .cds |
181 | | - .set_counter("b3304".to_string()) |
182 | | - .set_start(RangeValue::Exact(1)) |
183 | | - .set_stop(RangeValue::Exact(354)) |
184 | | - .set_gene("rplR".to_string()) |
185 | | - .set_product("50S ribosomal subunit protein L18".to_string()) |
186 | | - .set_codon_start(1) |
187 | | - .set_strand(-1); |
188 | | - record |
189 | | - .cds |
190 | | - .set_counter("b3305".to_string()) |
191 | | - .set_start(RangeValue::Exact(364)) |
192 | | - .set_stop(RangeValue::Exact(897)) |
193 | | - .set_gene("rplF".to_string()) |
194 | | - .set_product("50S ribosomal subunit protein L6".to_string()) |
195 | | - .set_codon_start(1) |
196 | | - .set_strand(-1); |
197 | | - //Add the sequences for the coding sequence (CDS) into SequenceAttributes |
198 | | - record |
199 | | - .seq_features |
200 | | - .set_counter("b3304".to_string()) |
201 | | - .set_start(RangeValue::Exact(1)) |
202 | | - .set_stop(RangeValue::Exact(354)) |
203 | | - .set_sequence_ffn( |
204 | | - "ATGGATAAGAAATCTGCTCGTATCCGTCGTGCGACCCGCGCACGCCGCAAGCTCCAGGAG |
205 | | -CTGGGCGCAACTCGCCTGGTGGTACATCGTACCCCGCGTCACATTTACGCACAGGTAATT |
206 | | -GCACCGAACGGTTCTGAAGTTCTGGTAGCTGCTTCTACTGTAGAAAAAGCTATCGCTGAA |
207 | | -CAACTGAAGTACACCGGTAACAAAGACGCGGCTGCAGCTGTGGGTAAAGCTGTCGCTGAA |
208 | | -CGCGCTCTGGAAAAAGGCATCAAAGATGTATCCTTTGACCGTTCCGGGTTCCAATATCAT |
209 | | -GGTCGTGTCCAGGCACTGGCAGATGCTGCCCGTGAAGCTGGCCTTCAGTTCTAA" |
210 | | - .to_string(), |
211 | | - ) |
212 | | - .set_sequence_faa( |
213 | | - "MDKKSARIRRATRARRKLQELGATRLVVHRTPRHIYAQVIAPNGSEVLVAASTVEKAIAE |
214 | | -QLKYTGNKDAAAAVGKAVAERALEKGIKDVSFDRSGFQYHGRVQALADAAREAGLQF" |
215 | | - .to_string(), |
216 | | - ) |
217 | | - .set_codon_start(1) |
218 | | - .set_strand(-1); |
219 | | - record |
220 | | - .seq_features |
221 | | - .set_counter("bb3305".to_string()) |
222 | | - .set_start(RangeValue::Exact(364)) |
223 | | - .set_stop(RangeValue::Exact(897)) |
224 | | - .set_sequence_ffn( |
225 | | - "ATGTCTCGTGTTGCTAAAGCACCGGTCGTTGTTCCTGCCGGCGTTGACGTAAAAATCAAC |
226 | | -GGTCAGGTTATTACGATCAAAGGTAAAAACGGCGAGCTGACTCGTACTCTCAACGATGCT |
227 | | -GTTGAAGTTAAACATGCAGATAATACCCTGACCTTCGGTCCGCGTGATGGTTACGCAGAC |
228 | | -GGTTGGGCACAGGCTGGTACCGCGCGTGCCCTGCTGAACTCAATGGTTATCGGTGTTACC |
229 | | -GAAGGCTTCACTAAGAAGCTGCAGCTGGTTGGTGTAGGTTACCGTGCAGCGGTTAAAGGC |
230 | | -AATGTGATTAACCTGTCTCTGGGTTTCTCTCATCCTGTTGACCATCAGCTGCCTGCGGGT |
231 | | -ATCACTGCTGAATGTCCGACTCAGACTGAAATCGTGCTGAAAGGCGCTGATAAGCAGGTG |
232 | | -ATCGGCCAGGTTGCAGCGGATCTGCGCGCCTACCGTCGTCCTGAGCCTTATAAAGGCAAG |
233 | | -GGTGTTCGTTACGCCGACGAAGTCGTGCGTACCAAAGAGGCTAAGAAGAAGTAA" |
234 | | - .to_string(), |
235 | | - ) |
236 | | - .set_sequence_faa( |
237 | | - "MSRVAKAPVVVPAGVDVKINGQVITIKGKNGELTRTLNDAVEVKHADNTLTFGPRDGYAD |
238 | | -GWAQAGTARALLNSMVIGVTEGFTKKLQLVGVGYRAAVKGNVINLSLGFSHPVDHQLPAG |
239 | | -ITAECPTQTEIVLKGADKQVIGQVAADLRAYRRPEPYKGKGVRYADEVVRTKEAKKK" |
240 | | - .to_string(), |
241 | | - ) |
242 | | - .set_codon_start(1) |
243 | | - .set_strand(-1); |
244 | | - //Add the full sequence of the entire record into the record.sequence |
245 | | - record.sequence = "TTAGAACTGAAGGCCAGCTTCACGGGCAGCATCTGCCAGTGCCTGGACACGACCATGATA |
246 | | -TTGGAACCCGGAACGGTCAAAGGATACATCTTTGATGCCTTTTTCCAGAGCGCGTTCAGC |
247 | | -GACAGCTTTACCCACAGCTGCAGCCGCGTCTTTGTTACCGGTGTACTTCAGTTGTTCAGC |
248 | | -GATAGCTTTTTCTACAGTAGAAGCAGCTACCAGAACTTCAGAACCGTTCGGTGCAATTAC |
249 | | -CTGTGCGTAAATGTGACGCGGGGTACGATGTACCACCAGGCGAGTTGCGCCCAGCTCCTG |
250 | | -GAGCTTGCGGCGTGCGCGGGTCGCACGACGGATACGAGCAGATTTCTTATCCATAGTGTT |
251 | | -ACCTTACTTCTTCTTAGCCTCTTTGGTACGCACGACTTCGTCGGCGTAACGAACACCCTT |
252 | | -GCCTTTATAAGGCTCAGGACGACGGTAGGCGCGCAGATCCGCTGCAACCTGGCCGATCAC |
253 | | -CTGCTTATCAGCGCCTTTCAGCACGATTTCAGTCTGAGTCGGACATTCAGCAGTGATACC |
254 | | -CGCAGGCAGCTGATGGTCAACAGGATGAGAGAAACCCAGAGACAGGTTAATCACATTGCC |
255 | | -TTTAACCGCTGCACGGTAACCTACACCAACCAGCTGCAGCTTCTTAGTGAAGCCTTCGGT |
256 | | -AACACCGATAACCATTGAGTTCAGCAGGGCACGCGCGGTACCAGCCTGTGCCCAACCGTC |
257 | | -TGCGTAACCATCACGCGGACCGAAGGTCAGGGTATTATCTGCATGTTTAACTTCAACAGC |
258 | | -ATCGTTGAGAGTACGAGTCAGCTCGCCGTTTTTACCTTTGATCGTAATAACCTGACCGTT |
259 | | -GATTTTTACGTCAACGCCGGCAGGAACAACGACCGGTGCTTTAGCAACACGAGACAT" |
260 | | - .to_string(); |
261 | | - gff_write(seq_region, vec![record], &filename, true); |
262 | | - return Ok(()); |
263 | | -} |
264 | | -``` |
| 41 | +For more background please see <https://LCrossman.github.io/microBioRust_details> |
0 commit comments