Skip to content

Commit aee7539

Browse files
authored
Merge pull request #29 from ZEZE1020/readme-update
Readme update
2 parents c9a69f0 + a505da1 commit aee7539

File tree

4 files changed

+445
-244
lines changed

4 files changed

+445
-244
lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ Cargo.lock
1818
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
1919
# and can be added to the global gitignore or merged into this file. For a more nuclear
2020
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
21-
#.idea/*~
21+
.idea/*~
2222
.DS_Store
2323
env/
2424
venv/

README.md

Lines changed: 20 additions & 243 deletions
Original file line numberDiff line numberDiff line change
@@ -1,264 +1,41 @@
1-
# microBioRust
1+
<img src="docs/assets/MICROBIO%20B.svg" width="200" alt="logo" />
2+
3+
[![Docs](https://img.shields.io/badge/docs-mkdocs-blue.svg)](https://lcrossman.github.io/microBioRust/)
4+
5+
![Crates.io Version](https://img.shields.io/crates/v/microBioRust?style=flat&link=https%3A%2F%2Fcrates.io%2Fcrates%2FmicroBioRust)
6+
7+
8+
29

310
## A Rust bioinformatics crate aimed at Microbial genomics<br>
411

512
The aim of this crate is to provide Microbiology friendly Rust functions for bioinformatics.<br>
613

7-
Very much under construction!<br>
14+
> Very much under construction!<br>
815
916
Some concepts with many thanks to Rust-bio<br>
10-
Please see the Roadmap for further details [here](https://github.com/LCrossman/microBioRust/issues/18)
17+
Please see the Roadmap for further details [here](ROADMAP.md)
1118

1219
To install Rust - please see here [Rust install](https://www.rust-lang.org/tools/install) or with Conda<br>
1320
If you would like to contribute please follow the [Rust code of conduct](https://www.rust-lang.org/policies/code-of-conduct)
1421

1522
Questions and comments - please join the Discord server :) [here](https://discord.gg/xP2ngwTttz)
1623

17-
Currently there is functionality for:<br>
18-
19-
# 1. A Genbank to GFF parser
20-
21-
# 2. An Embl to GFF and GBK parser
22-
23-
# 3. Calculate sequence metrics <i>e.g.</i> hydrophobicity, distance measures
24-
25-
# 4. A Heatmap plot with wasm and d3.js
26-
27-
To use a specific workspace (at the moment microSeqIO or heatmap) clone the project, cd into the specific directory required and build the project from there
2824

29-
for more background please see <https://LCrossman.github.io/microBioRust_details>
30-
31-
In microBioRust:
32-
33-
You can parse genbank files and save as a GFF (gff3) format as well as extracting DNA sequences, gene DNA sequences (ffn) and protein fasta sequences (faa)
34-
Super simple way:
35-
36-
```rust
37-
pub fn genbank_to_faa() -> Result<(), anyhow::Error> {
38-
let args = Arguments::parse();
39-
let records = genbank!(&args.filename);
40-
for record in records.iter() {
41-
for (k, v) in &record.cds.attributes {
42-
if let Some(seq) = record.seq_features.get_sequence_faa(k) {
43-
println!(">{}|{}\n{}", &record.id, &k, seq);
44-
}
45-
}
46-
}
47-
return Ok(());
48-
}
49-
50-
```
51-
52-
Better for Debugging:
53-
54-
```rust
55-
pub fn genbank_to_faa() -> Result<(), anyhow::Error> {
56-
let args: Vec<String> = env::args().collect();
57-
let config = Config::new(&args).unwrap_or_else(|err| {
58-
println!("Problem with parsing file arguments: {}", err);
59-
process::exit(1);
60-
});
61-
let file_gbk = fs::File::open(config.filename)?;
62-
let mut reader = Reader::new(file_gbk);
63-
let mut records = reader.records();
64-
let mut cds_counter: u32 = 0;
65-
loop {
66-
//collect from each record advancing on a next record basis, count cds records
67-
match records.next() {
68-
Some(Ok(mut record)) => {
69-
for (k, v) in &record.cds.attributes {
70-
match record.seq_features.get_sequence_faa(&k) {
71-
Some(value) => {
72-
let seq_faa = value.to_string();
73-
println!(">{}|{}\n{}", &record.id, &k, seq_faa);
74-
}
75-
_ => (),
76-
};
77-
}
78-
cds_counter += 1;
79-
}
80-
Some(Err(e)) => {
81-
println!("Error encountered - an err {:?}", e);
82-
}
83-
None => {
84-
println!("finished iteration");
85-
break;
86-
}
87-
}
88-
}
89-
println!("Total records processed: {}", read_counter);
90-
return Ok(());
91-
}
92-
```
93-
94-
Example to save a provided multi- or single genbank file as a GFF file (by joining any multi-genbank)
95-
96-
```rust
97-
pub fn genbank_to_gff() -> io::Result<()> {
98-
let args: Vec<String> = env::args().collect();
99-
let config = Config::new(&args).unwrap_or_else(|err| {
100-
println!("Problem with parsing file arguments: {}", err);
101-
process::exit(1);
102-
});
103-
let file_gbk = fs::File::open(&config.filename)?;
104-
let prev_start: u32 = 0;
105-
let mut prev_end: u32 = 0;
106-
let mut reader = Reader::new(file_gbk);
107-
let mut records = reader.records();
108-
let mut read_counter: u32 = 0;
109-
let mut seq_region: BTreeMap<String, (u32,u32)> = BTreeMap::new();
110-
let mut record_vec: Vec<Record> = Vec::new();
111-
loop {
112-
match records.next() {
113-
Some(Ok(mut record)) => {
114-
//println!("next record");
115-
//println!("Record id: {:?}", record.id);
116-
let source = record.source_map.source_name.clone().expect("issue collecting source name");
117-
let beginning = match record.source_map.get_start(&source) {
118-
Some(value) => value.get_value(),
119-
_ => 0,
120-
};
121-
let ending = match record.source_map.get_stop(&source) {
122-
Some(value) => value.get_value(),
123-
_ => 0,
124-
};
125-
if ending + prev_end < beginning + prev_end {
126-
}
127-
seq_region.insert(source, (beginning + prev_end, ending + prev_end));
128-
record_vec.push(record);
129-
// Add additional fields to print if needed
130-
read_counter+=1;
131-
prev_end+=ending; // create the joined record if there are multiple
132-
},
133-
Some(Err(e)) => { println!("theres an err {:?}", e); },
134-
None => {
135-
println!("finished iteration");
136-
break; },
137-
}
138-
}
139-
let output_file = format!("{}.gff", &config.filename);
140-
gff_write(seq_region.clone(), record_vec, &output_file, true);
141-
println!("Total records processed: {}", read_counter);
142-
return Ok(());
143-
}
144-
```
145-
146-
Example to create a completely new record, use of setters or set_ functionality
25+
Currently there is functionality for:<br>
26+
````
27+
1. A Genbank to GFF parser
14728
148-
To write into GFF format requires gff_write(seq_region, record_vec, filename, true or false)
29+
2. An Embl to GFF and GBK parser
14930
150-
The seq_region is the region of interest to save with name and DNA coordinates such as `seqregion.entry("source_1".to_string(), (1,897))`
31+
3. Calculate sequence metrics e.g. hydrophobicity, distance measures
15132
152-
This makes it possible to save the whole file or to subset it
33+
4. A Heatmap plot with wasm and d3.js
15334
154-
record_vec is a list of the records. If there is only one record, include this as a vec using `vec![record]`
35+
````
15536

156-
The boolean true/false describes whether the DNA sequence should be included in the GFF3 file
37+
To see more on how to use the crate, have a look at the usage guide [here](docs/usage.md)
15738

158-
To write into genbank format requires gbk_write(seq_region, record_vec, filename), no true or false since genbank format will include the DNA sequence
39+
To use a specific workspace (at the moment microSeqIO or heatmap) clone the project, cd into the specific directory required and build the project from there
15940

160-
```rust
161-
pub fn create_new_record() -> Result<(), anyhow::Error> {
162-
let filename = format!("new_record.gff");
163-
let mut record = Record::new();
164-
let mut seq_region: BTreeMap<String, (u32, u32)> = BTreeMap::new();
165-
//example from E.coli K12
166-
seq_region.insert("source_1".to_string(), (1, 897));
167-
//Add the source into SourceAttributes
168-
record
169-
.source_map
170-
.set_counter("source_1".to_string())
171-
.set_start(RangeValue::Exact(1))
172-
.set_stop(RangeValue::Exact(897))
173-
.set_organism("Escherichia coli".to_string())
174-
.set_mol_type("DNA".to_string())
175-
.set_strain("K-12 substr. MG1655".to_string())
176-
.set_type_material("type strain of Escherichia coli K12".to_string())
177-
.set_db_xref("PRJNA57779".to_string());
178-
//Add the features into FeatureAttributes, here we are setting two features, i.e. coding sequences or genes
179-
record
180-
.cds
181-
.set_counter("b3304".to_string())
182-
.set_start(RangeValue::Exact(1))
183-
.set_stop(RangeValue::Exact(354))
184-
.set_gene("rplR".to_string())
185-
.set_product("50S ribosomal subunit protein L18".to_string())
186-
.set_codon_start(1)
187-
.set_strand(-1);
188-
record
189-
.cds
190-
.set_counter("b3305".to_string())
191-
.set_start(RangeValue::Exact(364))
192-
.set_stop(RangeValue::Exact(897))
193-
.set_gene("rplF".to_string())
194-
.set_product("50S ribosomal subunit protein L6".to_string())
195-
.set_codon_start(1)
196-
.set_strand(-1);
197-
//Add the sequences for the coding sequence (CDS) into SequenceAttributes
198-
record
199-
.seq_features
200-
.set_counter("b3304".to_string())
201-
.set_start(RangeValue::Exact(1))
202-
.set_stop(RangeValue::Exact(354))
203-
.set_sequence_ffn(
204-
"ATGGATAAGAAATCTGCTCGTATCCGTCGTGCGACCCGCGCACGCCGCAAGCTCCAGGAG
205-
CTGGGCGCAACTCGCCTGGTGGTACATCGTACCCCGCGTCACATTTACGCACAGGTAATT
206-
GCACCGAACGGTTCTGAAGTTCTGGTAGCTGCTTCTACTGTAGAAAAAGCTATCGCTGAA
207-
CAACTGAAGTACACCGGTAACAAAGACGCGGCTGCAGCTGTGGGTAAAGCTGTCGCTGAA
208-
CGCGCTCTGGAAAAAGGCATCAAAGATGTATCCTTTGACCGTTCCGGGTTCCAATATCAT
209-
GGTCGTGTCCAGGCACTGGCAGATGCTGCCCGTGAAGCTGGCCTTCAGTTCTAA"
210-
.to_string(),
211-
)
212-
.set_sequence_faa(
213-
"MDKKSARIRRATRARRKLQELGATRLVVHRTPRHIYAQVIAPNGSEVLVAASTVEKAIAE
214-
QLKYTGNKDAAAAVGKAVAERALEKGIKDVSFDRSGFQYHGRVQALADAAREAGLQF"
215-
.to_string(),
216-
)
217-
.set_codon_start(1)
218-
.set_strand(-1);
219-
record
220-
.seq_features
221-
.set_counter("bb3305".to_string())
222-
.set_start(RangeValue::Exact(364))
223-
.set_stop(RangeValue::Exact(897))
224-
.set_sequence_ffn(
225-
"ATGTCTCGTGTTGCTAAAGCACCGGTCGTTGTTCCTGCCGGCGTTGACGTAAAAATCAAC
226-
GGTCAGGTTATTACGATCAAAGGTAAAAACGGCGAGCTGACTCGTACTCTCAACGATGCT
227-
GTTGAAGTTAAACATGCAGATAATACCCTGACCTTCGGTCCGCGTGATGGTTACGCAGAC
228-
GGTTGGGCACAGGCTGGTACCGCGCGTGCCCTGCTGAACTCAATGGTTATCGGTGTTACC
229-
GAAGGCTTCACTAAGAAGCTGCAGCTGGTTGGTGTAGGTTACCGTGCAGCGGTTAAAGGC
230-
AATGTGATTAACCTGTCTCTGGGTTTCTCTCATCCTGTTGACCATCAGCTGCCTGCGGGT
231-
ATCACTGCTGAATGTCCGACTCAGACTGAAATCGTGCTGAAAGGCGCTGATAAGCAGGTG
232-
ATCGGCCAGGTTGCAGCGGATCTGCGCGCCTACCGTCGTCCTGAGCCTTATAAAGGCAAG
233-
GGTGTTCGTTACGCCGACGAAGTCGTGCGTACCAAAGAGGCTAAGAAGAAGTAA"
234-
.to_string(),
235-
)
236-
.set_sequence_faa(
237-
"MSRVAKAPVVVPAGVDVKINGQVITIKGKNGELTRTLNDAVEVKHADNTLTFGPRDGYAD
238-
GWAQAGTARALLNSMVIGVTEGFTKKLQLVGVGYRAAVKGNVINLSLGFSHPVDHQLPAG
239-
ITAECPTQTEIVLKGADKQVIGQVAADLRAYRRPEPYKGKGVRYADEVVRTKEAKKK"
240-
.to_string(),
241-
)
242-
.set_codon_start(1)
243-
.set_strand(-1);
244-
//Add the full sequence of the entire record into the record.sequence
245-
record.sequence = "TTAGAACTGAAGGCCAGCTTCACGGGCAGCATCTGCCAGTGCCTGGACACGACCATGATA
246-
TTGGAACCCGGAACGGTCAAAGGATACATCTTTGATGCCTTTTTCCAGAGCGCGTTCAGC
247-
GACAGCTTTACCCACAGCTGCAGCCGCGTCTTTGTTACCGGTGTACTTCAGTTGTTCAGC
248-
GATAGCTTTTTCTACAGTAGAAGCAGCTACCAGAACTTCAGAACCGTTCGGTGCAATTAC
249-
CTGTGCGTAAATGTGACGCGGGGTACGATGTACCACCAGGCGAGTTGCGCCCAGCTCCTG
250-
GAGCTTGCGGCGTGCGCGGGTCGCACGACGGATACGAGCAGATTTCTTATCCATAGTGTT
251-
ACCTTACTTCTTCTTAGCCTCTTTGGTACGCACGACTTCGTCGGCGTAACGAACACCCTT
252-
GCCTTTATAAGGCTCAGGACGACGGTAGGCGCGCAGATCCGCTGCAACCTGGCCGATCAC
253-
CTGCTTATCAGCGCCTTTCAGCACGATTTCAGTCTGAGTCGGACATTCAGCAGTGATACC
254-
CGCAGGCAGCTGATGGTCAACAGGATGAGAGAAACCCAGAGACAGGTTAATCACATTGCC
255-
TTTAACCGCTGCACGGTAACCTACACCAACCAGCTGCAGCTTCTTAGTGAAGCCTTCGGT
256-
AACACCGATAACCATTGAGTTCAGCAGGGCACGCGCGGTACCAGCCTGTGCCCAACCGTC
257-
TGCGTAACCATCACGCGGACCGAAGGTCAGGGTATTATCTGCATGTTTAACTTCAACAGC
258-
ATCGTTGAGAGTACGAGTCAGCTCGCCGTTTTTACCTTTGATCGTAATAACCTGACCGTT
259-
GATTTTTACGTCAACGCCGGCAGGAACAACGACCGGTGCTTTAGCAACACGAGACAT"
260-
.to_string();
261-
gff_write(seq_region, vec![record], &filename, true);
262-
return Ok(());
263-
}
264-
```
41+
For more background please see <https://LCrossman.github.io/microBioRust_details>

0 commit comments

Comments
 (0)