Skip to content

Commit 6971a94

Browse files
committed
Improve compression rate with zlib compression
1 parent ffd9c81 commit 6971a94

File tree

4 files changed

+75
-18
lines changed

4 files changed

+75
-18
lines changed

Cargo.lock

Lines changed: 43 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@ license = "MIT"
88
repository = "https://github.com/marshallku/base-sequence-compression"
99

1010
[dependencies]
11+
flate2 = "1.0.35"

src/lib.rs

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
use std::io::{self, Read, Write};
2+
3+
use flate2::read::ZlibDecoder;
4+
use flate2::write::ZlibEncoder;
5+
use flate2::Compression;
6+
17
/// The bit pattern for the base 'A' (00).
28
pub const A_BITS: u8 = 0b00;
39
/// The bit pattern for the base 'C' (01).
@@ -28,12 +34,12 @@ pub fn compress_sequence(sequence: &str) -> Vec<u8> {
2834
for base in sequence.chars() {
2935
let bits = match base {
3036
'A' => A_BITS,
31-
'C' => C_BITS,
32-
'T' => T_BITS,
33-
'G' => G_BITS,
3437
'a' => A_BITS,
38+
'C' => C_BITS,
3539
'c' => C_BITS,
40+
'T' => T_BITS,
3641
't' => T_BITS,
42+
'G' => G_BITS,
3743
'g' => G_BITS,
3844
_ => continue,
3945
};
@@ -53,7 +59,10 @@ pub fn compress_sequence(sequence: &str) -> Vec<u8> {
5359
compressed.push(current_byte);
5460
}
5561

56-
compressed
62+
// Apply ZLIB compression
63+
let mut encoder = ZlibEncoder::new(Vec::new(), Compression::best());
64+
encoder.write_all(&compressed).unwrap();
65+
encoder.finish().unwrap()
5766
}
5867

5968
/// Decompresses a vector of bytes into a DNA sequence string.
@@ -69,10 +78,14 @@ pub fn compress_sequence(sequence: &str) -> Vec<u8> {
6978
/// # Returns
7079
///
7180
/// A string containing the decompressed DNA sequence.
72-
pub fn decompress_sequence(compressed: &[u8], sequence_length: usize) -> String {
81+
pub fn decompress_sequence(compressed: &[u8], sequence_length: usize) -> io::Result<String> {
82+
let mut decoder = ZlibDecoder::new(compressed);
83+
let mut decompressed_data = Vec::new();
84+
decoder.read_to_end(&mut decompressed_data)?;
85+
7386
let mut sequence = String::new();
7487

75-
for &byte in compressed {
88+
for &byte in &decompressed_data {
7689
let mut current_byte = byte;
7790
for _ in 0..4 {
7891
if sequence.len() >= sequence_length {
@@ -90,5 +103,5 @@ pub fn decompress_sequence(compressed: &[u8], sequence_length: usize) -> String
90103
}
91104
}
92105

93-
sequence
106+
Ok(sequence)
94107
}

tests/compression.rs

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ mod tests {
99
let dna_sequence = "ACGTACGTACGT";
1010
let compressed = compress_sequence(dna_sequence);
1111
let sequence_length = dna_sequence.len();
12-
let decompressed = decompress_sequence(&compressed, sequence_length);
12+
let decompressed = decompress_sequence(&compressed, sequence_length).unwrap();
1313

1414
assert_eq!(dna_sequence, decompressed);
1515
}
@@ -19,7 +19,7 @@ mod tests {
1919
let dna_sequence = "";
2020
let compressed = compress_sequence(dna_sequence);
2121
let sequence_length = dna_sequence.len();
22-
let decompressed = decompress_sequence(&compressed, sequence_length);
22+
let decompressed = decompress_sequence(&compressed, sequence_length).unwrap();
2323
assert_eq!(dna_sequence, decompressed);
2424
}
2525

@@ -28,7 +28,7 @@ mod tests {
2828
let dna_sequence = "A";
2929
let compressed = compress_sequence(dna_sequence);
3030
let sequence_length = dna_sequence.len();
31-
let decompressed = decompress_sequence(&compressed, sequence_length);
31+
let decompressed = decompress_sequence(&compressed, sequence_length).unwrap();
3232
assert_eq!(dna_sequence, decompressed);
3333
}
3434

@@ -37,7 +37,7 @@ mod tests {
3737
let dna_sequence = "C";
3838
let compressed = compress_sequence(dna_sequence);
3939
let sequence_length = dna_sequence.len();
40-
let decompressed = decompress_sequence(&compressed, sequence_length);
40+
let decompressed = decompress_sequence(&compressed, sequence_length).unwrap();
4141
assert_eq!(dna_sequence, decompressed);
4242
}
4343

@@ -46,7 +46,7 @@ mod tests {
4646
let dna_sequence = "T";
4747
let compressed = compress_sequence(dna_sequence);
4848
let sequence_length = dna_sequence.len();
49-
let decompressed = decompress_sequence(&compressed, sequence_length);
49+
let decompressed = decompress_sequence(&compressed, sequence_length).unwrap();
5050
assert_eq!(dna_sequence, decompressed);
5151
}
5252

@@ -55,7 +55,7 @@ mod tests {
5555
let dna_sequence = "G";
5656
let compressed = compress_sequence(dna_sequence);
5757
let sequence_length = dna_sequence.len();
58-
let decompressed = decompress_sequence(&compressed, sequence_length);
58+
let decompressed = decompress_sequence(&compressed, sequence_length).unwrap();
5959
assert_eq!(dna_sequence, decompressed);
6060
}
6161

@@ -64,7 +64,7 @@ mod tests {
6464
let dna_sequence = "ACGTACGTA";
6565
let compressed = compress_sequence(dna_sequence);
6666
let sequence_length = dna_sequence.len();
67-
let decompressed = decompress_sequence(&compressed, sequence_length);
67+
let decompressed = decompress_sequence(&compressed, sequence_length).unwrap();
6868
assert_eq!(dna_sequence, decompressed);
6969
}
7070

@@ -73,7 +73,7 @@ mod tests {
7373
let dna_sequence = "ACGTACGTAC";
7474
let compressed = compress_sequence(dna_sequence);
7575
let sequence_length = dna_sequence.len();
76-
let decompressed = decompress_sequence(&compressed, sequence_length);
76+
let decompressed = decompress_sequence(&compressed, sequence_length).unwrap();
7777
assert_eq!(dna_sequence, decompressed);
7878
}
7979

@@ -82,7 +82,7 @@ mod tests {
8282
let dna_sequence = "ACGTACGTACG";
8383
let compressed = compress_sequence(dna_sequence);
8484
let sequence_length = dna_sequence.len();
85-
let decompressed = decompress_sequence(&compressed, sequence_length);
85+
let decompressed = decompress_sequence(&compressed, sequence_length).unwrap();
8686
assert_eq!(dna_sequence, decompressed);
8787
}
8888

@@ -91,7 +91,7 @@ mod tests {
9191
let dna_sequence = "acgt";
9292
let compressed = compress_sequence(dna_sequence);
9393
let sequence_length = dna_sequence.len();
94-
let decompressed = decompress_sequence(&compressed, sequence_length);
94+
let decompressed = decompress_sequence(&compressed, sequence_length).unwrap();
9595

9696
assert_eq!(dna_sequence.to_uppercase(), decompressed);
9797
}
@@ -101,7 +101,7 @@ mod tests {
101101
let dna_sequence = "ACXGT";
102102
let compressed = compress_sequence(dna_sequence);
103103
let sequence_length = dna_sequence.len();
104-
let decompressed = decompress_sequence(&compressed, sequence_length);
104+
let decompressed = decompress_sequence(&compressed, sequence_length).unwrap();
105105

106106
assert_eq!("ACGT", decompressed);
107107
}

0 commit comments

Comments
 (0)