@@ -105,3 +105,102 @@ pub fn decompress_sequence(compressed: &[u8], sequence_length: usize) -> io::Res
105105
106106 Ok ( sequence)
107107}
108+
109+ /// Compresses a FASTA file content into a vector of bytes.
110+ ///
111+ /// The FASTA file content is expected to have a header line followed by
112+ /// the DNA sequence. The DNA sequence is compressed by representing each
113+ /// base (A, C, T, G) with 2 bits. The compressed data starts with a 4-byte
114+ /// (u32) integer representing the length of the original sequence.
115+ ///
116+ /// # Arguments
117+ ///
118+ /// * `content` - A string slice that holds the FASTA file content.
119+ ///
120+ /// # Returns
121+ ///
122+ /// A vector of bytes containing the compressed FASTA file content.
123+ pub fn compress_fasta ( content : & str ) -> Vec < u8 > {
124+ let mut lines = content. lines ( ) ;
125+ let header = lines. next ( ) . unwrap_or ( "" ) . to_string ( ) ;
126+ let sequence: String = lines. map ( |line| line. trim ( ) ) . collect ( ) ;
127+
128+ let sequence_length = sequence. len ( ) as u32 ;
129+ let compressed_data = compress_sequence ( & sequence) ;
130+
131+ let mut output = Vec :: new ( ) ;
132+
133+ // Write header length (4 bytes)
134+ output. extend_from_slice ( & ( header. len ( ) as u32 ) . to_le_bytes ( ) ) ;
135+
136+ // Write header
137+ output. extend_from_slice ( header. as_bytes ( ) ) ;
138+
139+ // Write sequence length (4 bytes)
140+ output. extend_from_slice ( & sequence_length. to_le_bytes ( ) ) ;
141+
142+ // Write compressed data length (4 bytes)
143+ output. extend_from_slice ( & ( compressed_data. len ( ) as u32 ) . to_le_bytes ( ) ) ;
144+
145+ // Write compressed data
146+ output. extend_from_slice ( & compressed_data) ;
147+
148+ output
149+ }
150+
151+ /// Decompresses a vector of bytes into a FASTA file content.
152+ ///
153+ /// The compressed data starts with a 4-byte (u32) integer representing
154+ /// the length of the header, followed by the header, the sequence length,
155+ /// and the compressed sequence data. Each base (A, C, T, G) is represented
156+ /// by 2 bits.
157+ ///
158+ /// # Arguments
159+ ///
160+ /// * `data` - A slice of bytes containing the compressed FASTA file content.
161+ ///
162+ /// # Returns
163+ ///
164+ /// A string containing the decompressed FASTA file content.
165+ ///
166+ /// # Errors
167+ ///
168+ /// Returns an error if the file is too short or if the file is missing
169+ pub fn decompress_fasta ( data : & [ u8 ] ) -> Result < String , String > {
170+ if data. len ( ) < 12 {
171+ return Err ( "File is too short" . to_string ( ) ) ;
172+ }
173+
174+ let header_len = u32:: from_le_bytes ( data[ 0 ..4 ] . try_into ( ) . unwrap ( ) ) as usize ;
175+
176+ if data. len ( ) < 12 + header_len {
177+ return Err ( "File is too short for header" . to_string ( ) ) ;
178+ }
179+
180+ let header = String :: from_utf8 ( data[ 4 ..4 + header_len] . to_vec ( ) ) . map_err ( |e| e. to_string ( ) ) ?;
181+
182+ let sequence_length =
183+ u32:: from_le_bytes ( data[ 4 + header_len..8 + header_len] . try_into ( ) . unwrap ( ) ) as usize ;
184+
185+ let compressed_len =
186+ u32:: from_le_bytes ( data[ 8 + header_len..12 + header_len] . try_into ( ) . unwrap ( ) ) as usize ;
187+
188+ if data. len ( ) < 12 + header_len + compressed_len {
189+ return Err ( "File is too short for compressed data" . to_string ( ) ) ;
190+ }
191+
192+ let compressed_data = & data[ 12 + header_len..12 + header_len + compressed_len] ;
193+ let decompressed = decompress_sequence ( compressed_data, sequence_length) . unwrap_or_default ( ) ;
194+
195+ let mut result =
196+ String :: with_capacity ( header. len ( ) + decompressed. len ( ) + ( decompressed. len ( ) / 60 ) * 2 ) ;
197+ result. push_str ( & header) ;
198+ result. push ( '\n' ) ;
199+
200+ for chunk in decompressed. as_bytes ( ) . chunks ( 60 ) {
201+ result. extend ( chunk. iter ( ) . map ( |& b| b as char ) ) ;
202+ result. push ( '\n' ) ;
203+ }
204+
205+ Ok ( result)
206+ }
0 commit comments