diff --git a/src/bam/record.rs b/src/bam/record.rs index de2795c35..0751dcf45 100644 --- a/src/bam/record.rs +++ b/src/bam/record.rs @@ -358,9 +358,9 @@ impl Record { let orig_aux_offset = self.qname_capacity() + 4 * self.cigar_len() - + (self.seq_len() + 1) / 2 + + self.seq_len().div_ceil(2) + self.seq_len(); - let new_aux_offset = q_len + extranul + cigar_width + (seq.len() + 1) / 2 + qual.len(); + let new_aux_offset = q_len + extranul + cigar_width + seq.len().div_ceil(2) + qual.len(); assert!(orig_aux_offset <= self.inner.l_data as usize); let aux_len = self.inner.l_data as usize - orig_aux_offset; self.inner_mut().l_data = (new_aux_offset + aux_len) as i32; @@ -416,7 +416,7 @@ impl Record { }); } self.inner_mut().core.l_qseq = seq.len() as i32; - i += (seq.len() + 1) / 2; + i += seq.len().div_ceil(2); } // qual @@ -564,7 +564,7 @@ impl Record { fn seq_data(&self) -> &[u8] { let offset = self.qname_capacity() + self.cigar_len() * 4; - &self.data()[offset..][..(self.seq_len() + 1) / 2] + &self.data()[offset..][..self.seq_len().div_ceil(2)] } /// Get read sequence. Complexity: O(1). @@ -579,7 +579,7 @@ impl Record { /// This does not entail any offsets, hence the qualities can be used directly without /// e.g. subtracting 33. Complexity: O(1). pub fn qual(&self) -> &[u8] { - &self.data()[self.qname_capacity() + self.cigar_len() * 4 + (self.seq_len() + 1) / 2..] + &self.data()[self.qname_capacity() + self.cigar_len() * 4 + self.seq_len().div_ceil(2)..] [..self.seq_len()] } @@ -787,7 +787,7 @@ impl Record { // CIGAR (uint32_t): + self.cigar_len() * std::mem::size_of::() // Read sequence (4-bit encoded): - + (self.seq_len() + 1) / 2 + + self.seq_len().div_ceil(2) // Base qualities (char): + self.seq_len()..], } diff --git a/src/bcf/mod.rs b/src/bcf/mod.rs index efbbd07b0..a19183073 100644 --- a/src/bcf/mod.rs +++ b/src/bcf/mod.rs @@ -318,7 +318,7 @@ impl IndexedReader { /// # Arguments /// /// * `rid` - numeric ID of the reference to jump to; use `HeaderView::name2rid` for resolving - /// contig name to ID. + /// contig name to ID. /// * `start` - `0`-based **inclusive** start coordinate of region on reference. /// * `end` - Optional `0`-based **inclusive** end coordinate of region on reference. If `None` /// is given, records are fetched from `start` until the end of the contig. @@ -605,7 +605,7 @@ pub mod synced { /// # Arguments /// /// * `rid` - numeric ID of the reference to jump to; use `HeaderView::name2rid` for resolving - /// contig name to ID. + /// contig name to ID. /// * `start` - `0`-based start coordinate of region on reference. /// * `end` - `0`-based end coordinate of region on reference. pub fn fetch(&mut self, rid: u32, start: u64, end: u64) -> Result<()> { @@ -835,6 +835,8 @@ fn bcf_open(target: &[u8], mode: &[u8]) -> Result<*mut htslib::htsFile> { #[cfg(test)] mod tests { + use tempfile::NamedTempFile; + use super::record::Buffer; use super::*; use crate::bcf::header::Id; @@ -1090,6 +1092,129 @@ mod tests { } } + #[test] + fn test_genotypes_read_mixed_ploidy() { + let mut vcf = Reader::from_path("test/test_non_diploid.vcf").expect("Error opening file."); + + // Expected genotypes for comparison + let expected = [vec!["0", "1"], vec!["0/1", "1/1"], vec!["1|0", "1/1|0"]]; + + for (rec, exp_gts) in vcf.records().zip(expected.iter()) { + let rec = rec.expect("Error reading record."); + + // Get the genotypes from the record + let genotypes = rec.genotypes().expect("Error reading genotypes"); + + // Compare each genotype with the expected value + for (sample, exp_gt) in exp_gts.iter().enumerate() { + assert_eq!(&format!("{}", genotypes.get(sample)), exp_gt); + } + } + } + + #[test] + fn test_genotypes_write_and_read_mixed_ploidy() { + let mut vcf = Reader::from_path("test/test_non_diploid.vcf").expect("Error opening file."); + + // Create a temporary file to write the modified VCF data + let tmp = NamedTempFile::new().unwrap(); + let path = tmp.path(); + + { + // Create a VCF writer with the same header as the input VCF + let mut writer = Writer::from_path( + path, + &Header::from_template(vcf.header()), + true, + Format::Vcf, + ) + .unwrap(); + + // Modify record template by adding different genotypes and write the to the temp file. + let mut rec_tpl = vcf.records().next().unwrap().unwrap(); + rec_tpl + .push_genotype_structured( + &[ + vec![GenotypeAllele::Unphased(0)], + vec![GenotypeAllele::Unphased(1)], + ], + 3, + ) + .unwrap(); + writer.write(&rec_tpl).unwrap(); + rec_tpl + .push_genotype_structured( + &[ + vec![GenotypeAllele::Unphased(0), GenotypeAllele::Unphased(1)], + vec![GenotypeAllele::Unphased(1), GenotypeAllele::Unphased(1)], + ], + 3, + ) + .unwrap(); + writer.write(&rec_tpl).unwrap(); + rec_tpl + .push_genotype_structured( + &[ + vec![GenotypeAllele::Unphased(1), GenotypeAllele::Phased(0)], + vec![ + GenotypeAllele::Unphased(1), + GenotypeAllele::Unphased(1), + GenotypeAllele::Phased(0), + ], + ], + 3, + ) + .unwrap(); + writer.write(&rec_tpl).unwrap(); + } + + // Read back the temporary file with the modified VCF data + let mut reader = Reader::from_path(path).unwrap(); + + // Expected genotypes for validation + let expected = [vec!["0", "1"], vec!["0/1", "1/1"], vec!["1|0", "1/1|0"]]; + + // Iterate over the records in the temporary file and validate the genotypes + for (rec, exp_gts) in reader.records().zip(expected.iter()) { + let rec = rec.expect("Error reading record"); + let genotypes = rec.genotypes().expect("Error reading genotypes"); + + // Compare each genotype with the expected value + for (sample, exp_gt) in exp_gts.iter().enumerate() { + assert_eq!(&format!("{}", genotypes.get(sample)), exp_gt); + } + } + } + + #[test] + fn test_genotypes_wrong_max_ploidy() { + let mut vcf = Reader::from_path("test/test_non_diploid.vcf").expect("Error opening file."); + + // Modify record template by adding different genotypes and write the to the temp file. + let mut rec_tpl = vcf.records().next().unwrap().unwrap(); + let err = rec_tpl + .push_genotype_structured( + &[ + vec![ + GenotypeAllele::Unphased(0), + GenotypeAllele::Unphased(1), + GenotypeAllele::Unphased(0), + ], + vec![ + GenotypeAllele::Unphased(1), + GenotypeAllele::Unphased(0), + GenotypeAllele::Unphased(1), + GenotypeAllele::Unphased(0), + ], + ], + 3, + ) + .expect_err( + "This should fail since there are more alleles specified (4 for second sample) than max_ploidy (3) suggests", + ); + assert_eq!(err, crate::errors::Error::BcfSetValues); + } + #[test] fn test_header_ids() { let vcf = Reader::from_path("test/test_string.vcf").expect("Error opening file."); diff --git a/src/bcf/record.rs b/src/bcf/record.rs index 121200b71..8cd1edd4b 100644 --- a/src/bcf/record.rs +++ b/src/bcf/record.rs @@ -4,7 +4,6 @@ // except according to those terms. use std::borrow::{Borrow, BorrowMut}; -use std::ffi; use std::fmt; use std::marker::PhantomData; use std::ops::Deref; @@ -13,6 +12,7 @@ use std::ptr; use std::rc::Rc; use std::slice; use std::str; +use std::{ffi, iter}; use bio_types::genome; use derive_new::new; @@ -659,7 +659,7 @@ impl Record { /// # Arguments /// /// - `genotypes` - a flattened, two-dimensional array of GenotypeAllele, - /// the first dimension contains one array for each sample. + /// the first dimension contains one array for each sample. /// /// # Errors /// @@ -692,6 +692,76 @@ impl Record { self.push_format_integer(b"GT", &encoded) } + /// Add/replace genotypes in FORMAT GT tag by providing a list of genotypes. + /// + /// # Arguments + /// + /// - `genotypes` - a two-dimensional array of GenotypeAllele + /// - `max_ploidy` - the maximum number of alleles allowed for any genotype on any sample. + /// + /// # Errors + /// + /// Returns an error if any genotype has more allelles than `max_ploidy` or if the GT tag is not present in the header. + /// + /// # Example + /// + /// Example assumes we have a Record `record` from a VCF with a `GT` `FORMAT` tag and three samples. + /// See [module documentation](../index.html#example-writing) for how to set up + /// VCF, header, and record. + /// + /// ``` + /// # use rust_htslib::bcf::{Format, Writer}; + /// # use rust_htslib::bcf::header::Header; + /// # use rust_htslib::bcf::record::GenotypeAllele; + /// # use std::iter; + /// # let mut header = Header::new(); + /// # let header_contig_line = r#"##contig="#; + /// # header.push_record(header_contig_line.as_bytes()); + /// # let header_gt_line = r#"##FORMAT="#; + /// # header.push_record(header_gt_line.as_bytes()); + /// # header.push_sample("first_sample".as_bytes()); + /// # header.push_sample("second_sample".as_bytes()); + /// # header.push_sample("third_sample".as_bytes()); + /// # let mut vcf = Writer::from_stdout(&header, true, Format::Vcf)?; + /// # let mut record = vcf.empty_record(); + /// let alleles = vec![ + /// vec![GenotypeAllele::Unphased(1), GenotypeAllele::Unphased(1)], + /// vec![GenotypeAllele::Unphased(0), GenotypeAllele::Phased(1)], + /// vec![GenotypeAllele::Unphased(0)], + /// ]; + /// record.push_genotype_structured(&alleles, 2); + /// let gts = record.genotypes()?; + /// assert_eq!("1/1", &format!("{}", gts.get(0))); + /// assert_eq!("0|1", &format!("{}", gts.get(1))); + /// assert_eq!("0", &format!("{}", gts.get(2))); + /// # Ok::<(), rust_htslib::errors::Error>(()) + /// ``` + pub fn push_genotype_structured( + &mut self, + genotypes: &[GT], + max_ploidy: usize, + ) -> Result<()> + where + GT: AsRef<[GenotypeAllele]>, + { + let mut data = Vec::with_capacity(max_ploidy * genotypes.len()); + for gt in genotypes { + if gt.as_ref().len() > max_ploidy { + return Err(Error::BcfSetValues); + } + data.extend( + gt.as_ref() + .iter() + .map(|gta| i32::from(*gta)) + .chain(iter::repeat_n( + VECTOR_END_INTEGER, + max_ploidy - gt.as_ref().len(), + )), + ); + } + self.push_format_integer(b"GT", &data) + } + /// Get genotypes as vector of one `Genotype` per sample. /// /// # Example @@ -771,7 +841,7 @@ impl Record { /// /// - `tag` - The tag's string. /// - `data` - a flattened, two-dimensional array, the first dimension contains one array - /// for each sample. + /// for each sample. /// /// # Errors /// @@ -786,7 +856,7 @@ impl Record { /// /// - `tag` - The tag's string. /// - `data` - a flattened, two-dimensional array, the first dimension contains one array - /// for each sample. + /// for each sample. /// /// # Errors /// @@ -823,7 +893,7 @@ impl Record { /// /// - `tag` - The tag's string. /// - `data` - a flattened, two-dimensional array, the first dimension contains one array - /// for each sample. + /// for each sample. /// /// # Errors /// @@ -864,7 +934,7 @@ impl Record { /// /// - `tag` - The tag's string. /// - `data` - a two-dimensional array, the first dimension contains one array - /// for each sample. Must be non-empty. + /// for each sample. Must be non-empty. /// /// # Errors /// @@ -1249,14 +1319,19 @@ where } impl<'a, B: Borrow + 'a> Genotypes<'a, B> { - /// Get genotype of ith sample. So far, only supports diploid genotypes. + /// Get genotype of ith sample. /// /// Note that the result complies with the BCF spec. This means that the /// first allele will always be marked as `Unphased`. That is, if you have 1|1 in the VCF, /// this method will return `[Unphased(1), Phased(1)]`. pub fn get(&self, i: usize) -> Genotype { let igt = self.encoded[i]; - Genotype(igt.iter().map(|&e| GenotypeAllele::from(e)).collect()) + let allelles = igt + .iter() + .take_while(|&&i| i != VECTOR_END_INTEGER) + .map(|&i| GenotypeAllele::from(i)) + .collect(); + Genotype(allelles) } } diff --git a/src/bgzf/mod.rs b/src/bgzf/mod.rs index 8b1935bce..571f9272d 100644 --- a/src/bgzf/mod.rs +++ b/src/bgzf/mod.rs @@ -121,10 +121,7 @@ impl std::io::Read for Reader { htslib::bgzf_read(self.inner, buf.as_mut_ptr() as *mut libc::c_void, buf.len()) }; if nbytes < 0 { - Err(std::io::Error::new( - std::io::ErrorKind::Other, - "Can not read", - )) + Err(std::io::Error::other("Can not read")) } else { Ok(nbytes as usize) } @@ -268,10 +265,7 @@ impl std::io::Write for Writer { let nbytes = unsafe { htslib::bgzf_write(self.inner, buf.as_ptr() as *mut libc::c_void, buf.len()) }; if nbytes < 0 { - Err(std::io::Error::new( - std::io::ErrorKind::Other, - "Can not write", - )) + Err(std::io::Error::other("Can not write")) } else { Ok(nbytes as usize) } @@ -282,10 +276,7 @@ impl std::io::Write for Writer { if exit_code == 0 { Ok(()) } else { - Err(std::io::Error::new( - std::io::ErrorKind::Other, - "Can not flush", - )) + Err(std::io::Error::other("Can not flush")) } } } diff --git a/test/test_non_diploid.vcf b/test/test_non_diploid.vcf new file mode 100644 index 000000000..ca47d5f22 --- /dev/null +++ b/test/test_non_diploid.vcf @@ -0,0 +1,7 @@ +##fileformat=VCFv4.1 +##FORMAT= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT one two +1 100 . C A . . . GT 0 1 +1 101 . T G . . . GT 0/1 1/1 +1 102 . A G . . . GT 1|0 1/1|0