Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions src/bam/record.rs
Original file line number Diff line number Diff line change
Expand Up @@ -358,9 +358,9 @@ impl Record {

let orig_aux_offset = self.qname_capacity()
+ 4 * self.cigar_len()
+ (self.seq_len() + 1) / 2
+ self.seq_len().div_ceil(2)
+ self.seq_len();
let new_aux_offset = q_len + extranul + cigar_width + (seq.len() + 1) / 2 + qual.len();
let new_aux_offset = q_len + extranul + cigar_width + seq.len().div_ceil(2) + qual.len();
assert!(orig_aux_offset <= self.inner.l_data as usize);
let aux_len = self.inner.l_data as usize - orig_aux_offset;
self.inner_mut().l_data = (new_aux_offset + aux_len) as i32;
Expand Down Expand Up @@ -416,7 +416,7 @@ impl Record {
});
}
self.inner_mut().core.l_qseq = seq.len() as i32;
i += (seq.len() + 1) / 2;
i += seq.len().div_ceil(2);
}

// qual
Expand Down Expand Up @@ -564,7 +564,7 @@ impl Record {

fn seq_data(&self) -> &[u8] {
let offset = self.qname_capacity() + self.cigar_len() * 4;
&self.data()[offset..][..(self.seq_len() + 1) / 2]
&self.data()[offset..][..self.seq_len().div_ceil(2)]
}

/// Get read sequence. Complexity: O(1).
Expand All @@ -579,7 +579,7 @@ impl Record {
/// This does not entail any offsets, hence the qualities can be used directly without
/// e.g. subtracting 33. Complexity: O(1).
pub fn qual(&self) -> &[u8] {
&self.data()[self.qname_capacity() + self.cigar_len() * 4 + (self.seq_len() + 1) / 2..]
&self.data()[self.qname_capacity() + self.cigar_len() * 4 + self.seq_len().div_ceil(2)..]
[..self.seq_len()]
}

Expand Down Expand Up @@ -787,7 +787,7 @@ impl Record {
// CIGAR (uint32_t):
+ self.cigar_len() * std::mem::size_of::<u32>()
// Read sequence (4-bit encoded):
+ (self.seq_len() + 1) / 2
+ self.seq_len().div_ceil(2)
// Base qualities (char):
+ self.seq_len()..],
}
Expand Down
129 changes: 127 additions & 2 deletions src/bcf/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -318,7 +318,7 @@ impl IndexedReader {
/// # Arguments
///
/// * `rid` - numeric ID of the reference to jump to; use `HeaderView::name2rid` for resolving
/// contig name to ID.
/// contig name to ID.
/// * `start` - `0`-based **inclusive** start coordinate of region on reference.
/// * `end` - Optional `0`-based **inclusive** end coordinate of region on reference. If `None`
/// is given, records are fetched from `start` until the end of the contig.
Expand Down Expand Up @@ -605,7 +605,7 @@ pub mod synced {
/// # Arguments
///
/// * `rid` - numeric ID of the reference to jump to; use `HeaderView::name2rid` for resolving
/// contig name to ID.
/// contig name to ID.
/// * `start` - `0`-based start coordinate of region on reference.
/// * `end` - `0`-based end coordinate of region on reference.
pub fn fetch(&mut self, rid: u32, start: u64, end: u64) -> Result<()> {
Expand Down Expand Up @@ -835,6 +835,8 @@ fn bcf_open(target: &[u8], mode: &[u8]) -> Result<*mut htslib::htsFile> {

#[cfg(test)]
mod tests {
use tempfile::NamedTempFile;

use super::record::Buffer;
use super::*;
use crate::bcf::header::Id;
Expand Down Expand Up @@ -1090,6 +1092,129 @@ mod tests {
}
}

#[test]
fn test_genotypes_read_mixed_ploidy() {
let mut vcf = Reader::from_path("test/test_non_diploid.vcf").expect("Error opening file.");

// Expected genotypes for comparison
let expected = [vec!["0", "1"], vec!["0/1", "1/1"], vec!["1|0", "1/1|0"]];

for (rec, exp_gts) in vcf.records().zip(expected.iter()) {
let rec = rec.expect("Error reading record.");

// Get the genotypes from the record
let genotypes = rec.genotypes().expect("Error reading genotypes");

// Compare each genotype with the expected value
for (sample, exp_gt) in exp_gts.iter().enumerate() {
assert_eq!(&format!("{}", genotypes.get(sample)), exp_gt);
}
}
}

#[test]
fn test_genotypes_write_and_read_mixed_ploidy() {
let mut vcf = Reader::from_path("test/test_non_diploid.vcf").expect("Error opening file.");

// Create a temporary file to write the modified VCF data
let tmp = NamedTempFile::new().unwrap();
let path = tmp.path();

{
// Create a VCF writer with the same header as the input VCF
let mut writer = Writer::from_path(
path,
&Header::from_template(vcf.header()),
true,
Format::Vcf,
)
.unwrap();

// Modify record template by adding different genotypes and write the to the temp file.
let mut rec_tpl = vcf.records().next().unwrap().unwrap();
rec_tpl
.push_genotype_structured(
&[
vec![GenotypeAllele::Unphased(0)],
vec![GenotypeAllele::Unphased(1)],
],
3,
)
.unwrap();
writer.write(&rec_tpl).unwrap();
rec_tpl
.push_genotype_structured(
&[
vec![GenotypeAllele::Unphased(0), GenotypeAllele::Unphased(1)],
vec![GenotypeAllele::Unphased(1), GenotypeAllele::Unphased(1)],
],
3,
)
.unwrap();
writer.write(&rec_tpl).unwrap();
rec_tpl
.push_genotype_structured(
&[
vec![GenotypeAllele::Unphased(1), GenotypeAllele::Phased(0)],
vec![
GenotypeAllele::Unphased(1),
GenotypeAllele::Unphased(1),
GenotypeAllele::Phased(0),
],
],
3,
)
.unwrap();
writer.write(&rec_tpl).unwrap();
}

// Read back the temporary file with the modified VCF data
let mut reader = Reader::from_path(path).unwrap();

// Expected genotypes for validation
let expected = [vec!["0", "1"], vec!["0/1", "1/1"], vec!["1|0", "1/1|0"]];

// Iterate over the records in the temporary file and validate the genotypes
for (rec, exp_gts) in reader.records().zip(expected.iter()) {
let rec = rec.expect("Error reading record");
let genotypes = rec.genotypes().expect("Error reading genotypes");

// Compare each genotype with the expected value
for (sample, exp_gt) in exp_gts.iter().enumerate() {
assert_eq!(&format!("{}", genotypes.get(sample)), exp_gt);
}
}
}

#[test]
fn test_genotypes_wrong_max_ploidy() {
let mut vcf = Reader::from_path("test/test_non_diploid.vcf").expect("Error opening file.");

// Modify record template by adding different genotypes and write the to the temp file.
let mut rec_tpl = vcf.records().next().unwrap().unwrap();
let err = rec_tpl
.push_genotype_structured(
&[
vec![
GenotypeAllele::Unphased(0),
GenotypeAllele::Unphased(1),
GenotypeAllele::Unphased(0),
],
vec![
GenotypeAllele::Unphased(1),
GenotypeAllele::Unphased(0),
GenotypeAllele::Unphased(1),
GenotypeAllele::Unphased(0),
],
],
3,
)
.expect_err(
"This should fail since there are more alleles specified (4 for second sample) than max_ploidy (3) suggests",
);
assert_eq!(err, crate::errors::Error::BcfSetValues);
}

#[test]
fn test_header_ids() {
let vcf = Reader::from_path("test/test_string.vcf").expect("Error opening file.");
Expand Down
91 changes: 83 additions & 8 deletions src/bcf/record.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
// except according to those terms.

use std::borrow::{Borrow, BorrowMut};
use std::ffi;
use std::fmt;
use std::marker::PhantomData;
use std::ops::Deref;
Expand All @@ -13,6 +12,7 @@ use std::ptr;
use std::rc::Rc;
use std::slice;
use std::str;
use std::{ffi, iter};

use bio_types::genome;
use derive_new::new;
Expand Down Expand Up @@ -659,7 +659,7 @@ impl Record {
/// # Arguments
///
/// - `genotypes` - a flattened, two-dimensional array of GenotypeAllele,
/// the first dimension contains one array for each sample.
/// the first dimension contains one array for each sample.
///
/// # Errors
///
Expand Down Expand Up @@ -692,6 +692,76 @@ impl Record {
self.push_format_integer(b"GT", &encoded)
}

/// Add/replace genotypes in FORMAT GT tag by providing a list of genotypes.
///
/// # Arguments
///
/// - `genotypes` - a two-dimensional array of GenotypeAllele
/// - `max_ploidy` - the maximum number of alleles allowed for any genotype on any sample.
///
/// # Errors
///
/// Returns an error if any genotype has more allelles than `max_ploidy` or if the GT tag is not present in the header.
///
/// # Example
///
/// Example assumes we have a Record `record` from a VCF with a `GT` `FORMAT` tag and three samples.
/// See [module documentation](../index.html#example-writing) for how to set up
/// VCF, header, and record.
///
/// ```
/// # use rust_htslib::bcf::{Format, Writer};
/// # use rust_htslib::bcf::header::Header;
/// # use rust_htslib::bcf::record::GenotypeAllele;
/// # use std::iter;
/// # let mut header = Header::new();
/// # let header_contig_line = r#"##contig=<ID=1,length=10>"#;
/// # header.push_record(header_contig_line.as_bytes());
/// # let header_gt_line = r#"##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">"#;
/// # header.push_record(header_gt_line.as_bytes());
/// # header.push_sample("first_sample".as_bytes());
/// # header.push_sample("second_sample".as_bytes());
/// # header.push_sample("third_sample".as_bytes());
/// # let mut vcf = Writer::from_stdout(&header, true, Format::Vcf)?;
/// # let mut record = vcf.empty_record();
/// let alleles = vec![
/// vec![GenotypeAllele::Unphased(1), GenotypeAllele::Unphased(1)],
/// vec![GenotypeAllele::Unphased(0), GenotypeAllele::Phased(1)],
/// vec![GenotypeAllele::Unphased(0)],
/// ];
/// record.push_genotype_structured(&alleles, 2);
/// let gts = record.genotypes()?;
/// assert_eq!("1/1", &format!("{}", gts.get(0)));
/// assert_eq!("0|1", &format!("{}", gts.get(1)));
/// assert_eq!("0", &format!("{}", gts.get(2)));
/// # Ok::<(), rust_htslib::errors::Error>(())
/// ```
pub fn push_genotype_structured<GT>(
&mut self,
genotypes: &[GT],
max_ploidy: usize,
) -> Result<()>
where
GT: AsRef<[GenotypeAllele]>,
{
let mut data = Vec::with_capacity(max_ploidy * genotypes.len());
for gt in genotypes {
if gt.as_ref().len() > max_ploidy {
return Err(Error::BcfSetValues);
}
data.extend(
gt.as_ref()
.iter()
.map(|gta| i32::from(*gta))
.chain(iter::repeat_n(
VECTOR_END_INTEGER,
max_ploidy - gt.as_ref().len(),
)),
);
}
self.push_format_integer(b"GT", &data)
}

/// Get genotypes as vector of one `Genotype` per sample.
///
/// # Example
Expand Down Expand Up @@ -771,7 +841,7 @@ impl Record {
///
/// - `tag` - The tag's string.
/// - `data` - a flattened, two-dimensional array, the first dimension contains one array
/// for each sample.
/// for each sample.
///
/// # Errors
///
Expand All @@ -786,7 +856,7 @@ impl Record {
///
/// - `tag` - The tag's string.
/// - `data` - a flattened, two-dimensional array, the first dimension contains one array
/// for each sample.
/// for each sample.
///
/// # Errors
///
Expand Down Expand Up @@ -823,7 +893,7 @@ impl Record {
///
/// - `tag` - The tag's string.
/// - `data` - a flattened, two-dimensional array, the first dimension contains one array
/// for each sample.
/// for each sample.
///
/// # Errors
///
Expand Down Expand Up @@ -864,7 +934,7 @@ impl Record {
///
/// - `tag` - The tag's string.
/// - `data` - a two-dimensional array, the first dimension contains one array
/// for each sample. Must be non-empty.
/// for each sample. Must be non-empty.
///
/// # Errors
///
Expand Down Expand Up @@ -1249,14 +1319,19 @@ where
}

impl<'a, B: Borrow<Buffer> + 'a> Genotypes<'a, B> {
/// Get genotype of ith sample. So far, only supports diploid genotypes.
/// Get genotype of ith sample.
///
/// Note that the result complies with the BCF spec. This means that the
/// first allele will always be marked as `Unphased`. That is, if you have 1|1 in the VCF,
/// this method will return `[Unphased(1), Phased(1)]`.
pub fn get(&self, i: usize) -> Genotype {
let igt = self.encoded[i];
Genotype(igt.iter().map(|&e| GenotypeAllele::from(e)).collect())
let allelles = igt
.iter()
.take_while(|&&i| i != VECTOR_END_INTEGER)
.map(|&i| GenotypeAllele::from(i))
.collect();
Genotype(allelles)
}
}

Expand Down
Loading
Loading