From c9d22bc268b87f087ea5b883a710e6a02166c6e2 Mon Sep 17 00:00:00 2001 From: SeppeDeWinter Date: Fri, 7 Mar 2025 16:02:38 +0100 Subject: [PATCH 1/5] Pseudobulk now does not need to resort fragment files. --- pycistopic-lib/Cargo.lock | 241 +++++++++++++++++++++++++ pycistopic-lib/Cargo.toml | 20 +++ pycistopic-lib/src/custom_errors.rs | 58 ++++++ pycistopic-lib/src/lib.rs | 14 ++ pycistopic-lib/src/pseudobulk.rs | 264 ++++++++++++++++++++++++++++ 5 files changed, 597 insertions(+) create mode 100644 pycistopic-lib/Cargo.lock create mode 100644 pycistopic-lib/Cargo.toml create mode 100644 pycistopic-lib/src/custom_errors.rs create mode 100644 pycistopic-lib/src/lib.rs create mode 100644 pycistopic-lib/src/pseudobulk.rs diff --git a/pycistopic-lib/Cargo.lock b/pycistopic-lib/Cargo.lock new file mode 100644 index 0000000..68b46be --- /dev/null +++ b/pycistopic-lib/Cargo.lock @@ -0,0 +1,241 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" + +[[package]] +name = "autocfg" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + +[[package]] +name = "cc" +version = "1.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c3d1b2e905a3a7b00a6141adb0e4c0bb941d11caf55349d863942a1cc44e3c9" +dependencies = [ + "shlex", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "cmake" +version = "0.1.54" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7caa3f9de89ddbe2c607f4101924c5abec803763ae9534e4f4d7d8f84aa81f0" +dependencies = [ + "cc", +] + +[[package]] +name = "crc32fast" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "flate2" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11faaf5a5236997af9848be0bef4db95824b1d534ebc64d0f0c6cf3e67bd38dc" +dependencies = [ + "crc32fast", + "libz-ng-sys", + "miniz_oxide", +] + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "indoc" +version = "2.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd" + +[[package]] +name = "libc" +version = "0.2.170" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "875b3680cb2f8f71bdcf9a30f38d48282f5d3c95cbf9b3fa57269bb5d5c06828" + +[[package]] +name = "libz-ng-sys" +version = "1.1.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7cee1488e961a80d172564fd6fcda11d8a4ac6672c06fe008e9213fa60520c2b" +dependencies = [ + "cmake", + "libc", +] + +[[package]] +name = "memoffset" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" +dependencies = [ + "autocfg", +] + +[[package]] +name = "miniz_oxide" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e3e04debbb59698c15bacbb6d93584a8c0ca9cc3213cb423d31f760d8843ce5" +dependencies = [ + "adler2", +] + +[[package]] +name = "once_cell" +version = "1.20.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "945462a4b81e43c4e3ba96bd7b49d834c6f61198356aa858733bc4acf3cbe62e" + +[[package]] +name = "portable-atomic" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "350e9b48cbc6b0e028b0473b114454c6316e57336ee184ceab6e53f72c178b3e" + +[[package]] +name = "proc-macro2" +version = "1.0.94" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a31971752e70b8b2686d7e46ec17fb38dad4051d94024c88df49b667caea9c84" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "pycistopic-lib" +version = "0.1.0" +dependencies = [ + "flate2", + "pyo3", +] + +[[package]] +name = "pyo3" +version = "0.23.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7778bffd85cf38175ac1f545509665d0b9b92a198ca7941f131f85f7a4f9a872" +dependencies = [ + "cfg-if", + "indoc", + "libc", + "memoffset", + "once_cell", + "portable-atomic", + "pyo3-build-config", + "pyo3-ffi", + "pyo3-macros", + "unindent", +] + +[[package]] +name = "pyo3-build-config" +version = "0.23.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94f6cbe86ef3bf18998d9df6e0f3fc1050a8c5efa409bf712e661a4366e010fb" +dependencies = [ + "once_cell", + "target-lexicon", +] + +[[package]] +name = "pyo3-ffi" +version = "0.23.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9f1b4c431c0bb1c8fb0a338709859eed0d030ff6daa34368d3b152a63dfdd8d" +dependencies = [ + "libc", + "pyo3-build-config", +] + +[[package]] +name = "pyo3-macros" +version = "0.23.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbc2201328f63c4710f68abdf653c89d8dbc2858b88c5d88b0ff38a75288a9da" +dependencies = [ + "proc-macro2", + "pyo3-macros-backend", + "quote", + "syn", +] + +[[package]] +name = "pyo3-macros-backend" +version = "0.23.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fca6726ad0f3da9c9de093d6f116a93c1a38e417ed73bf138472cf4064f72028" +dependencies = [ + "heck", + "proc-macro2", + "pyo3-build-config", + "quote", + "syn", +] + +[[package]] +name = "quote" +version = "1.0.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1f1914ce909e1658d9907913b4b91947430c7d9be598b15a1912935b8c04801" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "syn" +version = "2.0.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e02e925281e18ffd9d640e234264753c43edc62d64b2d4cf898f1bc5e75f3fc2" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "target-lexicon" +version = "0.12.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" + +[[package]] +name = "unicode-ident" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" + +[[package]] +name = "unindent" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" diff --git a/pycistopic-lib/Cargo.toml b/pycistopic-lib/Cargo.toml new file mode 100644 index 0000000..a9186bf --- /dev/null +++ b/pycistopic-lib/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "pycistopic-lib" +version = "0.1.0" +edition = "2021" + +[lib] +# The name of the native library. This is the name which will be used in Python to import the +# library (i.e. `import string_sum`). If you change this, you must also change the name of the +# `#[pymodule]` in `src/lib.rs`. +name = "pycistopic_lib" +# "cdylib" is necessary to produce a shared library for Python to import from. +# +# Downstream Rust code (including code in `bin/`, `examples/`, and `tests/`) will not be able +# to `use string_sum;` unless the "rlib" or "lib" crate type is also included, e.g.: +# crate-type = ["cdylib", "rlib"] +crate-type = ["cdylib"] + +[dependencies] +pyo3 = { version = "0.23.5", features = ["extension-module"] } +flate2 = {version = "1.1.0",features = ["zlib-ng"], default-features = false} diff --git a/pycistopic-lib/src/custom_errors.rs b/pycistopic-lib/src/custom_errors.rs new file mode 100644 index 0000000..1653eb7 --- /dev/null +++ b/pycistopic-lib/src/custom_errors.rs @@ -0,0 +1,58 @@ +use pyo3::exceptions::{PyIOError, PyValueError}; +use std::fmt; +use pyo3::PyErr; + +#[derive(Debug)] +pub struct InvalidFragmentFileError { + pub filename: String, +} + +impl InvalidFragmentFileError { + pub fn new(filename: &str) -> InvalidFragmentFileError { + InvalidFragmentFileError{ + filename: filename.to_string() + } + } +} + +impl std::error::Error for InvalidFragmentFileError{} + +impl fmt::Display for InvalidFragmentFileError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}: Invalid fragment file.", self.filename) + } +} + +impl From for PyErr { + fn from(err: InvalidFragmentFileError) -> PyErr { + PyIOError::new_err(err.to_string()) + } +} + +#[derive(Debug)] +pub struct ValueError { + pub message: String, +} + +impl ValueError { + pub fn new(message: String) -> ValueError { + ValueError { + message + } + } +} + +impl std::error::Error for ValueError{} + +impl fmt::Display for ValueError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.message) + } +} + +impl From for PyErr { + fn from(err: ValueError) -> PyErr { + PyValueError::new_err(err.to_string()) + } +} + diff --git a/pycistopic-lib/src/lib.rs b/pycistopic-lib/src/lib.rs new file mode 100644 index 0000000..ee315e2 --- /dev/null +++ b/pycistopic-lib/src/lib.rs @@ -0,0 +1,14 @@ +mod pseudobulk; +mod custom_errors; + +use pyo3::prelude::*; + + +/// A Python module implemented in Rust. The name of this function must match +/// the `lib.name` setting in the `Cargo.toml`, else Python will not be able to +/// import the module. +#[pymodule] +fn pycistopic_lib(m: &Bound<'_, PyModule>) -> PyResult<()> { + m.add_function(wrap_pyfunction!(pseudobulk::split_fragment_files_by_cell_type, m)?)?; + Ok(()) +} diff --git a/pycistopic-lib/src/pseudobulk.rs b/pycistopic-lib/src/pseudobulk.rs new file mode 100644 index 0000000..9b74486 --- /dev/null +++ b/pycistopic-lib/src/pseudobulk.rs @@ -0,0 +1,264 @@ +use super::custom_errors; +use std::fmt; +use std::fs::File; +use std::path::Path; +use std::collections::BinaryHeap; +use std::collections::HashMap; +use std::cmp::Reverse; +use std::io::{BufRead, Write}; +use flate2::Compression; +use pyo3::prelude::*; +use flate2::read::MultiGzDecoder; +use flate2::write::GzEncoder; +use std::thread; + + +#[derive(Eq, PartialEq)] +struct GenomicRange { + chromosome: String, + start: usize, + end: usize, + cell_barcode: String, + score: Option, + file_index: usize, + file_name: String +} + +impl Ord for GenomicRange { + fn cmp(&self, other: &GenomicRange) -> std::cmp::Ordering { + self.start.cmp(&other.start) + .then(self.end.cmp(&other.end)) + } +} + +impl PartialOrd for GenomicRange { + fn partial_cmp(&self, other: &GenomicRange) -> Option { + Some(self.cmp(other)) + } +} + +impl GenomicRange { + fn new(line: String, file_index: usize, filename: &str) -> Result { + let fields: Vec<&str> = line.split('\t').collect(); + if fields.len() < 4 { + return Err(custom_errors::InvalidFragmentFileError::new(filename)); + } + Ok(GenomicRange{ + chromosome: fields[0].to_string(), + start: fields[1].parse::() + .map_err(|_| custom_errors::InvalidFragmentFileError::new(filename))?, + end: fields[2].parse::() + .map_err(|_| custom_errors::InvalidFragmentFileError::new(filename))?, + cell_barcode: fields[3].to_string(), + score: if fields.len() > 4 { + Some(fields[4].parse::() + .map_err(|_| custom_errors::InvalidFragmentFileError::new(filename))? + ) + } else {None}, + file_index, + file_name: filename.to_string() + }) + } +} + +impl fmt::Display for GenomicRange { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self.score { + Some(score) => write!( + f, + "{}\t{}\t{}\t{}\t{}", + self.chromosome, self.start, self.end, self.cell_barcode, score + ), + None => write!( + f, + "{}\t{}\t{}\t{}", + self.chromosome, self.start, self.end, self.cell_barcode + ), + } + } +} + + +fn split_fragments_by_cell_barcodes_for_chromosome( + fragment_file_paths: &[&str], + fragment_file_to_cell_barcode: &HashMap>, + chromosome: &str, + gz_output_file: &mut GzEncoder +) -> PyResult<()>{ + + // Open fragment files which are gzipped, and pos-sorted. + let mut readers: Vec<_> = fragment_file_paths + .iter() + .map(|&path| { + let file = File::open(Path::new(path))?; + let d_file = MultiGzDecoder::new(file); + Ok(std::io::BufReader::new(d_file)) + }) + .collect::>()?; + + // A binary heap will be used to write fragments in order from different files + let mut heap = BinaryHeap::new(); + + let mut line = "#".to_string(); + let mut bytes_read; + + // Push the first fragment from each file to the heap + for ( + fragment_file, + (index, reader) + ) in fragment_file_paths.iter().zip(readers.iter_mut().enumerate()) { + // skip header lines + while line.starts_with("#") { + line.clear(); + reader.read_line(&mut line)?; + line = line.trim().to_string(); + } + // Loop until a fragment with cell barcode, on the correct chromosome, in fragment_file_to_cell_barcode is found + let mut fragment_found = false; + // is the barcode found for the first (non-header) line? + let fragment = GenomicRange::new( + line.to_string(), index, fragment_file + )?; + match fragment_file_to_cell_barcode.get(&fragment_file.to_string()) { + Some(cell_barcodes) => { + if cell_barcodes.contains(&fragment.cell_barcode) && fragment.chromosome == chromosome { + // Reverse so that "smaller" fragments (i.e. lower genomic location + // are written first later on (heap will pop large elements first). + heap.push(Reverse(fragment)); + fragment_found = true; + } + }, + None => { + return Err( + custom_errors::ValueError::new( + format!( + "fragment_file_to_cell_barcode does not contain entry for {}", + fragment_file) + ).into()); + } + } + + // barcode not in first line. + // keep reading until a fragment with correct barcode, on correct chromosome, is found. + while !fragment_found { + line.clear(); + bytes_read = reader.read_line(&mut line)?; + if bytes_read == 0 { + // end of file + break; + } + line = line.trim().to_string(); + let fragment = GenomicRange::new( + line.to_string(), index, fragment_file + )?; + match fragment_file_to_cell_barcode.get(&fragment_file.to_string()) { + Some(cell_barcodes) => { + if cell_barcodes.contains(&fragment.cell_barcode) && fragment.chromosome == chromosome { + // Reverse so that "smaller" fragments (i.e. lower genomic location + // are written first later on (heap will pop large elements first). + heap.push(Reverse(fragment)); + fragment_found = true; + } + }, + None => { + return Err( + custom_errors::ValueError::new( + format!( + "fragment_file_to_cell_barcode does not contain entry for {}", + fragment_file) + ).into()); + } + } + } + } + + while let Some(Reverse(fragment)) = heap.pop() { + gz_output_file.write_all(format!("{}\n", fragment).as_bytes())?; + // Loop until a fragment with cell barcode in fragment_file_to_cell_barcode is found + let mut fragment_found = false; + while !fragment_found { + // Read next range from file that had the smallest range and add this to the heap. + line.clear(); + bytes_read = readers[fragment.file_index].read_line(&mut line)?; + if bytes_read == 0 { + // end of file + break; + } + line = line.trim().to_string(); + let next_fragment = GenomicRange::new( + line.to_string(), fragment.file_index, &fragment.file_name + )?; + // Assuming that the fragment files are sorted. + // Using the previous while loop, for each file, we should be at the correct location of the file + // (i.e. where the current chromosomes are located). + // if the next fragment file has a different chromosome, we should be done with this file and we can skip it. + if next_fragment.chromosome != chromosome { + break; + } + match fragment_file_to_cell_barcode.get(&next_fragment.file_name) { + Some(cell_barcodes) => { + if cell_barcodes.contains(&next_fragment.cell_barcode) { + heap.push(Reverse(next_fragment)); + fragment_found = true; + } + }, + None => { + return Err( + custom_errors::ValueError::new( + format!( + "fragment_file_to_cell_barcode does not contain entry for {}", + next_fragment.file_name) + ).into() + ); + } + } + } + } + Ok(()) +} + +#[pyfunction] +pub fn split_fragment_files_by_cell_type( + fragment_file_paths: Vec, + output_file_prefix: &str, + cell_type_to_fragment_file_to_cell_barcode: HashMap>>, + chromosomes: Vec +) -> PyResult<()> { + for cell_type in cell_type_to_fragment_file_to_cell_barcode.keys() { + let mut handles: Vec> = Vec::new(); + for chromosome in &chromosomes { + let output_file_name = format!("{}_{}.{}.tsv.gz", output_file_prefix, cell_type, chromosome); + let fragment_file_paths = fragment_file_paths.clone(); // Need to clone since threads take ownership + let fragment_file_to_cell_barcode = cell_type_to_fragment_file_to_cell_barcode + .get(cell_type) + .unwrap() + .clone(); + let file = File::create(output_file_name)?; + let chromosome = chromosome.clone(); + let handle = thread::spawn(move || { + let mut gz_output_file = GzEncoder::new(file, Compression::default()); + split_fragments_by_cell_barcodes_for_chromosome( + &fragment_file_paths.iter().map(|p| p.as_str()).collect::>(), + &fragment_file_to_cell_barcode, + &chromosome, + &mut gz_output_file + ) + }); + handles.push(handle); + } + for handle in handles { + handle.join().expect("Thread panicked")?; + } + // concat all chromosomes + let output_file_name = format!("{}_{}.tsv.gz", output_file_prefix, cell_type); + let output_file = File::create(&output_file_name)?; + let mut writer = std::io::BufWriter::new(output_file); + for chromosome in &chromosomes { + let input_file_name = format!("{}_{}.{}.tsv.gz", output_file_prefix, cell_type, chromosome); + let mut input_file = File::open(&input_file_name)?; + std::io::copy(&mut input_file, &mut writer)?; + } + writer.flush()?; + } + Ok(()) +} From a8fc34bf320492b5811babd162f79233f1b5f22d Mon Sep 17 00:00:00 2001 From: SeppeDeWinter Date: Fri, 7 Mar 2025 23:31:51 +0100 Subject: [PATCH 2/5] Refactor. --- pycistopic-lib/src/pseudobulk.rs | 238 ++++++++++++++++--------------- 1 file changed, 122 insertions(+), 116 deletions(-) diff --git a/pycistopic-lib/src/pseudobulk.rs b/pycistopic-lib/src/pseudobulk.rs index 9b74486..59c6892 100644 --- a/pycistopic-lib/src/pseudobulk.rs +++ b/pycistopic-lib/src/pseudobulk.rs @@ -13,7 +13,7 @@ use flate2::write::GzEncoder; use std::thread; -#[derive(Eq, PartialEq)] +#[derive(Eq, PartialEq, Clone)] struct GenomicRange { chromosome: String, start: usize, @@ -38,7 +38,7 @@ impl PartialOrd for GenomicRange { } impl GenomicRange { - fn new(line: String, file_index: usize, filename: &str) -> Result { + fn new(line: &str, file_index: usize, filename: &str) -> Result { let fields: Vec<&str> = line.split('\t').collect(); if fields.len() < 4 { return Err(custom_errors::InvalidFragmentFileError::new(filename)); @@ -79,6 +79,102 @@ impl fmt::Display for GenomicRange { } +struct FragmentFileReader { + reader: std::io::BufReader>, + fragment: Result, + buffer: String, + valid_cell_barcodes: Vec, + target_chrom: String, + file_index: usize, + file_name: String, + at_end_of_file: bool, +} + +impl FragmentFileReader { + fn new(fragment_file_path: &str, valid_cell_barcodes: Vec, target_chrom: String, file_index: usize) -> Result { + let file = File::open(Path::new(fragment_file_path))?; + let d_file = MultiGzDecoder::new(file); + let reader = std::io::BufReader::new(d_file); + Ok(FragmentFileReader{ + reader, + fragment: GenomicRange::new("", file_index, fragment_file_path), + buffer: String::new(), + valid_cell_barcodes, + target_chrom, + file_index, + file_name: fragment_file_path.to_string(), + at_end_of_file: false + }) + } + + fn at_chrom(&self) -> bool { + if let Ok(fragment) = &self.fragment{ + fragment.chromosome == self.target_chrom + } else { + false + } + } + + fn read_next(&mut self) -> Result<(), custom_errors::InvalidFragmentFileError>{ + self.buffer.clear(); + match self.reader.read_line(&mut self.buffer) { + Ok(bytes_read) => { + if bytes_read == 0 { + self.at_end_of_file = true + } + }, + Err(_) => { + return Err(custom_errors::InvalidFragmentFileError::new(&self.file_name)) + } + } + self.buffer = self.buffer.trim().to_string(); + self.fragment = GenomicRange::new(&self.buffer, self.file_index, &self.file_name); + Ok(()) + } + + fn skip_header(&mut self, pattern: &str) -> Result<(), custom_errors::InvalidFragmentFileError> { + self.buffer = pattern.to_string(); + while self.buffer.starts_with(pattern) && !self.at_end_of_file { + self.read_next()?; + } + Ok(()) + } + + fn skip_to_chromosome(&mut self, chromosome: &str) -> Result<(), custom_errors::InvalidFragmentFileError> { + while !self.at_end_of_file { + if let Ok(fragment) = &self.fragment { + if fragment.chromosome == chromosome { + return Ok(()); + } + }; + self.read_next()?; + } + Ok(()) + } + + fn get_next_valid_fragment(&mut self) -> Result, custom_errors::InvalidFragmentFileError> { + while !self.at_end_of_file { + // first check current fragment + // it could be that after skip_header and skip_to_chromosome we are already at a + // valid fragment. + if let Ok(fragment) = &self.fragment { + if self.valid_cell_barcodes.contains(&fragment.cell_barcode) { + let fragment = fragment.clone(); + if fragment.chromosome != self.target_chrom { + return Ok(None) + } + // read next fragment so next time we enter this function we have a new frag,et + self.read_next()?; + return Ok(Some(fragment)); + } + } + self.read_next()?; + } + Ok(None) + } +} + + fn split_fragments_by_cell_barcodes_for_chromosome( fragment_file_paths: &[&str], fragment_file_to_cell_barcode: &HashMap>, @@ -87,130 +183,40 @@ fn split_fragments_by_cell_barcodes_for_chromosome( ) -> PyResult<()>{ // Open fragment files which are gzipped, and pos-sorted. - let mut readers: Vec<_> = fragment_file_paths - .iter() - .map(|&path| { - let file = File::open(Path::new(path))?; - let d_file = MultiGzDecoder::new(file); - Ok(std::io::BufReader::new(d_file)) - }) - .collect::>()?; + let mut readers: Vec = Vec::new(); + for (file_index, fragment_file_path) in fragment_file_paths.iter().enumerate() { + if let Some(cell_barcodes) = fragment_file_to_cell_barcode.get(&fragment_file_path.to_string()) { + readers.push( + FragmentFileReader::new( + fragment_file_path, + cell_barcodes.to_vec(), + chromosome.to_string(), + file_index)? + ); + } + } // A binary heap will be used to write fragments in order from different files let mut heap = BinaryHeap::new(); - let mut line = "#".to_string(); - let mut bytes_read; - - // Push the first fragment from each file to the heap - for ( - fragment_file, - (index, reader) - ) in fragment_file_paths.iter().zip(readers.iter_mut().enumerate()) { - // skip header lines - while line.starts_with("#") { - line.clear(); - reader.read_line(&mut line)?; - line = line.trim().to_string(); - } - // Loop until a fragment with cell barcode, on the correct chromosome, in fragment_file_to_cell_barcode is found - let mut fragment_found = false; - // is the barcode found for the first (non-header) line? - let fragment = GenomicRange::new( - line.to_string(), index, fragment_file - )?; - match fragment_file_to_cell_barcode.get(&fragment_file.to_string()) { - Some(cell_barcodes) => { - if cell_barcodes.contains(&fragment.cell_barcode) && fragment.chromosome == chromosome { - // Reverse so that "smaller" fragments (i.e. lower genomic location - // are written first later on (heap will pop large elements first). - heap.push(Reverse(fragment)); - fragment_found = true; - } - }, - None => { - return Err( - custom_errors::ValueError::new( - format!( - "fragment_file_to_cell_barcode does not contain entry for {}", - fragment_file) - ).into()); - } - } - - // barcode not in first line. - // keep reading until a fragment with correct barcode, on correct chromosome, is found. - while !fragment_found { - line.clear(); - bytes_read = reader.read_line(&mut line)?; - if bytes_read == 0 { - // end of file - break; - } - line = line.trim().to_string(); - let fragment = GenomicRange::new( - line.to_string(), index, fragment_file - )?; - match fragment_file_to_cell_barcode.get(&fragment_file.to_string()) { - Some(cell_barcodes) => { - if cell_barcodes.contains(&fragment.cell_barcode) && fragment.chromosome == chromosome { - // Reverse so that "smaller" fragments (i.e. lower genomic location - // are written first later on (heap will pop large elements first). - heap.push(Reverse(fragment)); - fragment_found = true; - } - }, - None => { - return Err( - custom_errors::ValueError::new( - format!( - "fragment_file_to_cell_barcode does not contain entry for {}", - fragment_file) - ).into()); - } + // skip header lines, go to correct chromosome, and push first valid fragment to binary heap + for reader in readers.iter_mut(){ + reader.skip_header("#")?; + reader.skip_to_chromosome(chromosome)?; + if !reader.at_end_of_file && reader.at_chrom() { + if let Some(fragment) = reader.get_next_valid_fragment()? { + heap.push(Reverse(fragment)); } } } while let Some(Reverse(fragment)) = heap.pop() { gz_output_file.write_all(format!("{}\n", fragment).as_bytes())?; - // Loop until a fragment with cell barcode in fragment_file_to_cell_barcode is found - let mut fragment_found = false; - while !fragment_found { - // Read next range from file that had the smallest range and add this to the heap. - line.clear(); - bytes_read = readers[fragment.file_index].read_line(&mut line)?; - if bytes_read == 0 { - // end of file - break; - } - line = line.trim().to_string(); - let next_fragment = GenomicRange::new( - line.to_string(), fragment.file_index, &fragment.file_name - )?; - // Assuming that the fragment files are sorted. - // Using the previous while loop, for each file, we should be at the correct location of the file - // (i.e. where the current chromosomes are located). - // if the next fragment file has a different chromosome, we should be done with this file and we can skip it. - if next_fragment.chromosome != chromosome { - break; - } - match fragment_file_to_cell_barcode.get(&next_fragment.file_name) { - Some(cell_barcodes) => { - if cell_barcodes.contains(&next_fragment.cell_barcode) { - heap.push(Reverse(next_fragment)); - fragment_found = true; - } - }, - None => { - return Err( - custom_errors::ValueError::new( - format!( - "fragment_file_to_cell_barcode does not contain entry for {}", - next_fragment.file_name) - ).into() - ); - } + // read from file that currently has the smallest genomic range + let reader = &mut readers[fragment.file_index]; + if !reader.at_end_of_file && reader.at_chrom() { + if let Some(fragment) = reader.get_next_valid_fragment()? { + heap.push(Reverse(fragment)); } } } From de9c483ebc1209a2428bd760e6bf31d6a7e73ad4 Mon Sep 17 00:00:00 2001 From: SeppeDeWinter Date: Sat, 8 Mar 2025 19:59:19 +0100 Subject: [PATCH 3/5] Make use of tabix file to seek to chromosome position. --- pycistopic-lib/Cargo.lock | 164 ++++++++++++++++++++++++++----- pycistopic-lib/Cargo.toml | 2 +- pycistopic-lib/src/pseudobulk.rs | 42 +++++--- 3 files changed, 172 insertions(+), 36 deletions(-) diff --git a/pycistopic-lib/Cargo.lock b/pycistopic-lib/Cargo.lock index 68b46be..17eeb5f 100644 --- a/pycistopic-lib/Cargo.lock +++ b/pycistopic-lib/Cargo.lock @@ -15,14 +15,33 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" [[package]] -name = "cc" -version = "1.2.14" +name = "bit-vec" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c3d1b2e905a3a7b00a6141adb0e4c0bb941d11caf55349d863942a1cc44e3c9" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + +[[package]] +name = "bstr" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40723b8fb387abc38f4f4a37c09073622e41dd12327033091ef8950659e6dc0c" dependencies = [ - "shlex", + "memchr", + "serde", ] +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "bytes" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" + [[package]] name = "cfg-if" version = "1.0.0" @@ -30,23 +49,35 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] -name = "cmake" -version = "0.1.54" +name = "crc32fast" +version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7caa3f9de89ddbe2c607f4101924c5abec803763ae9534e4f4d7d8f84aa81f0" +checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" dependencies = [ - "cc", + "cfg-if", ] [[package]] -name = "crc32fast" -version = "1.4.2" +name = "crossbeam-channel" +version = "0.5.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" +checksum = "06ba6d68e24814cb8de6bb986db8222d3a027d15872cabc0d18817bc3c0e4471" dependencies = [ - "cfg-if", + "crossbeam-utils", ] +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + [[package]] name = "flate2" version = "1.1.0" @@ -54,16 +85,31 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "11faaf5a5236997af9848be0bef4db95824b1d534ebc64d0f0c6cf3e67bd38dc" dependencies = [ "crc32fast", - "libz-ng-sys", "miniz_oxide", ] +[[package]] +name = "hashbrown" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb" + [[package]] name = "heck" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "indexmap" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" +dependencies = [ + "equivalent", + "hashbrown", +] + [[package]] name = "indoc" version = "2.0.6" @@ -77,14 +123,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "875b3680cb2f8f71bdcf9a30f38d48282f5d3c95cbf9b3fa57269bb5d5c06828" [[package]] -name = "libz-ng-sys" -version = "1.1.21" +name = "memchr" +version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7cee1488e961a80d172564fd6fcda11d8a4ac6672c06fe008e9213fa60520c2b" -dependencies = [ - "cmake", - "libc", -] +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] name = "memoffset" @@ -104,6 +146,66 @@ dependencies = [ "adler2", ] +[[package]] +name = "noodles" +version = "0.93.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74eea88cacba7e16cb6aff1bc3e077f3f9e036e59b9107a8415bf9242d0c2d97" +dependencies = [ + "noodles-bgzf", + "noodles-core", + "noodles-csi", + "noodles-tabix", +] + +[[package]] +name = "noodles-bgzf" +version = "0.36.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c10c4cb29950028823653395ebedfcaf5b652e504aacf816b510fb0268daa2eb" +dependencies = [ + "byteorder", + "bytes", + "crossbeam-channel", + "flate2", +] + +[[package]] +name = "noodles-core" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "962b13b79312f773a12ffcb0cdaccab6327f8343b6f440a888eff10c749d52b0" +dependencies = [ + "bstr", +] + +[[package]] +name = "noodles-csi" +version = "0.44.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45b402963653cacd5df474889a1cfb969e2f66469023836ad3b0a2c75e2b4e7c" +dependencies = [ + "bit-vec", + "bstr", + "byteorder", + "indexmap", + "noodles-bgzf", + "noodles-core", +] + +[[package]] +name = "noodles-tabix" +version = "0.50.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e46ee069d57770840554e3d8bd02e7415b3a2d972cdea7525bea02472032c234" +dependencies = [ + "byteorder", + "indexmap", + "noodles-bgzf", + "noodles-core", + "noodles-csi", +] + [[package]] name = "once_cell" version = "1.20.3" @@ -129,7 +231,7 @@ dependencies = [ name = "pycistopic-lib" version = "0.1.0" dependencies = [ - "flate2", + "noodles", "pyo3", ] @@ -206,10 +308,24 @@ dependencies = [ ] [[package]] -name = "shlex" -version = "1.3.0" +name = "serde" +version = "1.0.213" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +checksum = "3ea7893ff5e2466df8d720bb615088341b295f849602c6956047f8f80f0e9bc1" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.213" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e85ad2009c50b58e87caa8cd6dac16bdf511bbfb7af6c33df902396aa480fa5" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] [[package]] name = "syn" diff --git a/pycistopic-lib/Cargo.toml b/pycistopic-lib/Cargo.toml index a9186bf..4dda0b7 100644 --- a/pycistopic-lib/Cargo.toml +++ b/pycistopic-lib/Cargo.toml @@ -17,4 +17,4 @@ crate-type = ["cdylib"] [dependencies] pyo3 = { version = "0.23.5", features = ["extension-module"] } -flate2 = {version = "1.1.0",features = ["zlib-ng"], default-features = false} +noodles = { version = "0.93.0", features = ["tabix", "bgzf", "csi", "core"] } diff --git a/pycistopic-lib/src/pseudobulk.rs b/pycistopic-lib/src/pseudobulk.rs index 59c6892..b769b06 100644 --- a/pycistopic-lib/src/pseudobulk.rs +++ b/pycistopic-lib/src/pseudobulk.rs @@ -6,12 +6,11 @@ use std::collections::BinaryHeap; use std::collections::HashMap; use std::cmp::Reverse; use std::io::{BufRead, Write}; -use flate2::Compression; use pyo3::prelude::*; -use flate2::read::MultiGzDecoder; -use flate2::write::GzEncoder; use std::thread; - +use noodles::{tabix, bgzf}; +use noodles::csi::BinningIndex; +use noodles::core::{region::Interval, Position}; #[derive(Eq, PartialEq, Clone)] struct GenomicRange { @@ -80,7 +79,8 @@ impl fmt::Display for GenomicRange { struct FragmentFileReader { - reader: std::io::BufReader>, + reader: bgzf::Reader, + index: Option, fragment: Result, buffer: String, valid_cell_barcodes: Vec, @@ -92,11 +92,15 @@ struct FragmentFileReader { impl FragmentFileReader { fn new(fragment_file_path: &str, valid_cell_barcodes: Vec, target_chrom: String, file_index: usize) -> Result { - let file = File::open(Path::new(fragment_file_path))?; - let d_file = MultiGzDecoder::new(file); - let reader = std::io::BufReader::new(d_file); + let reader = bgzf::Reader::new(File::open(Path::new(fragment_file_path))?); + let mut index: Option = None; + if Path::new(&format!("{}.tbi", fragment_file_path)).exists() { + println!("Index found: {}", format!("{}.tbi", fragment_file_path)); + index = Some(tabix::fs::read(format!("{}.tbi", fragment_file_path))?); + } Ok(FragmentFileReader{ reader, + index, fragment: GenomicRange::new("", file_index, fragment_file_path), buffer: String::new(), valid_cell_barcodes, @@ -141,6 +145,23 @@ impl FragmentFileReader { } fn skip_to_chromosome(&mut self, chromosome: &str) -> Result<(), custom_errors::InvalidFragmentFileError> { + if let Some(index) = &self.index { + if let Some(header) = index.header() { + let seq_names = header.reference_sequence_names(); + if let Some(chromosome_index) = seq_names.get_index_of(chromosome.as_bytes()) { + if let Ok(q) = index.query( + chromosome_index, + Interval::from(Position::MIN..) + ) { + let first_chunk = q[0]; + self.reader.seek(first_chunk.start()).map_err(|_| custom_errors::InvalidFragmentFileError::new(&self.file_name))?; + self.read_next()?; + } + } + } + } else { + println!("Something is wrong with the index"); + } while !self.at_end_of_file { if let Ok(fragment) = &self.fragment { if fragment.chromosome == chromosome { @@ -174,12 +195,11 @@ impl FragmentFileReader { } } - fn split_fragments_by_cell_barcodes_for_chromosome( fragment_file_paths: &[&str], fragment_file_to_cell_barcode: &HashMap>, chromosome: &str, - gz_output_file: &mut GzEncoder + gz_output_file: &mut bgzf::Writer ) -> PyResult<()>{ // Open fragment files which are gzipped, and pos-sorted. @@ -242,7 +262,7 @@ pub fn split_fragment_files_by_cell_type( let file = File::create(output_file_name)?; let chromosome = chromosome.clone(); let handle = thread::spawn(move || { - let mut gz_output_file = GzEncoder::new(file, Compression::default()); + let mut gz_output_file = bgzf::Writer::new(file); split_fragments_by_cell_barcodes_for_chromosome( &fragment_file_paths.iter().map(|p| p.as_str()).collect::>(), &fragment_file_to_cell_barcode, From a33fad9f979821252e83183735b5cb2343dd3666 Mon Sep 17 00:00:00 2001 From: SeppeDeWinter Date: Mon, 10 Mar 2025 09:54:24 +0100 Subject: [PATCH 4/5] Only skip header when not using index. --- pycistopic-lib/src/pseudobulk.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pycistopic-lib/src/pseudobulk.rs b/pycistopic-lib/src/pseudobulk.rs index b769b06..280793e 100644 --- a/pycistopic-lib/src/pseudobulk.rs +++ b/pycistopic-lib/src/pseudobulk.rs @@ -145,6 +145,7 @@ impl FragmentFileReader { } fn skip_to_chromosome(&mut self, chromosome: &str) -> Result<(), custom_errors::InvalidFragmentFileError> { + let mut using_index = false; if let Some(index) = &self.index { if let Some(header) = index.header() { let seq_names = header.reference_sequence_names(); @@ -156,11 +157,15 @@ impl FragmentFileReader { let first_chunk = q[0]; self.reader.seek(first_chunk.start()).map_err(|_| custom_errors::InvalidFragmentFileError::new(&self.file_name))?; self.read_next()?; + using_index = true; } } } - } else { - println!("Something is wrong with the index"); + } + if !using_index { + // If no index, manually skip header lines. + // otherwise this is done automatically + self.skip_header("#")?; } while !self.at_end_of_file { if let Ok(fragment) = &self.fragment { @@ -219,9 +224,8 @@ fn split_fragments_by_cell_barcodes_for_chromosome( // A binary heap will be used to write fragments in order from different files let mut heap = BinaryHeap::new(); - // skip header lines, go to correct chromosome, and push first valid fragment to binary heap + // go to correct chromosome and push first valid fragment to binary heap for reader in readers.iter_mut(){ - reader.skip_header("#")?; reader.skip_to_chromosome(chromosome)?; if !reader.at_end_of_file && reader.at_chrom() { if let Some(fragment) = reader.get_next_valid_fragment()? { From 48e11028eb9d3b31ed4b208fd6fc6058aa8fdafa Mon Sep 17 00:00:00 2001 From: SeppeDeWinter Date: Mon, 10 Mar 2025 09:55:57 +0100 Subject: [PATCH 5/5] Add testing code. --- pycistopic-lib/test/bcs_nextgem.txt | 10 ++ pycistopic-lib/test/bcs_v1.txt | 10 ++ pycistopic-lib/test/pseudobulk_test.py | 154 +++++++++++++++++++++++++ 3 files changed, 174 insertions(+) create mode 100644 pycistopic-lib/test/bcs_nextgem.txt create mode 100644 pycistopic-lib/test/bcs_v1.txt create mode 100644 pycistopic-lib/test/pseudobulk_test.py diff --git a/pycistopic-lib/test/bcs_nextgem.txt b/pycistopic-lib/test/bcs_nextgem.txt new file mode 100644 index 0000000..2441cc2 --- /dev/null +++ b/pycistopic-lib/test/bcs_nextgem.txt @@ -0,0 +1,10 @@ +CCACGTTCAGTAACTC-1 +TTCGATTCATCTCTCG-1 +TTGCCCACAAGGTTCT-1 +TTTGGCCAGTACAGAT-1 +CTAACTTAGAAATGGG-1 +TGAGTCACACCACGAC-1 +CGAGTTATCAAGTTGC-1 +TGCATTTGTAACATAG-1 +TGCATTTGTAACATAG-1 +GAAGTCTTCTACTTTG-1 diff --git a/pycistopic-lib/test/bcs_v1.txt b/pycistopic-lib/test/bcs_v1.txt new file mode 100644 index 0000000..cc57001 --- /dev/null +++ b/pycistopic-lib/test/bcs_v1.txt @@ -0,0 +1,10 @@ +GGTGTCGTCAAGGCAG-1 +CTCAACCTCAGGTTTG-1 +GGTGTCGTCAAGGCAG-1 +AGACAAACACATCATG-1 +CATGCCTGTCCTTATT-1 +GGTGTCGTCAAGGCAG-1 +GCATTCCGTTAGAGAT-1 +CAGCTAATCTGATCCC-1 +GAGCATTAGCTTACCA-1 +GCATTCCGTTAGAGAT-1 diff --git a/pycistopic-lib/test/pseudobulk_test.py b/pycistopic-lib/test/pseudobulk_test.py new file mode 100644 index 0000000..19d4353 --- /dev/null +++ b/pycistopic-lib/test/pseudobulk_test.py @@ -0,0 +1,154 @@ +import pytest +import urllib.request +import urllib +import gzip +import os +import pathlib +import shutil + +from pycistopic_lib import split_fragment_files_by_cell_type + +TEST_DIRECTORY = pathlib.Path(__file__).parent.absolute() + +FRAGMENT_FILEs = [ + TEST_DIRECTORY / pathlib.Path("atac_pbmc_500_nextgem_fragments.tsv.gz"), + TEST_DIRECTORY / pathlib.Path("atac_pbmc_500_v1_fragments.tsv.gz") +] + +INDEX_FILEs= [ + TEST_DIRECTORY / pathlib.Path("atac_pbmc_500_nextgem_fragments.tsv.gz.tbi"), + TEST_DIRECTORY / pathlib.Path("atac_pbmc_500_v1_fragments.tsv.gz.tbi") +] + +@pytest.fixture +def fragment_file(tmp_path): + for f in FRAGMENT_FILEs: + shutil.copy(f, tmp_path) + return [tmp_path / f.name for f in FRAGMENT_FILEs] + +@pytest.fixture +def fragment_file_and_index(tmp_path): + for f in FRAGMENT_FILEs: + shutil.copy(f, tmp_path) + for f in INDEX_FILEs: + shutil.copy(f, tmp_path) + return [tmp_path / f.name for f in FRAGMENT_FILEs] + +def count_number_of_fragments_per_chrom_for_cbs(file_name: str, cbs: list[str]): + n_fragment_per_cb_per_chrom = { + cb: {} for cb in cbs + } + with gzip.open(file_name) as f: + for fragment in f: + chrom, _, _, cb = fragment.decode().split()[0:4] + if cb in cbs: + if chrom not in n_fragment_per_cb_per_chrom[cb]: + n_fragment_per_cb_per_chrom[cb][chrom] = 0 + n_fragment_per_cb_per_chrom[cb][chrom] += 1 + return n_fragment_per_cb_per_chrom + +def fragment_file_correct_order(file_name: str): + chrom_to_start = {} + with gzip.open(file_name) as f: + for l in f: + l = l.decode() + if l.startswith("#"): + continue + chrom, start = l.split()[0:2] + if chrom not in chrom_to_start: + chrom_to_start[chrom] = [] + chrom_to_start[chrom].append(int(start)) + def always_inc(l): + x = l[0] + for y in l[1:]: + if y < x: + return False + return True + return all([always_inc(chrom_to_start[chrom]) for chrom in chrom_to_start]) + +def test_pseudobulk_indexed(fragment_file_and_index, tmp_path): + fragment_files = fragment_file_and_index + bcs_1 = [] + bcs_2 = [] + with open(TEST_DIRECTORY / pathlib.Path("bcs_nextgem.txt")) as f: + for l in f: + bcs_1.append(l.strip()) + + with open(TEST_DIRECTORY / pathlib.Path("bcs_v1.txt")) as f: + for l in f: + bcs_2.append(l.strip()) + sample_to_cb = { + str(fragment_files[0]): bcs_1, + str(fragment_files[1]): bcs_2, + } + split_fragment_files_by_cell_type( + fragment_file_paths=[str(f) for f in fragment_files], + output_file_prefix=str(tmp_path) + "/", + cell_type_to_fragment_file_to_cell_barcode={ + "cell_type": sample_to_cb + }, + chromosomes=[*[f"chr{x + 1}" for x in range(22)], "chrY", "chrX"] + ) + assert(fragment_file_correct_order(tmp_path / "_cell_type.tsv.gz")) + n_fragment_per_cb_per_chrom_outf = count_number_of_fragments_per_chrom_for_cbs( + tmp_path / "_cell_type.tsv.gz", + [*list(sample_to_cb.values())[0], *list(sample_to_cb.values())[1]] + ) + n_fragment_per_cb_per_chrom_inf = {} + for sample in fragment_files: + n_fragment_per_cb_per_chrom_sample = count_number_of_fragments_per_chrom_for_cbs( + sample, sample_to_cb[str(sample)]) + for cb in n_fragment_per_cb_per_chrom_sample: + if cb not in n_fragment_per_cb_per_chrom_inf: + n_fragment_per_cb_per_chrom_inf[cb] = {} + for chrom in n_fragment_per_cb_per_chrom_sample[cb]: + if chrom not in n_fragment_per_cb_per_chrom_inf[cb]: + n_fragment_per_cb_per_chrom_inf[cb][chrom] = 0 + n_fragment_per_cb_per_chrom_inf[cb][chrom] += n_fragment_per_cb_per_chrom_sample[cb][chrom] + for cb in n_fragment_per_cb_per_chrom_inf: + for chrom in n_fragment_per_cb_per_chrom_inf[cb]: + assert(n_fragment_per_cb_per_chrom_inf[cb][chrom] == n_fragment_per_cb_per_chrom_outf[cb][chrom]) + + +def test_pseudobulk_not_indexed(fragment_file, tmp_path): + fragment_files = fragment_file + bcs_1 = [] + bcs_2 = [] + with open(TEST_DIRECTORY / pathlib.Path("bcs_nextgem.txt")) as f: + for l in f: + bcs_1.append(l.strip()) + + with open(TEST_DIRECTORY / pathlib.Path("bcs_v1.txt")) as f: + for l in f: + bcs_2.append(l.strip()) + sample_to_cb = { + str(fragment_files[0]): bcs_1, + str(fragment_files[1]): bcs_2, + } + split_fragment_files_by_cell_type( + fragment_file_paths=[str(f) for f in fragment_files], + output_file_prefix=str(tmp_path) + "/", + cell_type_to_fragment_file_to_cell_barcode={ + "cell_type": sample_to_cb + }, + chromosomes=[*[f"chr{x + 1}" for x in range(22)], "chrY", "chrX"] + ) + assert(fragment_file_correct_order(tmp_path / "_cell_type.tsv.gz")) + n_fragment_per_cb_per_chrom_outf = count_number_of_fragments_per_chrom_for_cbs( + tmp_path / "_cell_type.tsv.gz", + [*list(sample_to_cb.values())[0], *list(sample_to_cb.values())[1]] + ) + n_fragment_per_cb_per_chrom_inf = {} + for sample in fragment_files: + n_fragment_per_cb_per_chrom_sample = count_number_of_fragments_per_chrom_for_cbs( + sample, sample_to_cb[str(sample)]) + for cb in n_fragment_per_cb_per_chrom_sample: + if cb not in n_fragment_per_cb_per_chrom_inf: + n_fragment_per_cb_per_chrom_inf[cb] = {} + for chrom in n_fragment_per_cb_per_chrom_sample[cb]: + if chrom not in n_fragment_per_cb_per_chrom_inf[cb]: + n_fragment_per_cb_per_chrom_inf[cb][chrom] = 0 + n_fragment_per_cb_per_chrom_inf[cb][chrom] += n_fragment_per_cb_per_chrom_sample[cb][chrom] + for cb in n_fragment_per_cb_per_chrom_inf: + for chrom in n_fragment_per_cb_per_chrom_inf[cb]: + assert(n_fragment_per_cb_per_chrom_inf[cb][chrom] == n_fragment_per_cb_per_chrom_outf[cb][chrom]) \ No newline at end of file