From c9d22bc268b87f087ea5b883a710e6a02166c6e2 Mon Sep 17 00:00:00 2001
From: SeppeDeWinter <seppe.dewinter@hotmail.com>
Date: Fri, 7 Mar 2025 16:02:38 +0100
Subject: [PATCH 1/5] Pseudobulk now does not need to resort fragment files.

---
 pycistopic-lib/Cargo.lock           | 241 +++++++++++++++++++++++++
 pycistopic-lib/Cargo.toml           |  20 +++
 pycistopic-lib/src/custom_errors.rs |  58 ++++++
 pycistopic-lib/src/lib.rs           |  14 ++
 pycistopic-lib/src/pseudobulk.rs    | 264 ++++++++++++++++++++++++++++
 5 files changed, 597 insertions(+)
 create mode 100644 pycistopic-lib/Cargo.lock
 create mode 100644 pycistopic-lib/Cargo.toml
 create mode 100644 pycistopic-lib/src/custom_errors.rs
 create mode 100644 pycistopic-lib/src/lib.rs
 create mode 100644 pycistopic-lib/src/pseudobulk.rs

diff --git a/pycistopic-lib/Cargo.lock b/pycistopic-lib/Cargo.lock
new file mode 100644
index 0000000..68b46be
--- /dev/null
+++ b/pycistopic-lib/Cargo.lock
@@ -0,0 +1,241 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "adler2"
+version = "2.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627"
+
+[[package]]
+name = "autocfg"
+version = "1.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
+
+[[package]]
+name = "cc"
+version = "1.2.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c3d1b2e905a3a7b00a6141adb0e4c0bb941d11caf55349d863942a1cc44e3c9"
+dependencies = [
+ "shlex",
+]
+
+[[package]]
+name = "cfg-if"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+
+[[package]]
+name = "cmake"
+version = "0.1.54"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e7caa3f9de89ddbe2c607f4101924c5abec803763ae9534e4f4d7d8f84aa81f0"
+dependencies = [
+ "cc",
+]
+
+[[package]]
+name = "crc32fast"
+version = "1.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3"
+dependencies = [
+ "cfg-if",
+]
+
+[[package]]
+name = "flate2"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "11faaf5a5236997af9848be0bef4db95824b1d534ebc64d0f0c6cf3e67bd38dc"
+dependencies = [
+ "crc32fast",
+ "libz-ng-sys",
+ "miniz_oxide",
+]
+
+[[package]]
+name = "heck"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
+
+[[package]]
+name = "indoc"
+version = "2.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd"
+
+[[package]]
+name = "libc"
+version = "0.2.170"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "875b3680cb2f8f71bdcf9a30f38d48282f5d3c95cbf9b3fa57269bb5d5c06828"
+
+[[package]]
+name = "libz-ng-sys"
+version = "1.1.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7cee1488e961a80d172564fd6fcda11d8a4ac6672c06fe008e9213fa60520c2b"
+dependencies = [
+ "cmake",
+ "libc",
+]
+
+[[package]]
+name = "memoffset"
+version = "0.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a"
+dependencies = [
+ "autocfg",
+]
+
+[[package]]
+name = "miniz_oxide"
+version = "0.8.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e3e04debbb59698c15bacbb6d93584a8c0ca9cc3213cb423d31f760d8843ce5"
+dependencies = [
+ "adler2",
+]
+
+[[package]]
+name = "once_cell"
+version = "1.20.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "945462a4b81e43c4e3ba96bd7b49d834c6f61198356aa858733bc4acf3cbe62e"
+
+[[package]]
+name = "portable-atomic"
+version = "1.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "350e9b48cbc6b0e028b0473b114454c6316e57336ee184ceab6e53f72c178b3e"
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.94"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a31971752e70b8b2686d7e46ec17fb38dad4051d94024c88df49b667caea9c84"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "pycistopic-lib"
+version = "0.1.0"
+dependencies = [
+ "flate2",
+ "pyo3",
+]
+
+[[package]]
+name = "pyo3"
+version = "0.23.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7778bffd85cf38175ac1f545509665d0b9b92a198ca7941f131f85f7a4f9a872"
+dependencies = [
+ "cfg-if",
+ "indoc",
+ "libc",
+ "memoffset",
+ "once_cell",
+ "portable-atomic",
+ "pyo3-build-config",
+ "pyo3-ffi",
+ "pyo3-macros",
+ "unindent",
+]
+
+[[package]]
+name = "pyo3-build-config"
+version = "0.23.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "94f6cbe86ef3bf18998d9df6e0f3fc1050a8c5efa409bf712e661a4366e010fb"
+dependencies = [
+ "once_cell",
+ "target-lexicon",
+]
+
+[[package]]
+name = "pyo3-ffi"
+version = "0.23.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e9f1b4c431c0bb1c8fb0a338709859eed0d030ff6daa34368d3b152a63dfdd8d"
+dependencies = [
+ "libc",
+ "pyo3-build-config",
+]
+
+[[package]]
+name = "pyo3-macros"
+version = "0.23.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fbc2201328f63c4710f68abdf653c89d8dbc2858b88c5d88b0ff38a75288a9da"
+dependencies = [
+ "proc-macro2",
+ "pyo3-macros-backend",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "pyo3-macros-backend"
+version = "0.23.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fca6726ad0f3da9c9de093d6f116a93c1a38e417ed73bf138472cf4064f72028"
+dependencies = [
+ "heck",
+ "proc-macro2",
+ "pyo3-build-config",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.39"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c1f1914ce909e1658d9907913b4b91947430c7d9be598b15a1912935b8c04801"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "shlex"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
+
+[[package]]
+name = "syn"
+version = "2.0.99"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e02e925281e18ffd9d640e234264753c43edc62d64b2d4cf898f1bc5e75f3fc2"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "target-lexicon"
+version = "0.12.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1"
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
+
+[[package]]
+name = "unindent"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3"
diff --git a/pycistopic-lib/Cargo.toml b/pycistopic-lib/Cargo.toml
new file mode 100644
index 0000000..a9186bf
--- /dev/null
+++ b/pycistopic-lib/Cargo.toml
@@ -0,0 +1,20 @@
+[package]
+name = "pycistopic-lib"
+version = "0.1.0"
+edition = "2021"
+
+[lib]
+# The name of the native library. This is the name which will be used in Python to import the
+# library (i.e. `import string_sum`). If you change this, you must also change the name of the
+# `#[pymodule]` in `src/lib.rs`.
+name = "pycistopic_lib"
+# "cdylib" is necessary to produce a shared library for Python to import from.
+#
+# Downstream Rust code (including code in `bin/`, `examples/`, and `tests/`) will not be able
+# to `use string_sum;` unless the "rlib" or "lib" crate type is also included, e.g.:
+# crate-type = ["cdylib", "rlib"]
+crate-type = ["cdylib"]
+
+[dependencies]
+pyo3 = { version = "0.23.5", features = ["extension-module"] }
+flate2 = {version = "1.1.0",features = ["zlib-ng"], default-features = false}
diff --git a/pycistopic-lib/src/custom_errors.rs b/pycistopic-lib/src/custom_errors.rs
new file mode 100644
index 0000000..1653eb7
--- /dev/null
+++ b/pycistopic-lib/src/custom_errors.rs
@@ -0,0 +1,58 @@
+use pyo3::exceptions::{PyIOError, PyValueError};
+use std::fmt;
+use pyo3::PyErr;
+
+#[derive(Debug)]
+pub struct InvalidFragmentFileError {
+    pub filename: String,
+}
+
+impl InvalidFragmentFileError {
+    pub fn new(filename: &str) -> InvalidFragmentFileError {
+        InvalidFragmentFileError{
+            filename: filename.to_string()
+        }
+    }
+}
+
+impl std::error::Error for InvalidFragmentFileError{}
+
+impl fmt::Display for InvalidFragmentFileError {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "{}: Invalid fragment file.", self.filename)
+    }
+}
+
+impl From<InvalidFragmentFileError> for PyErr {
+    fn from(err: InvalidFragmentFileError) -> PyErr {
+        PyIOError::new_err(err.to_string())
+    }
+}
+
+#[derive(Debug)]
+pub struct ValueError {
+    pub message: String,
+}
+
+impl ValueError {
+    pub fn new(message: String) -> ValueError {
+        ValueError {
+           message
+        }
+    }
+}
+
+impl std::error::Error for ValueError{}
+
+impl fmt::Display for ValueError {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "{}", self.message)
+    }
+}
+
+impl From<ValueError> for PyErr {
+    fn from(err: ValueError) -> PyErr {
+        PyValueError::new_err(err.to_string())
+    }
+}
+
diff --git a/pycistopic-lib/src/lib.rs b/pycistopic-lib/src/lib.rs
new file mode 100644
index 0000000..ee315e2
--- /dev/null
+++ b/pycistopic-lib/src/lib.rs
@@ -0,0 +1,14 @@
+mod pseudobulk;
+mod custom_errors;
+
+use pyo3::prelude::*;
+
+
+/// A Python module implemented in Rust. The name of this function must match
+/// the `lib.name` setting in the `Cargo.toml`, else Python will not be able to
+/// import the module.
+#[pymodule]
+fn pycistopic_lib(m: &Bound<'_, PyModule>) -> PyResult<()> {
+    m.add_function(wrap_pyfunction!(pseudobulk::split_fragment_files_by_cell_type, m)?)?;
+    Ok(())
+}
diff --git a/pycistopic-lib/src/pseudobulk.rs b/pycistopic-lib/src/pseudobulk.rs
new file mode 100644
index 0000000..9b74486
--- /dev/null
+++ b/pycistopic-lib/src/pseudobulk.rs
@@ -0,0 +1,264 @@
+use super::custom_errors;
+use std::fmt;
+use std::fs::File;
+use std::path::Path;
+use std::collections::BinaryHeap;
+use std::collections::HashMap;
+use std::cmp::Reverse;
+use std::io::{BufRead, Write};
+use flate2::Compression;
+use pyo3::prelude::*;
+use flate2::read::MultiGzDecoder;
+use flate2::write::GzEncoder;
+use std::thread;
+
+
+#[derive(Eq, PartialEq)]
+struct GenomicRange {
+    chromosome: String,
+    start: usize,
+    end: usize,
+    cell_barcode: String,
+    score: Option<usize>,
+    file_index: usize,
+    file_name: String
+}
+
+impl Ord for GenomicRange {
+    fn cmp(&self, other: &GenomicRange) -> std::cmp::Ordering {
+           self.start.cmp(&other.start)
+            .then(self.end.cmp(&other.end))
+    }
+}
+
+impl PartialOrd for GenomicRange {
+    fn partial_cmp(&self, other: &GenomicRange) -> Option<std::cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl GenomicRange {
+    fn new(line: String, file_index: usize, filename: &str) -> Result<GenomicRange, custom_errors::InvalidFragmentFileError> {
+        let fields: Vec<&str> = line.split('\t').collect();
+        if fields.len() < 4 {
+            return Err(custom_errors::InvalidFragmentFileError::new(filename));
+        }
+        Ok(GenomicRange{
+            chromosome: fields[0].to_string(),
+            start: fields[1].parse::<usize>()
+                .map_err(|_| custom_errors::InvalidFragmentFileError::new(filename))?,
+            end: fields[2].parse::<usize>()
+                .map_err(|_| custom_errors::InvalidFragmentFileError::new(filename))?,
+            cell_barcode: fields[3].to_string(),
+            score: if fields.len() > 4 {
+                    Some(fields[4].parse::<usize>()
+                        .map_err(|_| custom_errors::InvalidFragmentFileError::new(filename))?
+                    )
+                } else {None},
+            file_index,
+            file_name: filename.to_string()
+        })
+    }
+}
+
+impl fmt::Display for GenomicRange {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match self.score {
+            Some(score) => write!(
+                f,
+                "{}\t{}\t{}\t{}\t{}",
+                self.chromosome, self.start, self.end, self.cell_barcode, score
+            ),
+            None => write!(
+                f,
+                "{}\t{}\t{}\t{}",
+                self.chromosome, self.start, self.end, self.cell_barcode
+            ),
+        }
+    }
+}
+
+
+fn split_fragments_by_cell_barcodes_for_chromosome(
+    fragment_file_paths: &[&str],
+    fragment_file_to_cell_barcode: &HashMap<String, Vec<String>>,
+    chromosome: &str,
+    gz_output_file: &mut GzEncoder<File>
+) -> PyResult<()>{
+
+    // Open fragment files which are gzipped, and pos-sorted.
+    let mut readers: Vec<_> = fragment_file_paths
+        .iter()
+        .map(|&path| {
+            let file = File::open(Path::new(path))?;
+            let d_file = MultiGzDecoder::new(file);
+            Ok(std::io::BufReader::new(d_file))
+        })
+        .collect::<Result<_, std::io::Error>>()?;
+
+    // A binary heap will be used to write fragments in order from different files
+    let mut heap = BinaryHeap::new();
+    
+    let mut line = "#".to_string();
+    let mut bytes_read;
+
+    // Push the first fragment from each file to the heap
+    for (
+            fragment_file,
+            (index, reader)
+    ) in fragment_file_paths.iter().zip(readers.iter_mut().enumerate()) {
+        // skip header lines
+        while line.starts_with("#") {
+            line.clear();
+            reader.read_line(&mut line)?;
+            line = line.trim().to_string();
+        }
+        // Loop until a fragment with cell barcode, on the correct chromosome, in fragment_file_to_cell_barcode is found
+        let mut fragment_found = false;
+        // is the barcode found for the first (non-header) line?
+        let fragment = GenomicRange::new(
+            line.to_string(), index, fragment_file
+        )?;
+        match fragment_file_to_cell_barcode.get(&fragment_file.to_string()) {
+            Some(cell_barcodes) => {
+                if cell_barcodes.contains(&fragment.cell_barcode) && fragment.chromosome == chromosome {
+                    // Reverse so that "smaller" fragments (i.e. lower genomic location
+                    // are written first later on (heap will pop large elements first).
+                    heap.push(Reverse(fragment));
+                    fragment_found = true;
+                }
+            },
+            None => {
+                return Err(
+                    custom_errors::ValueError::new(
+                        format!(
+                            "fragment_file_to_cell_barcode does not contain entry for {}",
+                            fragment_file)
+                    ).into());
+            }
+        }
+
+        // barcode not in first line.
+        // keep reading until a fragment with correct barcode, on correct chromosome, is found.
+        while !fragment_found {
+            line.clear();
+            bytes_read = reader.read_line(&mut line)?;
+            if bytes_read == 0 {
+                // end of file
+                break;
+            }
+            line = line.trim().to_string();
+            let fragment = GenomicRange::new(
+                line.to_string(), index, fragment_file
+            )?;
+            match fragment_file_to_cell_barcode.get(&fragment_file.to_string()) {
+                Some(cell_barcodes) => {
+                    if cell_barcodes.contains(&fragment.cell_barcode) && fragment.chromosome == chromosome {
+                        // Reverse so that "smaller" fragments (i.e. lower genomic location
+                        // are written first later on (heap will pop large elements first).
+                        heap.push(Reverse(fragment));
+                        fragment_found = true;
+                    }
+                },
+                None => {
+                    return Err(
+                        custom_errors::ValueError::new(
+                            format!(
+                                "fragment_file_to_cell_barcode does not contain entry for {}",
+                                fragment_file)
+                        ).into());
+                }
+            }
+        }
+    }
+
+    while let Some(Reverse(fragment)) = heap.pop() {
+        gz_output_file.write_all(format!("{}\n", fragment).as_bytes())?;
+        // Loop until a fragment with cell barcode in fragment_file_to_cell_barcode is found
+        let mut fragment_found  = false;
+        while !fragment_found {
+            // Read next range from file that had the smallest range and add this to the heap.
+            line.clear();
+            bytes_read = readers[fragment.file_index].read_line(&mut line)?;
+            if bytes_read == 0 {
+                // end of file
+                break;
+            }
+            line = line.trim().to_string();
+            let next_fragment = GenomicRange::new(
+                line.to_string(), fragment.file_index, &fragment.file_name
+            )?;
+            // Assuming that the fragment files are sorted.
+            // Using the previous while loop, for each file, we should be at the correct location of the file
+            // (i.e. where the current chromosomes are located).
+            // if the next fragment file has a different chromosome, we should be done with this file and we can skip it.
+            if next_fragment.chromosome != chromosome {
+                break;
+            }
+            match fragment_file_to_cell_barcode.get(&next_fragment.file_name) {
+                Some(cell_barcodes) => {
+                    if cell_barcodes.contains(&next_fragment.cell_barcode) {
+                        heap.push(Reverse(next_fragment));
+                        fragment_found = true;
+                    }
+                },
+                None => {
+                    return Err(
+                        custom_errors::ValueError::new(
+                            format!(
+                                "fragment_file_to_cell_barcode does not contain entry for {}",
+                                next_fragment.file_name)
+                        ).into()
+                    );
+                }
+            }
+        }
+    }
+    Ok(())
+}
+
+#[pyfunction]
+pub fn split_fragment_files_by_cell_type(
+    fragment_file_paths: Vec<String>,
+    output_file_prefix: &str,
+    cell_type_to_fragment_file_to_cell_barcode: HashMap<String, HashMap<String, Vec<String>>>,
+    chromosomes: Vec<String>
+) -> PyResult<()> {
+    for cell_type in cell_type_to_fragment_file_to_cell_barcode.keys() {
+        let mut handles: Vec<thread::JoinHandle<_>> = Vec::new();
+        for chromosome in &chromosomes {
+            let output_file_name = format!("{}_{}.{}.tsv.gz", output_file_prefix, cell_type, chromosome);
+            let fragment_file_paths = fragment_file_paths.clone(); // Need to clone since threads take ownership
+            let fragment_file_to_cell_barcode = cell_type_to_fragment_file_to_cell_barcode
+                .get(cell_type)
+                .unwrap()
+                .clone();
+            let file = File::create(output_file_name)?;
+            let chromosome = chromosome.clone();
+            let handle = thread::spawn(move || {
+                let mut gz_output_file = GzEncoder::new(file, Compression::default());
+                split_fragments_by_cell_barcodes_for_chromosome(
+                    &fragment_file_paths.iter().map(|p| p.as_str()).collect::<Vec<_>>(),
+                    &fragment_file_to_cell_barcode,
+                    &chromosome,
+                    &mut gz_output_file
+                )
+            });
+            handles.push(handle);
+        }
+        for handle in handles {
+            handle.join().expect("Thread panicked")?;
+        }
+        // concat all chromosomes
+        let output_file_name = format!("{}_{}.tsv.gz", output_file_prefix, cell_type);
+        let output_file = File::create(&output_file_name)?;
+        let mut writer = std::io::BufWriter::new(output_file);
+        for chromosome in &chromosomes {
+            let input_file_name = format!("{}_{}.{}.tsv.gz", output_file_prefix, cell_type, chromosome);
+            let mut input_file = File::open(&input_file_name)?;
+            std::io::copy(&mut input_file, &mut writer)?;
+        }
+        writer.flush()?;
+    }
+    Ok(())
+}

From a8fc34bf320492b5811babd162f79233f1b5f22d Mon Sep 17 00:00:00 2001
From: SeppeDeWinter <seppe.dewinter@hotmail.com>
Date: Fri, 7 Mar 2025 23:31:51 +0100
Subject: [PATCH 2/5] Refactor.

---
 pycistopic-lib/src/pseudobulk.rs | 238 ++++++++++++++++---------------
 1 file changed, 122 insertions(+), 116 deletions(-)

diff --git a/pycistopic-lib/src/pseudobulk.rs b/pycistopic-lib/src/pseudobulk.rs
index 9b74486..59c6892 100644
--- a/pycistopic-lib/src/pseudobulk.rs
+++ b/pycistopic-lib/src/pseudobulk.rs
@@ -13,7 +13,7 @@ use flate2::write::GzEncoder;
 use std::thread;
 
 
-#[derive(Eq, PartialEq)]
+#[derive(Eq, PartialEq, Clone)]
 struct GenomicRange {
     chromosome: String,
     start: usize,
@@ -38,7 +38,7 @@ impl PartialOrd for GenomicRange {
 }
 
 impl GenomicRange {
-    fn new(line: String, file_index: usize, filename: &str) -> Result<GenomicRange, custom_errors::InvalidFragmentFileError> {
+    fn new(line: &str, file_index: usize, filename: &str) -> Result<GenomicRange, custom_errors::InvalidFragmentFileError> {
         let fields: Vec<&str> = line.split('\t').collect();
         if fields.len() < 4 {
             return Err(custom_errors::InvalidFragmentFileError::new(filename));
@@ -79,6 +79,102 @@ impl fmt::Display for GenomicRange {
 }
 
 
+struct FragmentFileReader {
+    reader: std::io::BufReader<MultiGzDecoder<File>>,
+    fragment: Result<GenomicRange, custom_errors::InvalidFragmentFileError>,
+    buffer: String,
+    valid_cell_barcodes: Vec<String>,
+    target_chrom: String,
+    file_index: usize,
+    file_name: String,
+    at_end_of_file: bool,
+}
+
+impl FragmentFileReader {
+    fn new(fragment_file_path: &str, valid_cell_barcodes: Vec<String>, target_chrom: String, file_index: usize) -> Result<FragmentFileReader, std::io::Error> {
+        let file = File::open(Path::new(fragment_file_path))?;
+        let d_file = MultiGzDecoder::new(file);
+        let reader = std::io::BufReader::new(d_file);
+        Ok(FragmentFileReader{
+            reader,
+            fragment: GenomicRange::new("", file_index, fragment_file_path),
+            buffer: String::new(),
+            valid_cell_barcodes,
+            target_chrom,
+            file_index, 
+            file_name: fragment_file_path.to_string(),
+            at_end_of_file: false
+        })
+    }
+
+    fn at_chrom(&self) -> bool {
+        if let Ok(fragment) = &self.fragment{
+            fragment.chromosome == self.target_chrom
+        } else {
+            false
+        }
+    }
+
+    fn read_next(&mut self) -> Result<(), custom_errors::InvalidFragmentFileError>{
+        self.buffer.clear();
+        match self.reader.read_line(&mut self.buffer) {
+            Ok(bytes_read) => {
+                if bytes_read == 0 {
+                    self.at_end_of_file = true
+                }
+            },
+            Err(_) => {
+                return Err(custom_errors::InvalidFragmentFileError::new(&self.file_name))
+            }
+        }
+        self.buffer = self.buffer.trim().to_string();
+        self.fragment = GenomicRange::new(&self.buffer, self.file_index, &self.file_name);
+        Ok(())
+    }
+
+    fn skip_header(&mut self, pattern: &str) -> Result<(), custom_errors::InvalidFragmentFileError> {
+        self.buffer = pattern.to_string();
+        while self.buffer.starts_with(pattern) && !self.at_end_of_file {
+            self.read_next()?;
+        }
+        Ok(())
+    }
+
+    fn skip_to_chromosome(&mut self, chromosome: &str) -> Result<(), custom_errors::InvalidFragmentFileError> {
+        while !self.at_end_of_file {
+            if let Ok(fragment) = &self.fragment {
+                if fragment.chromosome == chromosome {
+                    return Ok(());
+                }
+            };
+            self.read_next()?;
+        }
+        Ok(())
+    }
+
+    fn get_next_valid_fragment(&mut self) -> Result<Option<GenomicRange>, custom_errors::InvalidFragmentFileError> {
+        while !self.at_end_of_file {
+            // first check current fragment
+            // it could be that after skip_header and skip_to_chromosome we are already at a 
+            // valid fragment.
+            if let Ok(fragment) = &self.fragment {
+                if self.valid_cell_barcodes.contains(&fragment.cell_barcode) {
+                    let fragment = fragment.clone();
+                    if fragment.chromosome != self.target_chrom {
+                        return Ok(None)
+                    }
+                    // read next fragment so next time we enter this function we have a new frag,et
+                    self.read_next()?;
+                    return Ok(Some(fragment));
+                }
+            }
+            self.read_next()?;
+        }
+        Ok(None)
+    }
+}
+
+
 fn split_fragments_by_cell_barcodes_for_chromosome(
     fragment_file_paths: &[&str],
     fragment_file_to_cell_barcode: &HashMap<String, Vec<String>>,
@@ -87,130 +183,40 @@ fn split_fragments_by_cell_barcodes_for_chromosome(
 ) -> PyResult<()>{
 
     // Open fragment files which are gzipped, and pos-sorted.
-    let mut readers: Vec<_> = fragment_file_paths
-        .iter()
-        .map(|&path| {
-            let file = File::open(Path::new(path))?;
-            let d_file = MultiGzDecoder::new(file);
-            Ok(std::io::BufReader::new(d_file))
-        })
-        .collect::<Result<_, std::io::Error>>()?;
+    let mut readers: Vec<FragmentFileReader> = Vec::new();
+    for (file_index, fragment_file_path) in fragment_file_paths.iter().enumerate() {
+        if let Some(cell_barcodes) = fragment_file_to_cell_barcode.get(&fragment_file_path.to_string()) {
+            readers.push(
+                FragmentFileReader::new(
+                    fragment_file_path,
+                    cell_barcodes.to_vec(),
+                    chromosome.to_string(),
+                    file_index)?
+            );
+        }
+    }
 
     // A binary heap will be used to write fragments in order from different files
     let mut heap = BinaryHeap::new();
     
-    let mut line = "#".to_string();
-    let mut bytes_read;
-
-    // Push the first fragment from each file to the heap
-    for (
-            fragment_file,
-            (index, reader)
-    ) in fragment_file_paths.iter().zip(readers.iter_mut().enumerate()) {
-        // skip header lines
-        while line.starts_with("#") {
-            line.clear();
-            reader.read_line(&mut line)?;
-            line = line.trim().to_string();
-        }
-        // Loop until a fragment with cell barcode, on the correct chromosome, in fragment_file_to_cell_barcode is found
-        let mut fragment_found = false;
-        // is the barcode found for the first (non-header) line?
-        let fragment = GenomicRange::new(
-            line.to_string(), index, fragment_file
-        )?;
-        match fragment_file_to_cell_barcode.get(&fragment_file.to_string()) {
-            Some(cell_barcodes) => {
-                if cell_barcodes.contains(&fragment.cell_barcode) && fragment.chromosome == chromosome {
-                    // Reverse so that "smaller" fragments (i.e. lower genomic location
-                    // are written first later on (heap will pop large elements first).
-                    heap.push(Reverse(fragment));
-                    fragment_found = true;
-                }
-            },
-            None => {
-                return Err(
-                    custom_errors::ValueError::new(
-                        format!(
-                            "fragment_file_to_cell_barcode does not contain entry for {}",
-                            fragment_file)
-                    ).into());
-            }
-        }
-
-        // barcode not in first line.
-        // keep reading until a fragment with correct barcode, on correct chromosome, is found.
-        while !fragment_found {
-            line.clear();
-            bytes_read = reader.read_line(&mut line)?;
-            if bytes_read == 0 {
-                // end of file
-                break;
-            }
-            line = line.trim().to_string();
-            let fragment = GenomicRange::new(
-                line.to_string(), index, fragment_file
-            )?;
-            match fragment_file_to_cell_barcode.get(&fragment_file.to_string()) {
-                Some(cell_barcodes) => {
-                    if cell_barcodes.contains(&fragment.cell_barcode) && fragment.chromosome == chromosome {
-                        // Reverse so that "smaller" fragments (i.e. lower genomic location
-                        // are written first later on (heap will pop large elements first).
-                        heap.push(Reverse(fragment));
-                        fragment_found = true;
-                    }
-                },
-                None => {
-                    return Err(
-                        custom_errors::ValueError::new(
-                            format!(
-                                "fragment_file_to_cell_barcode does not contain entry for {}",
-                                fragment_file)
-                        ).into());
-                }
+    // skip header lines, go to correct chromosome, and push first valid fragment to binary heap
+    for reader in readers.iter_mut(){
+        reader.skip_header("#")?;
+        reader.skip_to_chromosome(chromosome)?;
+        if !reader.at_end_of_file  && reader.at_chrom() {
+            if let Some(fragment) = reader.get_next_valid_fragment()? {
+                heap.push(Reverse(fragment));
             }
         }
     }
 
     while let Some(Reverse(fragment)) = heap.pop() {
         gz_output_file.write_all(format!("{}\n", fragment).as_bytes())?;
-        // Loop until a fragment with cell barcode in fragment_file_to_cell_barcode is found
-        let mut fragment_found  = false;
-        while !fragment_found {
-            // Read next range from file that had the smallest range and add this to the heap.
-            line.clear();
-            bytes_read = readers[fragment.file_index].read_line(&mut line)?;
-            if bytes_read == 0 {
-                // end of file
-                break;
-            }
-            line = line.trim().to_string();
-            let next_fragment = GenomicRange::new(
-                line.to_string(), fragment.file_index, &fragment.file_name
-            )?;
-            // Assuming that the fragment files are sorted.
-            // Using the previous while loop, for each file, we should be at the correct location of the file
-            // (i.e. where the current chromosomes are located).
-            // if the next fragment file has a different chromosome, we should be done with this file and we can skip it.
-            if next_fragment.chromosome != chromosome {
-                break;
-            }
-            match fragment_file_to_cell_barcode.get(&next_fragment.file_name) {
-                Some(cell_barcodes) => {
-                    if cell_barcodes.contains(&next_fragment.cell_barcode) {
-                        heap.push(Reverse(next_fragment));
-                        fragment_found = true;
-                    }
-                },
-                None => {
-                    return Err(
-                        custom_errors::ValueError::new(
-                            format!(
-                                "fragment_file_to_cell_barcode does not contain entry for {}",
-                                next_fragment.file_name)
-                        ).into()
-                    );
-                }
+        // read from file that currently has the smallest genomic range
+        let reader = &mut readers[fragment.file_index];
+        if !reader.at_end_of_file && reader.at_chrom() {
+            if let Some(fragment) = reader.get_next_valid_fragment()? {
+                heap.push(Reverse(fragment));
             }
         }
     }

From de9c483ebc1209a2428bd760e6bf31d6a7e73ad4 Mon Sep 17 00:00:00 2001
From: SeppeDeWinter <seppe.dewinter@hotmail.com>
Date: Sat, 8 Mar 2025 19:59:19 +0100
Subject: [PATCH 3/5] Make use of tabix file to seek to chromosome position.

---
 pycistopic-lib/Cargo.lock        | 164 ++++++++++++++++++++++++++-----
 pycistopic-lib/Cargo.toml        |   2 +-
 pycistopic-lib/src/pseudobulk.rs |  42 +++++---
 3 files changed, 172 insertions(+), 36 deletions(-)

diff --git a/pycistopic-lib/Cargo.lock b/pycistopic-lib/Cargo.lock
index 68b46be..17eeb5f 100644
--- a/pycistopic-lib/Cargo.lock
+++ b/pycistopic-lib/Cargo.lock
@@ -15,14 +15,33 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
 
 [[package]]
-name = "cc"
-version = "1.2.14"
+name = "bit-vec"
+version = "0.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0c3d1b2e905a3a7b00a6141adb0e4c0bb941d11caf55349d863942a1cc44e3c9"
+checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7"
+
+[[package]]
+name = "bstr"
+version = "1.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "40723b8fb387abc38f4f4a37c09073622e41dd12327033091ef8950659e6dc0c"
 dependencies = [
- "shlex",
+ "memchr",
+ "serde",
 ]
 
+[[package]]
+name = "byteorder"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
+
+[[package]]
+name = "bytes"
+version = "1.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a"
+
 [[package]]
 name = "cfg-if"
 version = "1.0.0"
@@ -30,23 +49,35 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
 
 [[package]]
-name = "cmake"
-version = "0.1.54"
+name = "crc32fast"
+version = "1.4.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e7caa3f9de89ddbe2c607f4101924c5abec803763ae9534e4f4d7d8f84aa81f0"
+checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3"
 dependencies = [
- "cc",
+ "cfg-if",
 ]
 
 [[package]]
-name = "crc32fast"
-version = "1.4.2"
+name = "crossbeam-channel"
+version = "0.5.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3"
+checksum = "06ba6d68e24814cb8de6bb986db8222d3a027d15872cabc0d18817bc3c0e4471"
 dependencies = [
- "cfg-if",
+ "crossbeam-utils",
 ]
 
+[[package]]
+name = "crossbeam-utils"
+version = "0.8.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
+
+[[package]]
+name = "equivalent"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
+
 [[package]]
 name = "flate2"
 version = "1.1.0"
@@ -54,16 +85,31 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "11faaf5a5236997af9848be0bef4db95824b1d534ebc64d0f0c6cf3e67bd38dc"
 dependencies = [
  "crc32fast",
- "libz-ng-sys",
  "miniz_oxide",
 ]
 
+[[package]]
+name = "hashbrown"
+version = "0.15.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb"
+
 [[package]]
 name = "heck"
 version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
 
+[[package]]
+name = "indexmap"
+version = "2.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da"
+dependencies = [
+ "equivalent",
+ "hashbrown",
+]
+
 [[package]]
 name = "indoc"
 version = "2.0.6"
@@ -77,14 +123,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "875b3680cb2f8f71bdcf9a30f38d48282f5d3c95cbf9b3fa57269bb5d5c06828"
 
 [[package]]
-name = "libz-ng-sys"
-version = "1.1.21"
+name = "memchr"
+version = "2.7.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7cee1488e961a80d172564fd6fcda11d8a4ac6672c06fe008e9213fa60520c2b"
-dependencies = [
- "cmake",
- "libc",
-]
+checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
 
 [[package]]
 name = "memoffset"
@@ -104,6 +146,66 @@ dependencies = [
  "adler2",
 ]
 
+[[package]]
+name = "noodles"
+version = "0.93.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "74eea88cacba7e16cb6aff1bc3e077f3f9e036e59b9107a8415bf9242d0c2d97"
+dependencies = [
+ "noodles-bgzf",
+ "noodles-core",
+ "noodles-csi",
+ "noodles-tabix",
+]
+
+[[package]]
+name = "noodles-bgzf"
+version = "0.36.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c10c4cb29950028823653395ebedfcaf5b652e504aacf816b510fb0268daa2eb"
+dependencies = [
+ "byteorder",
+ "bytes",
+ "crossbeam-channel",
+ "flate2",
+]
+
+[[package]]
+name = "noodles-core"
+version = "0.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "962b13b79312f773a12ffcb0cdaccab6327f8343b6f440a888eff10c749d52b0"
+dependencies = [
+ "bstr",
+]
+
+[[package]]
+name = "noodles-csi"
+version = "0.44.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "45b402963653cacd5df474889a1cfb969e2f66469023836ad3b0a2c75e2b4e7c"
+dependencies = [
+ "bit-vec",
+ "bstr",
+ "byteorder",
+ "indexmap",
+ "noodles-bgzf",
+ "noodles-core",
+]
+
+[[package]]
+name = "noodles-tabix"
+version = "0.50.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e46ee069d57770840554e3d8bd02e7415b3a2d972cdea7525bea02472032c234"
+dependencies = [
+ "byteorder",
+ "indexmap",
+ "noodles-bgzf",
+ "noodles-core",
+ "noodles-csi",
+]
+
 [[package]]
 name = "once_cell"
 version = "1.20.3"
@@ -129,7 +231,7 @@ dependencies = [
 name = "pycistopic-lib"
 version = "0.1.0"
 dependencies = [
- "flate2",
+ "noodles",
  "pyo3",
 ]
 
@@ -206,10 +308,24 @@ dependencies = [
 ]
 
 [[package]]
-name = "shlex"
-version = "1.3.0"
+name = "serde"
+version = "1.0.213"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
+checksum = "3ea7893ff5e2466df8d720bb615088341b295f849602c6956047f8f80f0e9bc1"
+dependencies = [
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_derive"
+version = "1.0.213"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7e85ad2009c50b58e87caa8cd6dac16bdf511bbfb7af6c33df902396aa480fa5"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
 
 [[package]]
 name = "syn"
diff --git a/pycistopic-lib/Cargo.toml b/pycistopic-lib/Cargo.toml
index a9186bf..4dda0b7 100644
--- a/pycistopic-lib/Cargo.toml
+++ b/pycistopic-lib/Cargo.toml
@@ -17,4 +17,4 @@ crate-type = ["cdylib"]
 
 [dependencies]
 pyo3 = { version = "0.23.5", features = ["extension-module"] }
-flate2 = {version = "1.1.0",features = ["zlib-ng"], default-features = false}
+noodles = { version = "0.93.0", features = ["tabix", "bgzf", "csi", "core"] }
diff --git a/pycistopic-lib/src/pseudobulk.rs b/pycistopic-lib/src/pseudobulk.rs
index 59c6892..b769b06 100644
--- a/pycistopic-lib/src/pseudobulk.rs
+++ b/pycistopic-lib/src/pseudobulk.rs
@@ -6,12 +6,11 @@ use std::collections::BinaryHeap;
 use std::collections::HashMap;
 use std::cmp::Reverse;
 use std::io::{BufRead, Write};
-use flate2::Compression;
 use pyo3::prelude::*;
-use flate2::read::MultiGzDecoder;
-use flate2::write::GzEncoder;
 use std::thread;
-
+use noodles::{tabix, bgzf};
+use noodles::csi::BinningIndex;
+use noodles::core::{region::Interval, Position};
 
 #[derive(Eq, PartialEq, Clone)]
 struct GenomicRange {
@@ -80,7 +79,8 @@ impl fmt::Display for GenomicRange {
 
 
 struct FragmentFileReader {
-    reader: std::io::BufReader<MultiGzDecoder<File>>,
+    reader: bgzf::Reader<File>,
+    index: Option<tabix::Index>,
     fragment: Result<GenomicRange, custom_errors::InvalidFragmentFileError>,
     buffer: String,
     valid_cell_barcodes: Vec<String>,
@@ -92,11 +92,15 @@ struct FragmentFileReader {
 
 impl FragmentFileReader {
     fn new(fragment_file_path: &str, valid_cell_barcodes: Vec<String>, target_chrom: String, file_index: usize) -> Result<FragmentFileReader, std::io::Error> {
-        let file = File::open(Path::new(fragment_file_path))?;
-        let d_file = MultiGzDecoder::new(file);
-        let reader = std::io::BufReader::new(d_file);
+        let reader = bgzf::Reader::new(File::open(Path::new(fragment_file_path))?);
+        let mut index: Option<tabix::Index> = None;
+        if Path::new(&format!("{}.tbi", fragment_file_path)).exists() {
+            println!("Index found: {}", format!("{}.tbi", fragment_file_path));
+            index = Some(tabix::fs::read(format!("{}.tbi", fragment_file_path))?);
+        }
         Ok(FragmentFileReader{
             reader,
+            index,
             fragment: GenomicRange::new("", file_index, fragment_file_path),
             buffer: String::new(),
             valid_cell_barcodes,
@@ -141,6 +145,23 @@ impl FragmentFileReader {
     }
 
     fn skip_to_chromosome(&mut self, chromosome: &str) -> Result<(), custom_errors::InvalidFragmentFileError> {
+        if let Some(index) = &self.index {
+            if let Some(header) = index.header() {
+                let seq_names = header.reference_sequence_names();
+                if let Some(chromosome_index) = seq_names.get_index_of(chromosome.as_bytes()) {
+                    if let Ok(q) = index.query(
+                        chromosome_index,
+                        Interval::from(Position::MIN..)
+                    ) {
+                        let first_chunk = q[0];
+                        self.reader.seek(first_chunk.start()).map_err(|_| custom_errors::InvalidFragmentFileError::new(&self.file_name))?;
+                        self.read_next()?;
+                    } 
+                }
+            }
+        } else {
+            println!("Something is wrong with the index");
+        }
         while !self.at_end_of_file {
             if let Ok(fragment) = &self.fragment {
                 if fragment.chromosome == chromosome {
@@ -174,12 +195,11 @@ impl FragmentFileReader {
     }
 }
 
-
 fn split_fragments_by_cell_barcodes_for_chromosome(
     fragment_file_paths: &[&str],
     fragment_file_to_cell_barcode: &HashMap<String, Vec<String>>,
     chromosome: &str,
-    gz_output_file: &mut GzEncoder<File>
+    gz_output_file: &mut bgzf::Writer<File>
 ) -> PyResult<()>{
 
     // Open fragment files which are gzipped, and pos-sorted.
@@ -242,7 +262,7 @@ pub fn split_fragment_files_by_cell_type(
             let file = File::create(output_file_name)?;
             let chromosome = chromosome.clone();
             let handle = thread::spawn(move || {
-                let mut gz_output_file = GzEncoder::new(file, Compression::default());
+                let mut gz_output_file = bgzf::Writer::new(file);
                 split_fragments_by_cell_barcodes_for_chromosome(
                     &fragment_file_paths.iter().map(|p| p.as_str()).collect::<Vec<_>>(),
                     &fragment_file_to_cell_barcode,

From a33fad9f979821252e83183735b5cb2343dd3666 Mon Sep 17 00:00:00 2001
From: SeppeDeWinter <seppe.dewinter@hotmail.com>
Date: Mon, 10 Mar 2025 09:54:24 +0100
Subject: [PATCH 4/5] Only skip header when not using index.

---
 pycistopic-lib/src/pseudobulk.rs | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/pycistopic-lib/src/pseudobulk.rs b/pycistopic-lib/src/pseudobulk.rs
index b769b06..280793e 100644
--- a/pycistopic-lib/src/pseudobulk.rs
+++ b/pycistopic-lib/src/pseudobulk.rs
@@ -145,6 +145,7 @@ impl FragmentFileReader {
     }
 
     fn skip_to_chromosome(&mut self, chromosome: &str) -> Result<(), custom_errors::InvalidFragmentFileError> {
+        let mut using_index = false;
         if let Some(index) = &self.index {
             if let Some(header) = index.header() {
                 let seq_names = header.reference_sequence_names();
@@ -156,11 +157,15 @@ impl FragmentFileReader {
                         let first_chunk = q[0];
                         self.reader.seek(first_chunk.start()).map_err(|_| custom_errors::InvalidFragmentFileError::new(&self.file_name))?;
                         self.read_next()?;
+                        using_index = true;
                     } 
                 }
             }
-        } else {
-            println!("Something is wrong with the index");
+        } 
+        if !using_index {
+            // If no index, manually skip header lines.
+            // otherwise this is done automatically
+            self.skip_header("#")?;
         }
         while !self.at_end_of_file {
             if let Ok(fragment) = &self.fragment {
@@ -219,9 +224,8 @@ fn split_fragments_by_cell_barcodes_for_chromosome(
     // A binary heap will be used to write fragments in order from different files
     let mut heap = BinaryHeap::new();
     
-    // skip header lines, go to correct chromosome, and push first valid fragment to binary heap
+    // go to correct chromosome and push first valid fragment to binary heap
     for reader in readers.iter_mut(){
-        reader.skip_header("#")?;
         reader.skip_to_chromosome(chromosome)?;
         if !reader.at_end_of_file  && reader.at_chrom() {
             if let Some(fragment) = reader.get_next_valid_fragment()? {

From 48e11028eb9d3b31ed4b208fd6fc6058aa8fdafa Mon Sep 17 00:00:00 2001
From: SeppeDeWinter <seppe.dewinter@hotmail.com>
Date: Mon, 10 Mar 2025 09:55:57 +0100
Subject: [PATCH 5/5] Add testing code.

---
 pycistopic-lib/test/bcs_nextgem.txt    |  10 ++
 pycistopic-lib/test/bcs_v1.txt         |  10 ++
 pycistopic-lib/test/pseudobulk_test.py | 154 +++++++++++++++++++++++++
 3 files changed, 174 insertions(+)
 create mode 100644 pycistopic-lib/test/bcs_nextgem.txt
 create mode 100644 pycistopic-lib/test/bcs_v1.txt
 create mode 100644 pycistopic-lib/test/pseudobulk_test.py

diff --git a/pycistopic-lib/test/bcs_nextgem.txt b/pycistopic-lib/test/bcs_nextgem.txt
new file mode 100644
index 0000000..2441cc2
--- /dev/null
+++ b/pycistopic-lib/test/bcs_nextgem.txt
@@ -0,0 +1,10 @@
+CCACGTTCAGTAACTC-1
+TTCGATTCATCTCTCG-1
+TTGCCCACAAGGTTCT-1
+TTTGGCCAGTACAGAT-1
+CTAACTTAGAAATGGG-1
+TGAGTCACACCACGAC-1
+CGAGTTATCAAGTTGC-1
+TGCATTTGTAACATAG-1
+TGCATTTGTAACATAG-1
+GAAGTCTTCTACTTTG-1
diff --git a/pycistopic-lib/test/bcs_v1.txt b/pycistopic-lib/test/bcs_v1.txt
new file mode 100644
index 0000000..cc57001
--- /dev/null
+++ b/pycistopic-lib/test/bcs_v1.txt
@@ -0,0 +1,10 @@
+GGTGTCGTCAAGGCAG-1
+CTCAACCTCAGGTTTG-1
+GGTGTCGTCAAGGCAG-1
+AGACAAACACATCATG-1
+CATGCCTGTCCTTATT-1
+GGTGTCGTCAAGGCAG-1
+GCATTCCGTTAGAGAT-1
+CAGCTAATCTGATCCC-1
+GAGCATTAGCTTACCA-1
+GCATTCCGTTAGAGAT-1
diff --git a/pycistopic-lib/test/pseudobulk_test.py b/pycistopic-lib/test/pseudobulk_test.py
new file mode 100644
index 0000000..19d4353
--- /dev/null
+++ b/pycistopic-lib/test/pseudobulk_test.py
@@ -0,0 +1,154 @@
+import pytest
+import urllib.request
+import urllib
+import gzip
+import os
+import pathlib
+import shutil
+
+from pycistopic_lib import split_fragment_files_by_cell_type
+
+TEST_DIRECTORY = pathlib.Path(__file__).parent.absolute()
+
+FRAGMENT_FILEs = [
+    TEST_DIRECTORY / pathlib.Path("atac_pbmc_500_nextgem_fragments.tsv.gz"),
+    TEST_DIRECTORY / pathlib.Path("atac_pbmc_500_v1_fragments.tsv.gz")
+]
+
+INDEX_FILEs= [
+    TEST_DIRECTORY / pathlib.Path("atac_pbmc_500_nextgem_fragments.tsv.gz.tbi"),
+    TEST_DIRECTORY / pathlib.Path("atac_pbmc_500_v1_fragments.tsv.gz.tbi")
+]
+
+@pytest.fixture
+def fragment_file(tmp_path):
+    for f in FRAGMENT_FILEs:
+        shutil.copy(f, tmp_path)
+    return [tmp_path / f.name for f in FRAGMENT_FILEs]
+
+@pytest.fixture
+def fragment_file_and_index(tmp_path):
+    for f in FRAGMENT_FILEs:
+        shutil.copy(f, tmp_path)
+    for f in INDEX_FILEs:
+        shutil.copy(f, tmp_path)
+    return [tmp_path / f.name for f in FRAGMENT_FILEs]
+
+def count_number_of_fragments_per_chrom_for_cbs(file_name: str, cbs: list[str]):
+    n_fragment_per_cb_per_chrom = {
+        cb: {} for cb in cbs
+    }
+    with gzip.open(file_name) as f:
+        for fragment in f:
+            chrom, _, _, cb = fragment.decode().split()[0:4]
+            if cb in cbs:
+                if chrom not in n_fragment_per_cb_per_chrom[cb]:
+                    n_fragment_per_cb_per_chrom[cb][chrom] = 0
+                n_fragment_per_cb_per_chrom[cb][chrom] += 1
+    return n_fragment_per_cb_per_chrom
+
+def fragment_file_correct_order(file_name: str):
+    chrom_to_start = {}
+    with gzip.open(file_name) as f:
+        for l in f:
+            l = l.decode()
+            if l.startswith("#"):
+                continue
+            chrom, start = l.split()[0:2]
+            if chrom not in chrom_to_start:
+                chrom_to_start[chrom] = []
+            chrom_to_start[chrom].append(int(start))
+    def always_inc(l):
+        x = l[0]
+        for y in l[1:]:
+            if y < x:
+                return False
+        return True
+    return all([always_inc(chrom_to_start[chrom]) for chrom in chrom_to_start])
+
+def test_pseudobulk_indexed(fragment_file_and_index, tmp_path):
+    fragment_files = fragment_file_and_index
+    bcs_1 = []
+    bcs_2 = []
+    with open(TEST_DIRECTORY / pathlib.Path("bcs_nextgem.txt")) as f:
+        for l in f:
+            bcs_1.append(l.strip())
+
+    with open(TEST_DIRECTORY / pathlib.Path("bcs_v1.txt")) as f:
+        for l in f:
+            bcs_2.append(l.strip())
+    sample_to_cb = {
+        str(fragment_files[0]): bcs_1,
+        str(fragment_files[1]): bcs_2,
+    }
+    split_fragment_files_by_cell_type(
+        fragment_file_paths=[str(f) for f in fragment_files],
+        output_file_prefix=str(tmp_path) +  "/",
+        cell_type_to_fragment_file_to_cell_barcode={
+            "cell_type": sample_to_cb
+        },
+        chromosomes=[*[f"chr{x + 1}" for x in range(22)], "chrY", "chrX"]
+    )
+    assert(fragment_file_correct_order(tmp_path / "_cell_type.tsv.gz"))
+    n_fragment_per_cb_per_chrom_outf = count_number_of_fragments_per_chrom_for_cbs(
+        tmp_path / "_cell_type.tsv.gz",
+        [*list(sample_to_cb.values())[0], *list(sample_to_cb.values())[1]]
+    )
+    n_fragment_per_cb_per_chrom_inf = {}
+    for sample in fragment_files:
+        n_fragment_per_cb_per_chrom_sample = count_number_of_fragments_per_chrom_for_cbs(
+            sample, sample_to_cb[str(sample)])
+        for cb in n_fragment_per_cb_per_chrom_sample:
+            if cb not in n_fragment_per_cb_per_chrom_inf:
+                n_fragment_per_cb_per_chrom_inf[cb] = {}
+            for chrom in n_fragment_per_cb_per_chrom_sample[cb]:
+                if chrom not in n_fragment_per_cb_per_chrom_inf[cb]:
+                    n_fragment_per_cb_per_chrom_inf[cb][chrom] = 0
+                n_fragment_per_cb_per_chrom_inf[cb][chrom] += n_fragment_per_cb_per_chrom_sample[cb][chrom]
+    for cb in n_fragment_per_cb_per_chrom_inf:
+        for chrom in n_fragment_per_cb_per_chrom_inf[cb]:
+            assert(n_fragment_per_cb_per_chrom_inf[cb][chrom] == n_fragment_per_cb_per_chrom_outf[cb][chrom])
+
+
+def test_pseudobulk_not_indexed(fragment_file, tmp_path):
+    fragment_files = fragment_file
+    bcs_1 = []
+    bcs_2 = []
+    with open(TEST_DIRECTORY / pathlib.Path("bcs_nextgem.txt")) as f:
+        for l in f:
+            bcs_1.append(l.strip())
+
+    with open(TEST_DIRECTORY / pathlib.Path("bcs_v1.txt")) as f:
+        for l in f:
+            bcs_2.append(l.strip())
+    sample_to_cb = {
+        str(fragment_files[0]): bcs_1,
+        str(fragment_files[1]): bcs_2,
+    }
+    split_fragment_files_by_cell_type(
+        fragment_file_paths=[str(f) for f in fragment_files],
+        output_file_prefix=str(tmp_path) +  "/",
+        cell_type_to_fragment_file_to_cell_barcode={
+            "cell_type": sample_to_cb
+        },
+        chromosomes=[*[f"chr{x + 1}" for x in range(22)], "chrY", "chrX"]
+    )
+    assert(fragment_file_correct_order(tmp_path / "_cell_type.tsv.gz"))
+    n_fragment_per_cb_per_chrom_outf = count_number_of_fragments_per_chrom_for_cbs(
+        tmp_path / "_cell_type.tsv.gz",
+        [*list(sample_to_cb.values())[0], *list(sample_to_cb.values())[1]]
+    )
+    n_fragment_per_cb_per_chrom_inf = {}
+    for sample in fragment_files:
+        n_fragment_per_cb_per_chrom_sample = count_number_of_fragments_per_chrom_for_cbs(
+            sample, sample_to_cb[str(sample)])
+        for cb in n_fragment_per_cb_per_chrom_sample:
+            if cb not in n_fragment_per_cb_per_chrom_inf:
+                n_fragment_per_cb_per_chrom_inf[cb] = {}
+            for chrom in n_fragment_per_cb_per_chrom_sample[cb]:
+                if chrom not in n_fragment_per_cb_per_chrom_inf[cb]:
+                    n_fragment_per_cb_per_chrom_inf[cb][chrom] = 0
+                n_fragment_per_cb_per_chrom_inf[cb][chrom] += n_fragment_per_cb_per_chrom_sample[cb][chrom]
+    for cb in n_fragment_per_cb_per_chrom_inf:
+        for chrom in n_fragment_per_cb_per_chrom_inf[cb]:
+            assert(n_fragment_per_cb_per_chrom_inf[cb][chrom] == n_fragment_per_cb_per_chrom_outf[cb][chrom])
\ No newline at end of file