Skip to content

Commit 1b59a6f

Browse files
authored
Merge pull request #7 from SingleRust/feature-dev-load-speedup
Feature dev load speedup
2 parents eea65be + 0febb0c commit 1b59a6f

File tree

7 files changed

+1261
-106
lines changed

7 files changed

+1261
-106
lines changed

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "anndata-memory"
3-
version = "1.0.4"
3+
version = "1.0.5"
44
edition = "2021"
55
readme = "README.md"
66
repository = "https://github.com/SingleRust/Anndata-Memory"

src/ad/mod.rs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -464,6 +464,25 @@ impl IMAnnData {
464464
layers,
465465
})
466466
}
467+
468+
/// Prints a short diagnostic summary of this object to stdout.
///
/// Only compiled for test builds. The summary lists the observation and
/// variable counts, the shape reported by `X`, the shapes of the `obs` and
/// `var` data frames, and up to the first three observation/variable names.
///
/// # Errors
///
/// Returns the error produced by `self.x().get_shape()` if the shape of `X`
/// cannot be obtained.
#[cfg(test)]
pub fn debug_info(&self) -> anyhow::Result<()> {
    println!("AnnData Debug Info:");
    let (obs_count, var_count) = (self.n_obs(), self.n_vars());
    println!(" Dimensions: {obs_count} obs x {var_count} vars");

    let x_shape = self.x().get_shape()?;
    println!(" X matrix shape: {x_shape:?}");

    let obs_df_shape = self.obs().get_data().shape();
    let var_df_shape = self.var().get_data().shape();
    println!(" obs DataFrame shape: {obs_df_shape:?}");
    println!(" var DataFrame shape: {var_df_shape:?}");

    // Preview at most three names from each axis; the slice end is clamped
    // by the reported axis length.
    let obs_names = self.obs_names();
    let obs_preview = &obs_names[..self.n_obs().min(3)];
    println!(" First 3 obs names: {obs_preview:?}");

    let var_names = self.var_names();
    let var_preview = &var_names[..self.n_vars().min(3)];
    println!(" First 3 var names: {var_preview:?}");

    Ok(())
}
467486
}
468487

469488
use std::fmt;

src/optimized_loader.rs

Lines changed: 85 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
1-
use anndata::{backend::{AttributeOp, DataContainer, DatasetOp, GroupOp, ScalarType}, ArrayData, Backend};
1+
use anndata::{
2+
backend::{AttributeOp, DataContainer, DatasetOp, GroupOp, ScalarType},
3+
ArrayData, Backend,
4+
};
25
use ndarray::Ix1;
36

4-
use crate::utils::{build_csr_matrix, read_array_as_usize_optimized};
7+
use crate::utils::{build_csc_matrix, build_csr_matrix, read_array_as_usize_optimized};
58

6-
pub fn load_csr_optimized<B: Backend>(
7-
container: &DataContainer<B>,
8-
) -> anyhow::Result<ArrayData> {
9+
pub fn load_csr_optimized<B: Backend>(container: &DataContainer<B>) -> anyhow::Result<ArrayData> {
910
let group = container.as_group()?;
1011
let shape: Vec<u64> = group.get_attr("shape")?;
1112
let nrows = shape[0] as usize;
@@ -85,3 +86,82 @@ pub fn load_csr_optimized<B: Backend>(
8586
}
8687
}
8788

89+
pub fn load_csc_optimized<B: Backend>(container: &DataContainer<B>) -> anyhow::Result<ArrayData> {
90+
let group = container.as_group()?;
91+
let shape: Vec<u64> = group.get_attr("shape")?;
92+
let nrows = shape[0] as usize;
93+
let ncols = shape[1] as usize;
94+
95+
let data_ds = group.open_dataset("data")?;
96+
let indices_ds = group.open_dataset("indices")?;
97+
let indptr_ds = group.open_dataset("indptr")?;
98+
99+
// Load indices arrays with zero-copy optimization
100+
let indptr = read_array_as_usize_optimized::<B>(&indptr_ds)?;
101+
let indices = read_array_as_usize_optimized::<B>(&indices_ds)?;
102+
103+
// Load data array with zero-copy optimization
104+
match data_ds.dtype()? {
105+
ScalarType::F64 => {
106+
let arr = data_ds.read_array::<f64, Ix1>()?;
107+
let (data, _offset) = arr.into_raw_vec_and_offset();
108+
// Zero-copy successful if _offset is None
109+
build_csc_matrix(nrows, ncols, indptr, indices, data)
110+
}
111+
ScalarType::F32 => {
112+
let arr = data_ds.read_array::<f32, Ix1>()?;
113+
let (data, _offset) = arr.into_raw_vec_and_offset();
114+
build_csc_matrix(nrows, ncols, indptr, indices, data)
115+
}
116+
ScalarType::I64 => {
117+
let arr = data_ds.read_array::<i64, Ix1>()?;
118+
let (data, _offset) = arr.into_raw_vec_and_offset();
119+
build_csc_matrix(nrows, ncols, indptr, indices, data)
120+
}
121+
ScalarType::I32 => {
122+
let arr = data_ds.read_array::<i32, Ix1>()?;
123+
let (data, _offset) = arr.into_raw_vec_and_offset();
124+
build_csc_matrix(nrows, ncols, indptr, indices, data)
125+
}
126+
ScalarType::I16 => {
127+
let arr = data_ds.read_array::<i16, Ix1>()?;
128+
let (data, _offset) = arr.into_raw_vec_and_offset();
129+
build_csc_matrix(nrows, ncols, indptr, indices, data)
130+
}
131+
ScalarType::I8 => {
132+
let arr = data_ds.read_array::<i8, Ix1>()?;
133+
let (data, _offset) = arr.into_raw_vec_and_offset();
134+
build_csc_matrix(nrows, ncols, indptr, indices, data)
135+
}
136+
ScalarType::U64 => {
137+
let arr = data_ds.read_array::<u64, Ix1>()?;
138+
let (data, _offset) = arr.into_raw_vec_and_offset();
139+
build_csc_matrix(nrows, ncols, indptr, indices, data)
140+
}
141+
ScalarType::U32 => {
142+
let arr = data_ds.read_array::<u32, Ix1>()?;
143+
let (data, _offset) = arr.into_raw_vec_and_offset();
144+
build_csc_matrix(nrows, ncols, indptr, indices, data)
145+
}
146+
ScalarType::U16 => {
147+
let arr = data_ds.read_array::<u16, Ix1>()?;
148+
let (data, _offset) = arr.into_raw_vec_and_offset();
149+
build_csc_matrix(nrows, ncols, indptr, indices, data)
150+
}
151+
ScalarType::U8 => {
152+
let arr = data_ds.read_array::<u8, Ix1>()?;
153+
let (data, _offset) = arr.into_raw_vec_and_offset();
154+
build_csc_matrix(nrows, ncols, indptr, indices, data)
155+
}
156+
ScalarType::Bool => {
157+
let arr = data_ds.read_array::<bool, Ix1>()?;
158+
let (data, _offset) = arr.into_raw_vec_and_offset();
159+
build_csc_matrix(nrows, ncols, indptr, indices, data)
160+
}
161+
ScalarType::String => {
162+
let arr = data_ds.read_array::<String, Ix1>()?;
163+
let (data, _offset) = arr.into_raw_vec_and_offset();
164+
build_csc_matrix(nrows, ncols, indptr, indices, data)
165+
}
166+
}
167+
}

src/utils/mod.rs

Lines changed: 34 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,20 @@
11
use anndata::backend::AttributeOp;
22
use anndata::data::index::Interval;
3-
use anndata::data::{self, DataFrameIndex};
3+
use anndata::data::{DataFrameIndex};
44
use anndata::{
55
backend::{DataContainer, DatasetOp, GroupOp, ScalarType},
66
data::{DynCscMatrix, DynCsrMatrix, SelectInfoElem},
77
ArrayData, Backend,
88
};
99
use nalgebra_sparse::{pattern::SparsityPattern, CscMatrix, CsrMatrix};
1010
use ndarray::Slice;
11-
use std::{collections::HashMap, mem::replace};
11+
use std::collections::HashMap;
1212

13+
use crate::utils::subset::{subset_csr_internal};
1314
use crate::{LoadingConfig, LoadingStrategy};
1415

16+
mod subset;
17+
1518
pub(crate) fn select_info_elem_to_indices(
1619
elem: &SelectInfoElem,
1720
bound: usize,
@@ -48,6 +51,11 @@ pub(crate) fn select_info_elem_to_indices(
4851
}
4952
}
5053

54+
// ####################################################################################################
55+
// Subsetting
56+
// ####################################################################################################
57+
58+
5159
pub(crate) fn subset_dyn_csc_matrix(
5260
dyn_csc: DynCscMatrix,
5361
s: &[&SelectInfoElem],
@@ -113,107 +121,18 @@ fn subset_csr_matrix<T>(
113121
let nrows = matrix.nrows();
114122
let ncols = matrix.ncols();
115123

116-
let row_indices = select_info_elem_to_indices(s[0], nrows)?;
124+
let row_indices = crate::utils::select_info_elem_to_indices(s[0], nrows)?;
117125
let col_indices = if s.len() > 1 {
118-
select_info_elem_to_indices(s[1], ncols)?
126+
crate::utils::select_info_elem_to_indices(s[1], ncols)?
119127
} else {
120128
(0..ncols).collect()
121129
};
122130

123-
if row_indices.len() == nrows && col_indices.len() == ncols {
124-
return Ok(matrix);
125-
}
126-
131+
// Use matrix disassembly to get ownership of the data
127132
let (row_offsets, col_indices_orig, values) = matrix.disassemble();
128-
129-
if col_indices.len() == ncols {
130-
let mut new_row_offsets = Vec::with_capacity(row_indices.len() + 1);
131-
let mut new_col_indices = Vec::new();
132-
let mut new_values = Vec::new();
133-
new_row_offsets.push(0);
134-
135-
let mut values_iter = values.into_iter();
136-
let mut current_pos = 0;
137-
138-
for &row_idx in &row_indices {
139-
let start = row_offsets[row_idx];
140-
let end = row_offsets[row_idx + 1];
141-
142-
for _ in current_pos..start {
143-
values_iter.next();
144-
}
145-
146-
new_col_indices.extend_from_slice(&col_indices_orig[start..end]);
147-
for _ in start..end {
148-
if let Some(val) = values_iter.next() {
149-
new_values.push(val);
150-
}
151-
}
152-
153-
current_pos = end;
154-
new_row_offsets.push(new_col_indices.len());
155-
}
156-
157-
let new_pattern = unsafe {
158-
SparsityPattern::from_offset_and_indices_unchecked(
159-
row_indices.len(),
160-
ncols,
161-
new_row_offsets,
162-
new_col_indices,
163-
)
164-
};
165-
166-
return CsrMatrix::try_from_pattern_and_values(new_pattern, new_values)
167-
.map_err(|e| anyhow::anyhow!("Failed to create CSR matrix: {:?}", e));
168-
}
169-
170-
let col_map: HashMap<usize, usize> = col_indices
171-
.iter()
172-
.enumerate()
173-
.map(|(new_idx, &old_idx)| (old_idx, new_idx))
174-
.collect();
175-
176-
let capacity: usize = row_indices
177-
.iter()
178-
.flat_map(|&row| {
179-
let start = row_offsets[row];
180-
let end = row_offsets[row + 1];
181-
(start..end).filter(|&idx| col_map.contains_key(&col_indices_orig[idx]))
182-
})
183-
.count();
184-
185-
let mut new_row_offsets = Vec::with_capacity(row_indices.len() + 1);
186-
let mut new_col_indices = Vec::with_capacity(capacity);
187-
let mut new_values = Vec::with_capacity(capacity);
188-
new_row_offsets.push(0);
189-
190-
let mut values_vec = values;
191-
192-
for &row_idx in &row_indices {
193-
let start = row_offsets[row_idx];
194-
let end = row_offsets[row_idx + 1];
195-
196-
for idx in start..end {
197-
let col = col_indices_orig[idx];
198-
if let Some(&new_col) = col_map.get(&col) {
199-
new_col_indices.push(new_col);
200-
new_values.push(replace(&mut values_vec[idx], unsafe { std::mem::zeroed() }));
201-
}
202-
}
203-
new_row_offsets.push(new_col_indices.len());
204-
}
205-
206-
let new_pattern = unsafe {
207-
SparsityPattern::from_offset_and_indices_unchecked(
208-
row_indices.len(),
209-
col_indices.len(),
210-
new_row_offsets,
211-
new_col_indices,
212-
)
213-
};
214-
215-
CsrMatrix::try_from_pattern_and_values(new_pattern, new_values)
216-
.map_err(|e| anyhow::anyhow!("Failed to create CSR matrix: {:?}", e))
133+
134+
// Call the internal optimized subset function
135+
subset_csr_internal(row_offsets, col_indices_orig, values, &row_indices, &col_indices)
217136
}
218137

219138
fn subset_csc_matrix<T>(
@@ -397,7 +316,6 @@ pub fn read_array_as_usize_optimized<B: Backend>(
397316
}
398317

399318
pub fn read_array_as_usize<B: Backend>(dataset: &B::Dataset) -> anyhow::Result<Vec<usize>> {
400-
println!("Dtype: {}", dataset.dtype()?);
401319
match dataset.dtype()? {
402320
ScalarType::U64 => {
403321
let arr = dataset.read_array::<u64, ndarray::Ix1>()?;
@@ -544,7 +462,6 @@ pub fn build_csr_matrix<T>(
544462
where
545463
CsrMatrix<T>: Into<ArrayData>,
546464
{
547-
// Use unsafe constructor since we trust the data from AnnData
548465
let pattern = unsafe {
549466
SparsityPattern::from_offset_and_indices_unchecked(nrows, ncols, indptr, indices)
550467
};
@@ -553,6 +470,24 @@ where
553470
Ok(csr.into())
554471
}
555472

473+
pub fn build_csc_matrix<T>(
474+
nrows: usize,
475+
ncols: usize,
476+
indptr: Vec<usize>,
477+
indices: Vec<usize>,
478+
data: Vec<T>,
479+
) -> anyhow::Result<ArrayData>
480+
where
481+
CscMatrix<T>: Into<ArrayData>,
482+
{
483+
let pattern = unsafe {
484+
SparsityPattern::from_offset_and_indices_unchecked(nrows, ncols, indptr, indices)
485+
};
486+
let csc = CscMatrix::try_from_pattern_and_values(pattern, data)
487+
.map_err(|e| anyhow::anyhow!("Building the CSC matrix encountered an error: {}", e))?;
488+
Ok(csc.into())
489+
}
490+
556491
pub fn read_dataframe_index(
557492
container: &DataContainer<anndata_hdf5::H5>,
558493
) -> anyhow::Result<DataFrameIndex> {

0 commit comments

Comments
 (0)