Skip to content

Commit cb37378

Browse files
authored
feat: added keep intervals for tables and tree sequences (#635)
1 parent 1fed2cf commit cb37378

File tree

3 files changed

+777
-0
lines changed

3 files changed

+777
-0
lines changed

src/table_collection.rs

Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,17 @@ use crate::metadata::SiteMetadata;
1010
use crate::sys::bindings as ll_bindings;
1111
use crate::sys::TableCollection as LLTableCollection;
1212
use crate::types::Bookmark;
13+
use crate::EdgeTable;
1314
use crate::IndividualTableSortOptions;
1415
use crate::MigrationId;
16+
use crate::MigrationTable;
1517
use crate::MutationId;
18+
use crate::MutationTable;
1619
use crate::PopulationId;
1720
use crate::Position;
1821
use crate::SimplificationOptions;
1922
use crate::SiteId;
23+
use crate::SiteTable;
2024
use crate::TableClearOptions;
2125
use crate::TableEqualityOptions;
2226
use crate::TableIntegrityCheckFlags;
@@ -1372,4 +1376,206 @@ impl TableCollection {
13721376
pub fn as_mut_ptr(&mut self) -> *mut ll_bindings::tsk_table_collection_t {
13731377
self.inner.as_mut_ptr()
13741378
}
1379+
1380+
/// Truncate the [TableCollection] to specified genome intervals.
1381+
///
1382+
/// # Return
1383+
/// - `Ok(None)`: when truncation leads to empty edge table.
1384+
/// - `Ok(Some(TableCollection))`: when trunction is successfully performed
1385+
/// and results in non-empty edge table.
1386+
/// - `Error(TskitError)`: Any errors from the C API propagate. An
1387+
/// [TskitError::RangeError] will occur when `intervals` are not
1388+
/// sorted. Note that as `tskit` currently does not support `simplify`
1389+
/// on [TableCollection] with a non-empty migration table, calling
1390+
/// `keep_intervals` on those [TableCollection] with `simplify` set to
1391+
/// `true` will return an error.
1392+
///
1393+
/// # Example
1394+
/// ```rust
1395+
/// # use tskit::*;
1396+
/// # let snode = NodeFlags::new_sample();
1397+
/// # let anode = NodeFlags::default();
1398+
/// # let pop = PopulationId::NULL;
1399+
/// # let ind = IndividualId::NULL;
1400+
/// # let seqlen = 100.0;
1401+
/// # let (t0, t10) = (0.0, 10.0);
1402+
/// # let (left, right) = (0.0, 100.0);
1403+
/// # let sim_opts = SimplificationOptions::default();
1404+
/// #
1405+
/// # let mut tables = TableCollection::new(seqlen).unwrap();
1406+
/// # let child1 = tables.add_node(snode, t0, pop, ind).unwrap();
1407+
/// # let child2 = tables.add_node(snode, t0, pop, ind).unwrap();
1408+
/// # let parent = tables.add_node(anode, t10, pop, ind).unwrap();
1409+
/// #
1410+
/// # tables.add_edge(left, right, parent, child1).unwrap();
1411+
/// # tables.add_edge(left, right, parent, child2).unwrap();
1412+
/// # tables.full_sort(TableSortOptions::all()).unwrap();
1413+
/// # tables.simplify(&[child1, child2], sim_opts, false).unwrap();
1414+
/// # tables.build_index().unwrap();
1415+
/// #
1416+
/// let intervals = [(0.0, 10.0), (90.0, 100.0)].into_iter();
1417+
/// tables.keep_intervals(intervals, true).unwrap().unwrap();
1418+
/// ```
1419+
///
1420+
/// Note that no new provenance will be appended.
1421+
pub fn keep_intervals<P>(
1422+
self,
1423+
intervals: impl Iterator<Item = (P, P)>,
1424+
simplify: bool,
1425+
) -> Result<Option<Self>, TskitError>
1426+
where
1427+
P: Into<Position>,
1428+
{
1429+
use streaming_iterator::StreamingIterator;
1430+
let mut tables = self;
1431+
// use tables from sys to allow easier process with metadata
1432+
let options = 0;
1433+
let mut new_edges = crate::sys::EdgeTable::new(options)?;
1434+
let mut new_migrations = crate::sys::MigrationTable::new(options)?;
1435+
let mut new_sites = crate::sys::SiteTable::new(options)?;
1436+
let mut new_mutations = crate::sys::MutationTable::new(options)?;
1437+
1438+
// for old site id to new site id mapping
1439+
let mut site_map = vec![-1i32; tables.sites().num_rows().as_usize()];
1440+
1441+
// logicals to indicate whether a site (old) will be kept in new site table
1442+
let mut keep_sites = vec![false; tables.sites().num_rows().try_into()?];
1443+
1444+
let mut last_interval = (Position::from(0.0), Position::from(0.0));
1445+
for (s, e) in intervals {
1446+
let (s, e) = (s.into(), e.into());
1447+
// make sure intervals are sorted
1448+
if (s > e) || (s < last_interval.1) {
1449+
return Err(TskitError::RangeError(
1450+
"intervals not valid or sorted".into(),
1451+
));
1452+
}
1453+
keep_sites
1454+
.iter_mut()
1455+
.zip(tables.sites_iter())
1456+
.for_each(|(k, site_row)| {
1457+
*k = *k || ((site_row.position >= s) && (site_row.position < e));
1458+
});
1459+
1460+
// use stream_iter and while-let pattern for easier ? operator within a loop
1461+
let mut edge_iter = tables
1462+
.edges()
1463+
.lending_iter()
1464+
.filter(|edge_row| !((edge_row.right <= s) || (edge_row.left >= e)));
1465+
1466+
while let Some(edge_row) = edge_iter.next() {
1467+
new_edges.add_row_with_metadata(
1468+
if edge_row.left < s { s } else { edge_row.left }.into(),
1469+
if edge_row.right > e {
1470+
e
1471+
} else {
1472+
edge_row.right
1473+
}
1474+
.into(),
1475+
edge_row.parent.into(),
1476+
edge_row.child.into(),
1477+
edge_row.metadata.unwrap_or(&[0u8; 0]),
1478+
)?;
1479+
}
1480+
1481+
let mut migration_iter = tables
1482+
.migrations()
1483+
.lending_iter()
1484+
.filter(|mrow| !((mrow.right <= s) || (mrow.left >= e)));
1485+
1486+
while let Some(migration_row) = migration_iter.next() {
1487+
new_migrations.add_row_with_metadata(
1488+
(migration_row.left.into(), migration_row.right.into()),
1489+
migration_row.node.into(),
1490+
migration_row.source.into(),
1491+
migration_row.dest.into(),
1492+
migration_row.time.into(),
1493+
migration_row.metadata.unwrap_or(&[0u8; 0]),
1494+
)?;
1495+
}
1496+
last_interval = (s, e);
1497+
}
1498+
1499+
let mut running_site_id = 0;
1500+
let mut site_iter = tables.sites().lending_iter();
1501+
while let Some(site_row) = site_iter.next() {
1502+
let old_id = site_row.id.to_usize().unwrap();
1503+
if keep_sites[old_id] {
1504+
new_sites.add_row_with_metadata(
1505+
site_row.position.into(),
1506+
site_row.ancestral_state,
1507+
site_row.metadata.unwrap_or(&[0u8; 0]),
1508+
)?;
1509+
site_map[old_id] = running_site_id;
1510+
running_site_id += 1;
1511+
}
1512+
}
1513+
1514+
// build mutation_map
1515+
let mutation_map: Vec<_> = {
1516+
let mut n = 0;
1517+
tables
1518+
.mutations()
1519+
.site_slice()
1520+
.iter()
1521+
.map(|site| {
1522+
if keep_sites[site.as_usize()] {
1523+
n += 1
1524+
};
1525+
n - 1
1526+
})
1527+
.collect()
1528+
};
1529+
1530+
let mut mutations_iter = tables.mutations().lending_iter();
1531+
while let Some(mutation_row) = mutations_iter.next() {
1532+
let old_id = mutation_row.site.to_usize().unwrap();
1533+
if keep_sites[old_id] {
1534+
let new_site = site_map[old_id];
1535+
let new_parent = {
1536+
if mutation_row.parent.is_null() {
1537+
mutation_row.parent.into()
1538+
} else {
1539+
mutation_map[mutation_row.parent.as_usize()]
1540+
}
1541+
};
1542+
new_mutations.add_row_with_metadata(
1543+
new_site,
1544+
mutation_row.node.into(),
1545+
new_parent,
1546+
mutation_row.time.into(),
1547+
mutation_row.derived_state,
1548+
mutation_row.metadata.unwrap_or(&[0u8; 0]),
1549+
)?;
1550+
}
1551+
}
1552+
1553+
// convert sys version of tables to non-sys version of tables
1554+
let new_edges = EdgeTable::new_from_table(new_edges.as_mut())?;
1555+
let new_migrations = MigrationTable::new_from_table(new_migrations.as_mut())?;
1556+
let new_mutations = MutationTable::new_from_table(new_mutations.as_mut())?;
1557+
let new_sites = SiteTable::new_from_table(new_sites.as_mut())?;
1558+
1559+
// replace old tables with new tables
1560+
tables.set_edges(&new_edges).map(|_| ())?;
1561+
tables.set_migrations(&new_migrations).map(|_| ())?;
1562+
tables.set_mutations(&new_mutations).map(|_| ())?;
1563+
tables.set_sites(&new_sites)?;
1564+
1565+
// sort tables
1566+
tables.full_sort(TableSortOptions::default())?;
1567+
1568+
// simplify tables
1569+
if simplify {
1570+
let samples = tables.samples_as_vector();
1571+
tables.simplify(samples.as_slice(), SimplificationOptions::default(), false)?;
1572+
}
1573+
1574+
// return None when edge table is empty
1575+
if tables.edges().num_rows() == 0 {
1576+
Ok(None)
1577+
} else {
1578+
Ok(Some(tables))
1579+
}
1580+
}
13751581
}

0 commit comments

Comments
 (0)