Skip to content

Commit d755bc4

Browse files
authored
Merge pull request #172 from ginkgobioworks/gfa-export-node-length
Set max node length for gfa export
2 parents bde522c + bd1c0cc commit d755bc4

File tree

2 files changed

+188
-37
lines changed

2 files changed

+188
-37
lines changed

src/exports/gfa.rs

Lines changed: 177 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ use crate::models::{
44
block_group::BlockGroup, block_group_edge::BlockGroupEdge, collection::Collection, edge::Edge,
55
node::Node, path::Path, sample::Sample, strand::Strand,
66
};
7+
use itertools::Itertools;
78
use rusqlite::Connection;
89
use std::collections::{BTreeSet, HashMap, HashSet};
910
use std::fs::File;
@@ -15,7 +16,9 @@ pub fn export_gfa(
1516
collection_name: &str,
1617
filename: &PathBuf,
1718
sample_name: Option<String>,
19+
max_size: impl Into<Option<i64>>,
1820
) {
21+
let chunk_size = max_size.into().unwrap_or(i64::MAX);
1922
// General note about how we encode segment IDs. The node ID and the start coordinate in the
2023
// sequence are all that's needed, because the end coordinate can be inferred from the length of
2124
// the segment's sequence. So the segment ID is of the form <node ID>.<start coordinate>
@@ -52,36 +55,85 @@ pub fn export_gfa(
5255
let mut writer = BufWriter::new(file);
5356

5457
let mut segments = BTreeSet::new();
58+
let mut split_segments = HashMap::new();
5559
for block in &blocks {
5660
if !Node::is_terminal(block.node_id) {
57-
segments.insert(Segment {
58-
sequence: block.sequence(),
59-
node_id: block.node_id,
60-
sequence_start: block.start,
61-
sequence_end: block.end,
62-
// NOTE: We can't easily get the value for strand, but it doesn't matter
63-
// because this value is only used for writing segments
64-
strand: Strand::Forward,
65-
});
61+
if block.end - block.start > chunk_size {
62+
let mut sub_segments = vec![];
63+
let block_sequence = block.sequence();
64+
for (index, sub_start) in (block.start..block.end)
65+
.step_by(chunk_size as usize)
66+
.enumerate()
67+
{
68+
let sub_end = (sub_start + chunk_size).min(block.end);
69+
let seq_start = index as i64 * chunk_size;
70+
let seq_end =
71+
((index as i64 + 1) * chunk_size).min(block_sequence.len() as i64);
72+
segments.insert(Segment {
73+
sequence: block_sequence[seq_start as usize..seq_end as usize].to_string(),
74+
node_id: block.node_id,
75+
sequence_start: sub_start,
76+
sequence_end: sub_end,
77+
// NOTE: We can't easily get the value for strand, but it doesn't matter
78+
// because this value is only used for writing segments
79+
strand: Strand::Forward,
80+
});
81+
sub_segments.push((sub_start, sub_end));
82+
}
83+
split_segments.insert(block.node_id, sub_segments);
84+
} else {
85+
segments.insert(Segment {
86+
sequence: block.sequence(),
87+
node_id: block.node_id,
88+
sequence_start: block.start,
89+
sequence_end: block.end,
90+
// NOTE: We can't easily get the value for strand, but it doesn't matter
91+
// because this value is only used for writing segments
92+
strand: Strand::Forward,
93+
});
94+
}
6695
}
6796
}
6897

6998
let mut links = BTreeSet::new();
7099
for (source, target, edge_info) in graph.all_edges() {
71100
if !Node::is_terminal(source.node_id) && !Node::is_terminal(target.node_id) {
72-
let source_segment = Segment {
73-
sequence: "".to_string(),
74-
node_id: source.node_id,
75-
sequence_start: source.sequence_start,
76-
sequence_end: source.sequence_end,
77-
strand: edge_info[0].source_strand,
101+
let source_segment = if let Some(splits) = split_segments.get(&source.node_id) {
102+
let last_split = splits.last().unwrap();
103+
Segment {
104+
sequence: "".to_string(),
105+
node_id: source.node_id,
106+
sequence_start: last_split.0,
107+
sequence_end: last_split.1,
108+
strand: edge_info[0].source_strand,
109+
}
110+
} else {
111+
Segment {
112+
sequence: "".to_string(),
113+
node_id: source.node_id,
114+
sequence_start: source.sequence_start,
115+
sequence_end: source.sequence_end,
116+
strand: edge_info[0].source_strand,
117+
}
78118
};
79-
let target_segment = Segment {
80-
sequence: "".to_string(),
81-
node_id: target.node_id,
82-
sequence_start: target.sequence_start,
83-
sequence_end: target.sequence_end,
84-
strand: edge_info[0].target_strand,
119+
120+
let target_segment = if let Some(splits) = split_segments.get(&target.node_id) {
121+
let first_split = splits.first().unwrap();
122+
Segment {
123+
sequence: "".to_string(),
124+
node_id: target.node_id,
125+
sequence_start: first_split.0,
126+
sequence_end: first_split.1,
127+
strand: edge_info[0].source_strand,
128+
}
129+
} else {
130+
Segment {
131+
sequence: "".to_string(),
132+
node_id: target.node_id,
133+
sequence_start: target.sequence_start,
134+
sequence_end: target.sequence_end,
135+
strand: edge_info[0].target_strand,
136+
}
85137
};
86138

87139
links.insert(Link {
@@ -92,7 +144,33 @@ pub fn export_gfa(
92144
});
93145
}
94146
}
95-
let paths = get_paths(conn, collection_name, sample_name, &graph);
147+
148+
for (node_id, splits) in split_segments.iter() {
149+
for ((src_start, src_end), (dst_start, dst_end)) in splits.iter().tuple_windows() {
150+
let left = Segment {
151+
sequence: "".to_string(),
152+
node_id: *node_id,
153+
sequence_start: *src_start,
154+
sequence_end: *src_end,
155+
strand: Strand::Forward,
156+
};
157+
let right = Segment {
158+
sequence: "".to_string(),
159+
node_id: *node_id,
160+
sequence_start: *dst_start,
161+
sequence_end: *dst_end,
162+
strand: Strand::Forward,
163+
};
164+
links.insert(Link {
165+
source_segment_id: left.segment_id(),
166+
source_strand: Strand::Forward,
167+
target_segment_id: right.segment_id(),
168+
target_strand: Strand::Forward,
169+
});
170+
}
171+
}
172+
173+
let paths = get_paths(conn, collection_name, sample_name, &graph, &split_segments);
96174
write_segments(&mut writer, &segments.iter().collect::<Vec<&Segment>>());
97175
write_links(&mut writer, &links.iter().collect::<Vec<&Link>>());
98176
write_paths(&mut writer, paths);
@@ -103,6 +181,7 @@ fn get_paths(
103181
collection_name: &str,
104182
sample_name: Option<String>,
105183
graph: &GenGraph,
184+
split_segments: &HashMap<i64, Vec<(i64, i64)>>,
106185
) -> HashMap<String, Vec<(String, Strand)>> {
107186
let paths = Path::query_for_collection_and_sample(conn, collection_name, sample_name);
108187

@@ -127,19 +206,34 @@ fn get_paths(
127206
.iter()
128207
.filter_map(|(node, strand)| {
129208
if !Node::is_terminal(node.node_id) {
130-
Some((
131-
format!(
132-
"{id}.{ss}.{se}",
133-
id = node.node_id,
134-
ss = node.sequence_start,
135-
se = node.sequence_end
136-
),
137-
*strand,
138-
))
209+
if let Some(splits) = split_segments.get(&node.node_id) {
210+
Some(
211+
splits
212+
.iter()
213+
.map(|(start, end)| {
214+
(
215+
format!("{id}.{start}.{end}", id = node.node_id),
216+
*strand,
217+
)
218+
})
219+
.collect::<Vec<_>>(),
220+
)
221+
} else {
222+
Some(vec![(
223+
format!(
224+
"{id}.{ss}.{se}",
225+
id = node.node_id,
226+
ss = node.sequence_start,
227+
se = node.sequence_end
228+
),
229+
*strand,
230+
)])
231+
}
139232
} else {
140233
None
141234
}
142235
})
236+
.flatten()
143237
.collect::<Vec<_>>(),
144238
);
145239
} else {
@@ -320,7 +414,7 @@ mod tests {
320414
let mut gfa_path = PathBuf::from(temp_dir.path());
321415
gfa_path.push("intermediate.gfa");
322416

323-
export_gfa(conn, collection_name, &gfa_path, None);
417+
export_gfa(conn, collection_name, &gfa_path, None, None);
324418
// NOTE: Not directly checking file contents because segments are written in random order
325419
let _ = import_gfa(&gfa_path, "test collection 2", None, conn, op_conn);
326420

@@ -336,6 +430,53 @@ mod tests {
336430
assert_eq!(paths[0].sequence(conn), "AAAATTTTGGGGCCCC");
337431
}
338432

433+
#[test]
434+
fn test_splits_nodes() {
435+
setup_gen_dir();
436+
let conn = &get_connection(None);
437+
let db_uuid = metadata::get_db_uuid(conn);
438+
let op_conn = &get_operation_connection(None);
439+
setup_db(op_conn, &db_uuid);
440+
441+
let (bg_id, _path) = setup_block_group(conn);
442+
let all_sequences = BlockGroup::get_all_sequences(conn, bg_id, false);
443+
444+
let temp_dir = tempdir().expect("Couldn't get handle to temp directory");
445+
let gfa_path = PathBuf::from(temp_dir.path()).join("split.gfa");
446+
447+
export_gfa(conn, "test", &gfa_path, None, 5);
448+
449+
let _ = import_gfa(&gfa_path, "test collection 2", None, conn, op_conn);
450+
451+
let block_group2 = Collection::get_block_groups(conn, "test collection 2")
452+
.pop()
453+
.unwrap();
454+
let all_sequences2 = BlockGroup::get_all_sequences(conn, block_group2.id, false);
455+
456+
assert_eq!(all_sequences, all_sequences2);
457+
458+
let graph = BlockGroup::get_graph(conn, block_group2.id);
459+
let graph_nodes = graph
460+
.nodes()
461+
.filter_map(|node| {
462+
if Node::is_terminal(node.node_id) {
463+
None
464+
} else {
465+
Some(node.node_id)
466+
}
467+
})
468+
.collect::<Vec<_>>();
469+
let node_sequences = Node::get_sequences_by_node_ids(conn, &graph_nodes);
470+
assert!(node_sequences.len() > 1);
471+
for sequence in node_sequences.values() {
472+
assert!(
473+
sequence.length <= 5,
474+
"Sequence length {l} > 5",
475+
l = sequence.length
476+
);
477+
}
478+
}
479+
339480
#[test]
340481
fn test_simple_round_trip() {
341482
setup_gen_dir();
@@ -355,7 +496,7 @@ mod tests {
355496
let mut gfa_path = PathBuf::from(temp_dir.path());
356497
gfa_path.push("intermediate.gfa");
357498

358-
export_gfa(conn, &collection_name, &gfa_path, None);
499+
export_gfa(conn, &collection_name, &gfa_path, None, None);
359500
let _ = import_gfa(&gfa_path, "test collection 2", None, conn, op_conn);
360501

361502
let block_group2 = Collection::get_block_groups(conn, "test collection 2")
@@ -385,7 +526,7 @@ mod tests {
385526
let mut gfa_path = PathBuf::from(temp_dir.path());
386527
gfa_path.push("intermediate.gfa");
387528

388-
export_gfa(conn, &collection_name, &gfa_path, None);
529+
export_gfa(conn, &collection_name, &gfa_path, None, None);
389530
let _ = import_gfa(&gfa_path, "anderson promoters 2", None, conn, op_conn);
390531

391532
let block_group2 = Collection::get_block_groups(conn, "anderson promoters 2")
@@ -415,7 +556,7 @@ mod tests {
415556
let mut gfa_path = PathBuf::from(temp_dir.path());
416557
gfa_path.push("intermediate.gfa");
417558

418-
export_gfa(conn, &collection_name, &gfa_path, None);
559+
export_gfa(conn, &collection_name, &gfa_path, None, None);
419560
let _ = import_gfa(&gfa_path, "test collection 2", None, conn, op_conn);
420561

421562
let block_group2 = Collection::get_block_groups(conn, "test collection 2")
@@ -504,7 +645,7 @@ mod tests {
504645
let temp_dir = tempdir().expect("Couldn't get handle to temp directory");
505646
let mut gfa_path = PathBuf::from(temp_dir.path());
506647
gfa_path.push("intermediate.gfa");
507-
export_gfa(conn, "test", &gfa_path, None);
648+
export_gfa(conn, "test", &gfa_path, None, None);
508649
let _ = import_gfa(&gfa_path, "test collection 2", None, conn, op_conn);
509650

510651
let block_group2 = Collection::get_block_groups(conn, "test collection 2")

src/main.rs

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -322,6 +322,9 @@ enum Commands {
322322
/// The name of the GenBank file to export to
323323
#[arg(long)]
324324
gb: Option<String>,
325+
/// The max node size for gfa export
326+
#[arg(long)]
327+
node_max: Option<i64>,
325328
},
326329
/// Configure default options
327330
#[command(arg_required_else_help(true))]
@@ -1031,14 +1034,21 @@ fn main() {
10311034
gfa,
10321035
sample,
10331036
fasta,
1037+
node_max,
10341038
}) => {
10351039
let name = &name
10361040
.clone()
10371041
.unwrap_or_else(|| get_default_collection(&operation_conn));
10381042
conn.execute("BEGIN TRANSACTION", []).unwrap();
10391043
operation_conn.execute("BEGIN TRANSACTION", []).unwrap();
10401044
if let Some(gfa_path) = gfa {
1041-
export_gfa(&conn, name, &PathBuf::from(gfa_path), sample.clone());
1045+
export_gfa(
1046+
&conn,
1047+
name,
1048+
&PathBuf::from(gfa_path),
1049+
sample.clone(),
1050+
*node_max,
1051+
);
10421052
} else if let Some(fasta_path) = fasta {
10431053
export_fasta(
10441054
&conn,

0 commit comments

Comments
 (0)