@@ -4,6 +4,7 @@ use crate::models::{
44 block_group:: BlockGroup , block_group_edge:: BlockGroupEdge , collection:: Collection , edge:: Edge ,
55 node:: Node , path:: Path , sample:: Sample , strand:: Strand ,
66} ;
7+ use itertools:: Itertools ;
78use rusqlite:: Connection ;
89use std:: collections:: { BTreeSet , HashMap , HashSet } ;
910use std:: fs:: File ;
@@ -15,7 +16,9 @@ pub fn export_gfa(
1516 collection_name : & str ,
1617 filename : & PathBuf ,
1718 sample_name : Option < String > ,
19+ max_size : impl Into < Option < i64 > > ,
1820) {
21+ let chunk_size = max_size. into ( ) . unwrap_or ( i64:: MAX ) ;
1922 // General note about how we encode segment IDs. The node ID and the start coordinate in the
2023 // sequence are all that's needed, because the end coordinate can be inferred from the length of
2124 // the segment's sequence. So the segment ID is of the form <node ID>.<start coordinate>
@@ -52,36 +55,85 @@ pub fn export_gfa(
5255 let mut writer = BufWriter :: new ( file) ;
5356
5457 let mut segments = BTreeSet :: new ( ) ;
58+ let mut split_segments = HashMap :: new ( ) ;
5559 for block in & blocks {
5660 if !Node :: is_terminal ( block. node_id ) {
57- segments. insert ( Segment {
58- sequence : block. sequence ( ) ,
59- node_id : block. node_id ,
60- sequence_start : block. start ,
61- sequence_end : block. end ,
62- // NOTE: We can't easily get the value for strand, but it doesn't matter
63- // because this value is only used for writing segments
64- strand : Strand :: Forward ,
65- } ) ;
61+ if block. end - block. start > chunk_size {
62+ let mut sub_segments = vec ! [ ] ;
63+ let block_sequence = block. sequence ( ) ;
64+ for ( index, sub_start) in ( block. start ..block. end )
65+ . step_by ( chunk_size as usize )
66+ . enumerate ( )
67+ {
68+ let sub_end = ( sub_start + chunk_size) . min ( block. end ) ;
69+ let seq_start = index as i64 * chunk_size;
70+ let seq_end =
71+ ( ( index as i64 + 1 ) * chunk_size) . min ( block_sequence. len ( ) as i64 ) ;
72+ segments. insert ( Segment {
73+ sequence : block_sequence[ seq_start as usize ..seq_end as usize ] . to_string ( ) ,
74+ node_id : block. node_id ,
75+ sequence_start : sub_start,
76+ sequence_end : sub_end,
77+ // NOTE: We can't easily get the value for strand, but it doesn't matter
78+ // because this value is only used for writing segments
79+ strand : Strand :: Forward ,
80+ } ) ;
81+ sub_segments. push ( ( sub_start, sub_end) ) ;
82+ }
83+ split_segments. insert ( block. node_id , sub_segments) ;
84+ } else {
85+ segments. insert ( Segment {
86+ sequence : block. sequence ( ) ,
87+ node_id : block. node_id ,
88+ sequence_start : block. start ,
89+ sequence_end : block. end ,
90+ // NOTE: We can't easily get the value for strand, but it doesn't matter
91+ // because this value is only used for writing segments
92+ strand : Strand :: Forward ,
93+ } ) ;
94+ }
6695 }
6796 }
6897
6998 let mut links = BTreeSet :: new ( ) ;
7099 for ( source, target, edge_info) in graph. all_edges ( ) {
71100 if !Node :: is_terminal ( source. node_id ) && !Node :: is_terminal ( target. node_id ) {
72- let source_segment = Segment {
73- sequence : "" . to_string ( ) ,
74- node_id : source. node_id ,
75- sequence_start : source. sequence_start ,
76- sequence_end : source. sequence_end ,
77- strand : edge_info[ 0 ] . source_strand ,
101+ let source_segment = if let Some ( splits) = split_segments. get ( & source. node_id ) {
102+ let last_split = splits. last ( ) . unwrap ( ) ;
103+ Segment {
104+ sequence : "" . to_string ( ) ,
105+ node_id : source. node_id ,
106+ sequence_start : last_split. 0 ,
107+ sequence_end : last_split. 1 ,
108+ strand : edge_info[ 0 ] . source_strand ,
109+ }
110+ } else {
111+ Segment {
112+ sequence : "" . to_string ( ) ,
113+ node_id : source. node_id ,
114+ sequence_start : source. sequence_start ,
115+ sequence_end : source. sequence_end ,
116+ strand : edge_info[ 0 ] . source_strand ,
117+ }
78118 } ;
79- let target_segment = Segment {
80- sequence : "" . to_string ( ) ,
81- node_id : target. node_id ,
82- sequence_start : target. sequence_start ,
83- sequence_end : target. sequence_end ,
84- strand : edge_info[ 0 ] . target_strand ,
119+
120+ let target_segment = if let Some ( splits) = split_segments. get ( & target. node_id ) {
121+ let first_split = splits. first ( ) . unwrap ( ) ;
122+ Segment {
123+ sequence : "" . to_string ( ) ,
124+ node_id : target. node_id ,
125+ sequence_start : first_split. 0 ,
126+ sequence_end : first_split. 1 ,
127+ strand : edge_info[ 0 ] . source_strand ,
128+ }
129+ } else {
130+ Segment {
131+ sequence : "" . to_string ( ) ,
132+ node_id : target. node_id ,
133+ sequence_start : target. sequence_start ,
134+ sequence_end : target. sequence_end ,
135+ strand : edge_info[ 0 ] . target_strand ,
136+ }
85137 } ;
86138
87139 links. insert ( Link {
@@ -92,7 +144,33 @@ pub fn export_gfa(
92144 } ) ;
93145 }
94146 }
95- let paths = get_paths ( conn, collection_name, sample_name, & graph) ;
147+
148+ for ( node_id, splits) in split_segments. iter ( ) {
149+ for ( ( src_start, src_end) , ( dst_start, dst_end) ) in splits. iter ( ) . tuple_windows ( ) {
150+ let left = Segment {
151+ sequence : "" . to_string ( ) ,
152+ node_id : * node_id,
153+ sequence_start : * src_start,
154+ sequence_end : * src_end,
155+ strand : Strand :: Forward ,
156+ } ;
157+ let right = Segment {
158+ sequence : "" . to_string ( ) ,
159+ node_id : * node_id,
160+ sequence_start : * dst_start,
161+ sequence_end : * dst_end,
162+ strand : Strand :: Forward ,
163+ } ;
164+ links. insert ( Link {
165+ source_segment_id : left. segment_id ( ) ,
166+ source_strand : Strand :: Forward ,
167+ target_segment_id : right. segment_id ( ) ,
168+ target_strand : Strand :: Forward ,
169+ } ) ;
170+ }
171+ }
172+
173+ let paths = get_paths ( conn, collection_name, sample_name, & graph, & split_segments) ;
96174 write_segments ( & mut writer, & segments. iter ( ) . collect :: < Vec < & Segment > > ( ) ) ;
97175 write_links ( & mut writer, & links. iter ( ) . collect :: < Vec < & Link > > ( ) ) ;
98176 write_paths ( & mut writer, paths) ;
@@ -103,6 +181,7 @@ fn get_paths(
103181 collection_name : & str ,
104182 sample_name : Option < String > ,
105183 graph : & GenGraph ,
184+ split_segments : & HashMap < i64 , Vec < ( i64 , i64 ) > > ,
106185) -> HashMap < String , Vec < ( String , Strand ) > > {
107186 let paths = Path :: query_for_collection_and_sample ( conn, collection_name, sample_name) ;
108187
@@ -127,19 +206,34 @@ fn get_paths(
127206 . iter ( )
128207 . filter_map ( |( node, strand) | {
129208 if !Node :: is_terminal ( node. node_id ) {
130- Some ( (
131- format ! (
132- "{id}.{ss}.{se}" ,
133- id = node. node_id,
134- ss = node. sequence_start,
135- se = node. sequence_end
136- ) ,
137- * strand,
138- ) )
209+ if let Some ( splits) = split_segments. get ( & node. node_id ) {
210+ Some (
211+ splits
212+ . iter ( )
213+ . map ( |( start, end) | {
214+ (
215+ format ! ( "{id}.{start}.{end}" , id = node. node_id) ,
216+ * strand,
217+ )
218+ } )
219+ . collect :: < Vec < _ > > ( ) ,
220+ )
221+ } else {
222+ Some ( vec ! [ (
223+ format!(
224+ "{id}.{ss}.{se}" ,
225+ id = node. node_id,
226+ ss = node. sequence_start,
227+ se = node. sequence_end
228+ ) ,
229+ * strand,
230+ ) ] )
231+ }
139232 } else {
140233 None
141234 }
142235 } )
236+ . flatten ( )
143237 . collect :: < Vec < _ > > ( ) ,
144238 ) ;
145239 } else {
@@ -320,7 +414,7 @@ mod tests {
320414 let mut gfa_path = PathBuf :: from ( temp_dir. path ( ) ) ;
321415 gfa_path. push ( "intermediate.gfa" ) ;
322416
323- export_gfa ( conn, collection_name, & gfa_path, None ) ;
417+ export_gfa ( conn, collection_name, & gfa_path, None , None ) ;
324418 // NOTE: Not directly checking file contents because segments are written in random order
325419 let _ = import_gfa ( & gfa_path, "test collection 2" , None , conn, op_conn) ;
326420
@@ -336,6 +430,53 @@ mod tests {
336430 assert_eq ! ( paths[ 0 ] . sequence( conn) , "AAAATTTTGGGGCCCC" ) ;
337431 }
338432
433+ #[ test]
434+ fn test_splits_nodes ( ) {
435+ setup_gen_dir ( ) ;
436+ let conn = & get_connection ( None ) ;
437+ let db_uuid = metadata:: get_db_uuid ( conn) ;
438+ let op_conn = & get_operation_connection ( None ) ;
439+ setup_db ( op_conn, & db_uuid) ;
440+
441+ let ( bg_id, _path) = setup_block_group ( conn) ;
442+ let all_sequences = BlockGroup :: get_all_sequences ( conn, bg_id, false ) ;
443+
444+ let temp_dir = tempdir ( ) . expect ( "Couldn't get handle to temp directory" ) ;
445+ let gfa_path = PathBuf :: from ( temp_dir. path ( ) ) . join ( "split.gfa" ) ;
446+
447+ export_gfa ( conn, "test" , & gfa_path, None , 5 ) ;
448+
449+ let _ = import_gfa ( & gfa_path, "test collection 2" , None , conn, op_conn) ;
450+
451+ let block_group2 = Collection :: get_block_groups ( conn, "test collection 2" )
452+ . pop ( )
453+ . unwrap ( ) ;
454+ let all_sequences2 = BlockGroup :: get_all_sequences ( conn, block_group2. id , false ) ;
455+
456+ assert_eq ! ( all_sequences, all_sequences2) ;
457+
458+ let graph = BlockGroup :: get_graph ( conn, block_group2. id ) ;
459+ let graph_nodes = graph
460+ . nodes ( )
461+ . filter_map ( |node| {
462+ if Node :: is_terminal ( node. node_id ) {
463+ None
464+ } else {
465+ Some ( node. node_id )
466+ }
467+ } )
468+ . collect :: < Vec < _ > > ( ) ;
469+ let node_sequences = Node :: get_sequences_by_node_ids ( conn, & graph_nodes) ;
470+ assert ! ( node_sequences. len( ) > 1 ) ;
471+ for sequence in node_sequences. values ( ) {
472+ assert ! (
473+ sequence. length <= 5 ,
474+ "Sequence length {l} > 5" ,
475+ l = sequence. length
476+ ) ;
477+ }
478+ }
479+
339480 #[ test]
340481 fn test_simple_round_trip ( ) {
341482 setup_gen_dir ( ) ;
@@ -355,7 +496,7 @@ mod tests {
355496 let mut gfa_path = PathBuf :: from ( temp_dir. path ( ) ) ;
356497 gfa_path. push ( "intermediate.gfa" ) ;
357498
358- export_gfa ( conn, & collection_name, & gfa_path, None ) ;
499+ export_gfa ( conn, & collection_name, & gfa_path, None , None ) ;
359500 let _ = import_gfa ( & gfa_path, "test collection 2" , None , conn, op_conn) ;
360501
361502 let block_group2 = Collection :: get_block_groups ( conn, "test collection 2" )
@@ -385,7 +526,7 @@ mod tests {
385526 let mut gfa_path = PathBuf :: from ( temp_dir. path ( ) ) ;
386527 gfa_path. push ( "intermediate.gfa" ) ;
387528
388- export_gfa ( conn, & collection_name, & gfa_path, None ) ;
529+ export_gfa ( conn, & collection_name, & gfa_path, None , None ) ;
389530 let _ = import_gfa ( & gfa_path, "anderson promoters 2" , None , conn, op_conn) ;
390531
391532 let block_group2 = Collection :: get_block_groups ( conn, "anderson promoters 2" )
@@ -415,7 +556,7 @@ mod tests {
415556 let mut gfa_path = PathBuf :: from ( temp_dir. path ( ) ) ;
416557 gfa_path. push ( "intermediate.gfa" ) ;
417558
418- export_gfa ( conn, & collection_name, & gfa_path, None ) ;
559+ export_gfa ( conn, & collection_name, & gfa_path, None , None ) ;
419560 let _ = import_gfa ( & gfa_path, "test collection 2" , None , conn, op_conn) ;
420561
421562 let block_group2 = Collection :: get_block_groups ( conn, "test collection 2" )
@@ -504,7 +645,7 @@ mod tests {
504645 let temp_dir = tempdir ( ) . expect ( "Couldn't get handle to temp directory" ) ;
505646 let mut gfa_path = PathBuf :: from ( temp_dir. path ( ) ) ;
506647 gfa_path. push ( "intermediate.gfa" ) ;
507- export_gfa ( conn, "test" , & gfa_path, None ) ;
648+ export_gfa ( conn, "test" , & gfa_path, None , None ) ;
508649 let _ = import_gfa ( & gfa_path, "test collection 2" , None , conn, op_conn) ;
509650
510651 let block_group2 = Collection :: get_block_groups ( conn, "test collection 2" )
0 commit comments