Skip to content

Commit de89db6

Browse files
committed
Fix support to GFA2 & also support GFA1, fixes #59
1 parent e3e5848 commit de89db6

File tree

19 files changed

+242
-899
lines changed

19 files changed

+242
-899
lines changed

crates/api/example/src/main.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@ fn main() {
2727
panic!("Unrecoverable error: {}", msg);
2828
}
2929
}),
30-
gfa_output: false,
3130
disk_optimization_level: 5,
3231
})
3332
.unwrap();
@@ -63,7 +62,7 @@ fn main() {
6362
true,
6463
1,
6564
ExtraElaboration::UnitigLinks,
66-
false,
65+
None,
6766
5,
6867
)
6968
.unwrap();

crates/api/src/lib.rs

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ use config::{MAX_BUCKET_CHUNK_SIZE, MIN_BUCKET_CHUNK_SIZE};
99
pub use ggcat_logging::MessageLevel;
1010
use ggcat_logging::UnrecoverableErrorLogging;
1111
use io::concurrent::structured_sequences::fasta::FastaWriterWrapper;
12-
use io::concurrent::structured_sequences::gfa::GFAWriterWrapper;
12+
use io::concurrent::structured_sequences::gfa::{GFAWriterWrapperV1, GFAWriterWrapperV2};
1313
use io::concurrent::structured_sequences::StructuredSequenceBackendWrapper;
1414
use io::sequences_stream::fasta::FastaFileSequencesStream;
1515
use io::sequences_stream::GenericSequencesStream;
@@ -59,11 +59,18 @@ pub mod debug {
5959
pub static BUCKETS_COUNT_LOG_FORCE: Mutex<Option<usize>> = Mutex::new(None);
6060
}
6161

62+
#[derive(Clone, Copy, Debug)]
6263
pub enum LoggingMode {
6364
Log,
6465
ForceStdout,
6566
}
6667

68+
#[derive(Clone, Copy, Debug)]
69+
pub enum GfaVersion {
70+
V1,
71+
V2,
72+
}
73+
6774
/// Main config of GGCAT. This config is global and should be passed to GGCATInstance::create
6875
pub struct GGCATConfig {
6976
/// Directory for temporary files
@@ -90,9 +97,6 @@ pub struct GGCATConfig {
9097
/// The messages callback, if present, no output will be automatically written to stdout
9198
pub messages_callback: Option<fn(MessageLevel, &str)>,
9299

93-
/// Output GFA format instead of FASTA
94-
pub gfa_output: bool,
95-
96100
/// Sets the level of disk usage reduction optimization
97101
pub disk_optimization_level: u32,
98102
}
@@ -231,7 +235,7 @@ impl GGCATInstance {
231235

232236
extra_elab: ExtraElaboration,
233237

234-
gfa_output: bool,
238+
gfa_output_version: Option<GfaVersion>,
235239

236240
disk_optimization_level: u32,
237241
) -> anyhow::Result<PathBuf> {
@@ -249,14 +253,14 @@ impl GGCATInstance {
249253
NonColoredManager::dynamic_dispatch_id()
250254
};
251255

252-
if gfa_output && colors {
256+
if gfa_output_version.is_some() && colors {
253257
anyhow::bail!("GFA output is not supported with colors");
254258
}
255259

256-
let output_mode = if gfa_output {
257-
GFAWriterWrapper::dynamic_dispatch_id()
258-
} else {
259-
FastaWriterWrapper::dynamic_dispatch_id()
260+
let output_mode = match gfa_output_version {
261+
None => FastaWriterWrapper::dynamic_dispatch_id(),
262+
Some(GfaVersion::V1) => GFAWriterWrapperV1::dynamic_dispatch_id(),
263+
Some(GfaVersion::V2) => GFAWriterWrapperV2::dynamic_dispatch_id(),
260264
};
261265

262266
let temp_dir = create_tempdir(self.0.temp_dir.clone());

crates/assembler/src/lib.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ use config::{
1919
use hashes::HashFunctionFactory;
2020
use io::concurrent::structured_sequences::binary::StructSeqBinaryWriter;
2121
use io::concurrent::structured_sequences::fasta::FastaWriterWrapper;
22-
use io::concurrent::structured_sequences::gfa::GFAWriterWrapper;
22+
use io::concurrent::structured_sequences::gfa::GFAWriterWrapperV1;
23+
use io::concurrent::structured_sequences::gfa::GFAWriterWrapperV2;
2324
use io::concurrent::structured_sequences::{
2425
IdentSequenceWriter, StructuredSequenceBackend, StructuredSequenceBackendInit,
2526
StructuredSequenceBackendWrapper, StructuredSequenceWriter,
@@ -95,7 +96,8 @@ fn get_writer<
9596
colors::non_colored::NonColoredManager,
9697
], OutputMode = [
9798
FastaWriterWrapper,
98-
#[cfg(not(feature = "devel-build"))] GFAWriterWrapper
99+
#[cfg(not(feature = "devel-build"))] GFAWriterWrapperV1,
100+
#[cfg(not(feature = "devel-build"))] GFAWriterWrapperV2
99101
])]
100102
pub fn run_assembler<
101103
MergingHash: HashFunctionFactory,

crates/assembler/src/pipeline/maximal_unitig_links.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,7 @@ pub fn build_maximal_unitigs_links<
172172
index,
173173
MaximalUnitigPosition::Beginning,
174174
first_hash.is_forward(),
175+
0,
175176
),
176177
);
177178

@@ -188,6 +189,7 @@ pub fn build_maximal_unitigs_links<
188189
index,
189190
MaximalUnitigPosition::Beginning,
190191
!first_hash.is_forward(),
192+
0,
191193
),
192194
);
193195
}
@@ -204,6 +206,7 @@ pub fn build_maximal_unitigs_links<
204206
index,
205207
MaximalUnitigPosition::Ending,
206208
!last_hash.is_forward(),
209+
read_len as u64 - k as u64 + 1,
207210
),
208211
);
209212

@@ -220,6 +223,7 @@ pub fn build_maximal_unitigs_links<
220223
index,
221224
MaximalUnitigPosition::Ending,
222225
last_hash.is_forward(),
226+
read_len as u64 - k as u64 + 1,
223227
),
224228
);
225229
}
@@ -310,6 +314,7 @@ pub fn build_maximal_unitigs_links<
310314
.map(|v| {
311315
MaximalUnitigIndex::new(
312316
v.entry(),
317+
v.overlap_start(),
313318
MaximalUnitigFlags::new_direction(
314319
val.position() == MaximalUnitigPosition::Beginning,
315320
v.position() == MaximalUnitigPosition::Ending,

crates/assembler/src/pipeline/maximal_unitig_links/maximal_hash_entry.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ pub enum MaximalUnitigPosition {
1919
pub struct MaximalHashEntry<H: Copy> {
2020
pub hash: H,
2121
encoded: u64,
22+
overlap_start: u64,
2223
}
2324

2425
impl<H: Copy> MaximalHashEntry<H> {
@@ -31,6 +32,7 @@ impl<H: Copy> MaximalHashEntry<H> {
3132
entry: u64,
3233
position: MaximalUnitigPosition,
3334
direction_forward: bool,
35+
overlap_start: u64,
3436
) -> Self {
3537
Self {
3638
hash,
@@ -40,13 +42,18 @@ impl<H: Copy> MaximalHashEntry<H> {
4042
MaximalUnitigPosition::Beginning => 0,
4143
}) << Self::POSITION_OFFSET)
4244
| ((if direction_forward { 1 } else { 0 }) << Self::DIRECTION_OFFSET),
45+
overlap_start,
4346
}
4447
}
4548

4649
pub fn entry(&self) -> u64 {
4750
self.encoded >> Self::ENTRY_OFFSET
4851
}
4952

53+
pub fn overlap_start(&self) -> u64 {
54+
self.overlap_start
55+
}
56+
5057
pub fn position(&self) -> MaximalUnitigPosition {
5158
if (self.encoded >> Self::POSITION_OFFSET) & 0x1 == 0 {
5259
MaximalUnitigPosition::Beginning

crates/assembler/src/pipeline/maximal_unitig_links/maximal_unitig_index.rs

Lines changed: 69 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ impl MaximalUnitigFlags {
4444
#[derive(Copy, Clone, Eq)]
4545
pub struct MaximalUnitigIndex {
4646
index: u64,
47+
// The non reverse-complemented start of the link overlap (used for gfa2)
48+
overlap_start: u64,
4749
pub flags: MaximalUnitigFlags,
4850
}
4951

@@ -63,12 +65,18 @@ impl HasEmptyExtraBuffer for MaximalUnitigIndex {}
6365
impl SequenceExtraData for MaximalUnitigIndex {
6466
fn decode_extended(_: &mut (), reader: &mut impl Read) -> Option<Self> {
6567
let index = decode_varint(|| reader.read_u8().ok())?;
68+
let overlap_start = decode_varint(|| reader.read_u8().ok())?;
6669
let flags = reader.read_u8().ok()?;
67-
Some(MaximalUnitigIndex::new(index, MaximalUnitigFlags(flags)))
70+
Some(MaximalUnitigIndex::new(
71+
index,
72+
overlap_start,
73+
MaximalUnitigFlags(flags),
74+
))
6875
}
6976

7077
fn encode_extended(&self, _: &(), writer: &mut impl Write) {
7178
encode_varint(|b| writer.write_all(b).ok(), self.index() as u64).unwrap();
79+
encode_varint(|b| writer.write_all(b).ok(), self.overlap_start).unwrap();
7280
writer.write_all(&[self.flags.0]).unwrap();
7381
}
7482

@@ -88,8 +96,12 @@ impl Debug for MaximalUnitigIndex {
8896

8997
impl MaximalUnitigIndex {
9098
#[inline]
91-
pub fn new(index: u64, flags: MaximalUnitigFlags) -> Self {
92-
Self { index, flags }
99+
pub fn new(index: u64, overlap_start: u64, flags: MaximalUnitigFlags) -> Self {
100+
Self {
101+
index,
102+
overlap_start,
103+
flags,
104+
}
93105
}
94106

95107
#[inline]
@@ -148,6 +160,7 @@ impl BucketItemSerializer for MaximalUnitigLinkSerializer {
148160

149161
for entry in entries {
150162
encode_varint(|b| bucket.write_all(b), entry.index() as u64).unwrap();
163+
encode_varint(|b| bucket.write_all(b), entry.overlap_start).unwrap();
151164
bucket.push(entry.flags.0);
152165
}
153166
}
@@ -165,8 +178,13 @@ impl BucketItemSerializer for MaximalUnitigLinkSerializer {
165178
let start = read_buffer.len();
166179
for _i in 0..len {
167180
let index = decode_varint(|| stream.read_u8().ok())?;
181+
let overlap_start = decode_varint(|| stream.read_u8().ok())?;
168182
let flags = stream.read_u8().ok()?;
169-
read_buffer.push(MaximalUnitigIndex::new(index, MaximalUnitigFlags(flags)));
183+
read_buffer.push(MaximalUnitigIndex::new(
184+
index,
185+
overlap_start,
186+
MaximalUnitigFlags(flags),
187+
));
170188
}
171189

172190
Some(MaximalUnitigLink::new(entry, VecSlice::new(start, len)))
@@ -272,26 +290,61 @@ impl IdentSequenceWriter for DoubleMaximalUnitigLinks {
272290
}
273291

274292
#[allow(unused_variables)]
275-
fn write_as_gfa(
293+
fn write_as_gfa<const VERSION: u32>(
276294
&self,
277295
k: u64,
278296
index: u64,
297+
length: u64,
279298
stream: &mut impl Write,
280299
extra_buffer: &Self::TempBuffer,
281300
) {
282301
for entries in &self.links {
283302
let entries = entries.entries.get_slice(extra_buffer);
284-
for entry in entries {
285-
writeln!(
286-
stream,
287-
"L\t{}\t{}\t{}\t{}\t{}M",
288-
index,
289-
if entry.flags.flip_current() { "-" } else { "+" },
290-
entry.index,
291-
if entry.flags.flip_other() { "-" } else { "+" },
292-
k - 1
293-
)
294-
.unwrap();
303+
if VERSION == 1 {
304+
for entry in entries {
305+
// L <index> < +/- > <other_index> < +/- > <overlap>
306+
writeln!(
307+
stream,
308+
"L\t{}\t{}\t{}\t{}\t{}M",
309+
index,
310+
if entry.flags.flip_current() { '-' } else { '+' },
311+
entry.index,
312+
if entry.flags.flip_other() { '-' } else { '+' },
313+
k - 1
314+
)
315+
.unwrap();
316+
}
317+
} else if VERSION == 2 {
318+
for entry in entries {
319+
let (b1, e1, is_end1) = if entry.flags.flip_current() {
320+
(0, k - 1, false)
321+
} else {
322+
(length - k + 1, length, true)
323+
};
324+
325+
let (b2, e2, is_end2) = (
326+
entry.overlap_start,
327+
entry.overlap_start + k - 1,
328+
entry.overlap_start > 0,
329+
);
330+
331+
// E * <index>< +/- > <other_index>< +/- > <b1> <e1> <b2> <e2>
332+
writeln!(
333+
stream,
334+
"E\t*\t{}{}\t{}{}\t{}\t{}{}\t{}\t{}{}",
335+
index,
336+
if entry.flags.flip_current() { '-' } else { '+' },
337+
entry.index,
338+
if entry.flags.flip_other() { '-' } else { '+' },
339+
b1,
340+
e1,
341+
if is_end1 { "$" } else { "" },
342+
b2,
343+
e2,
344+
if is_end2 { "$" } else { "" }
345+
)
346+
.unwrap();
347+
}
295348
}
296349
}
297350
}

crates/capi/build.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ fn main() {
1111
std::env::var("OUT_DIR").unwrap(),
1212
"/cxxbridge/include/ggcat-cpp-bindings/src/lib.rs.h"
1313
),
14-
"ggcat-cpp-api/src/ggcat-cpp-bindings.hh",
14+
"ggcat-cpp-api/include/ggcat-cpp-bindings.hh",
1515
)
1616
.unwrap();
1717

0 commit comments

Comments
 (0)