Skip to content

Commit dacd5b7

Browse files
Merge pull request #99 from pangenome/impg_lace
Add `impg lace` command
2 parents bee75f7 + e1b6ee4 commit dacd5b7

25 files changed

+2706
-309
lines changed

Cargo.lock

Lines changed: 721 additions & 24 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ clap = { version = "4.5.42", features = ["derive"] }
1212
coitrees = "0.4.0"
1313
num_cpus = "1.17.0"
1414
rayon = "1.10.0"
15-
noodles = { version = "0.99.0", features = ["bgzf"] }
15+
noodles = { version = "0.100.0", features = ["bgzf"] }
1616
regex = "1.11.1"
1717
log = "0.4.27"
1818
env_logger = "0.11.8"
@@ -28,6 +28,14 @@ nalgebra = "0.33.2"
2828

2929
agc-rs = { git = "https://github.com/pangenome/agc-rs.git", optional = true }
3030

31+
# Additional dependencies for lace functionality
32+
handlegraph = { git = "https://github.com/chfi/rs-handlegraph", rev = "3ac575e4216ce16a16667503a8875e469a40a97a" }
33+
bitvec = "1.0.1"
34+
niffler = "2.7.0"
35+
tempfile = "3.20.0"
36+
zstd = { version = "0.13.3", features = ["zstdmt"] }
37+
gzp = "1.0.1"
38+
3139
[features]
3240
default = ["agc"]
3341
agc = ["agc-rs"]

README.md

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,71 @@ Print alignment statistics:
183183
impg stats -p alignments.paf
184184
```
185185

186+
### Lace
187+
188+
Combine multiple GFA files into a unified pangenome graph:
189+
190+
```bash
191+
# Combine multiple GFA files (you can mix compressed and uncompressed files)
192+
impg lace -g file1.gfa file2.gfa file3.gfa -o combined.gfa
193+
194+
# Use a list file containing GFA paths
195+
impg lace -l gfa_files.txt -o combined.gfa
196+
197+
# Fill gaps between contiguous path segments
198+
impg lace -g *.gfa -o combined.gfa --fill-gaps 1 # Fill with N's
199+
impg lace -g *.gfa -o combined.gfa --fill-gaps 1 --sequence-files sequence.fa # Fill with sequences
200+
201+
# Fill all gaps, including start and end gaps (end gap filling requires sequence files)
202+
impg lace -g *.gfa -o combined.gfa --fill-gaps 2 # Fill with N's
203+
impg lace -g *.gfa -o combined.gfa --fill-gaps 2 --sequence-files sequence.fa # Fill with sequences
204+
205+
# Control output compression
206+
impg lace -g *.gfa -o combined.gfa.gz --compress gzip
207+
impg lace -g *.gfa -o combined.gfa.bgz --compress bgzip
208+
impg lace -g *.gfa -o combined.gfa.zst --compress zstd
209+
210+
# Use custom temporary directory
211+
impg lace -g *.gfa -o combined.gfa --temp-dir /tmp/lace_work
212+
```
213+
214+
#### Path Name Format
215+
216+
The command expects path names in the format:
217+
218+
```
219+
NAME:START-END
220+
```
221+
222+
Example: `HG002#1#chr20:1000-2000`
223+
224+
The command uses these coordinates to:
225+
1. Identify which sequences belong together
226+
2. Order the sequences correctly
227+
3. Detect and handle overlaps or gaps
228+
229+
Note: `NAME` can contain ':' characters. When parsing coordinates, the command uses the last occurrence of ':' to separate the name from the coordinate range.
230+
231+
#### Post-processing recommendations
232+
233+
After combining the GFA files, the resulting graph will already have compacted node IDs ranging from `1` to the total number of nodes. However, it is strongly recommended to perform post-processing steps using **[ODGI](https://github.com/pangenome/odgi)** to unchop and sort the graph.
234+
235+
```bash
236+
odgi unchop -i combined.gfa -o - -t 16 | \
237+
odgi sort -i - -o - -p gYs -t 16 | \
238+
odgi view -i - -g > combined.final.gfa
239+
```
240+
241+
If overlaps were present, and then trimmed during the merging process, it's advisable to run **[GFAffix](https://github.com/marschall-lab/GFAffix)** before the ODGI pipeline to remove redundant nodes introduced by the overlap trimming.
242+
243+
```bash
244+
gfaffix combined.gfa -o combined.fix.gfa &> /dev/null
245+
246+
odgi unchop -i combined.fix.gfa -o - -t 16 | \
247+
odgi sort -i - -o - -p gYs -t 16 | \
248+
odgi view -i - -g > combined.final.gfa
249+
```
250+
186251
### Index
187252

188253
Create an IMPG index from PAF files:

src/agc_index.rs

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ impl AgcIndex {
5353
if !agc.open(agc_path, true) {
5454
return Err(io::Error::new(
5555
io::ErrorKind::InvalidData,
56-
format!("Failed to open AGC file: {}", agc_path),
56+
format!("Failed to open AGC file: {agc_path}"),
5757
));
5858
}
5959

@@ -78,7 +78,7 @@ impl AgcIndex {
7878
for (sample, contigs) in sample_contigs {
7979
for contig in contigs {
8080
// Create a key that combines full contig name and sample name
81-
let key = format!("{}@{}", contig, sample);
81+
let key = format!("{contig}@{sample}");
8282
index.sample_contig_to_agc.insert(key.clone(), agc_idx);
8383

8484
// Also insert just the full contig name if it's unique
@@ -102,7 +102,7 @@ impl AgcIndex {
102102
// If short name differs from full name, also create mappings for short name
103103
if short_contig != contig {
104104
// Create key with short contig name and sample
105-
let short_key = format!("{}@{}", short_contig, sample);
105+
let short_key = format!("{short_contig}@{sample}");
106106
index
107107
.sample_contig_to_agc
108108
.entry(short_key)
@@ -154,7 +154,7 @@ impl AgcIndex {
154154
let agc_idx = agc_idx.ok_or_else(|| {
155155
io::Error::new(
156156
io::ErrorKind::NotFound,
157-
format!("Sequence '{}' not found in any AGC file", seq_name),
157+
format!("Sequence '{seq_name}' not found in any AGC file"),
158158
)
159159
})?;
160160

@@ -164,8 +164,7 @@ impl AgcIndex {
164164
.get_contig_sequence(&sample, &contig, start, end - 1)
165165
.map_err(|e| {
166166
io::Error::other(format!(
167-
"Failed to fetch sequence '{}@{}:{}:{}': {}",
168-
contig, sample, start, end, e
167+
"Failed to fetch sequence '{contig}@{sample}:{start}:{end}': {e}"
169168
))
170169
})?;
171170

@@ -174,11 +173,11 @@ impl AgcIndex {
174173

175174
pub fn get_sequence_length(&self, seq_name: &str) -> io::Result<usize> {
176175
let (sample, contig, agc_idx) = self.parse_query(seq_name);
177-
176+
178177
let agc_idx = agc_idx.ok_or_else(|| {
179178
io::Error::new(
180179
io::ErrorKind::NotFound,
181-
format!("Sequence '{}' not found in any AGC file", seq_name),
180+
format!("Sequence '{seq_name}' not found in any AGC file"),
182181
)
183182
})?;
184183

0 commit comments

Comments
 (0)