Skip to content

Commit 0be68f8

Browse files
committed
Implement --low-memory argument for cluster/process
Uses skani to sketch genomes to a file and then search that file, to avoid holding the whole database in memory
1 parent 515ce26 commit 0be68f8

File tree

8 files changed

+338
-13
lines changed

8 files changed

+338
-13
lines changed

docs/preludes/cluster_prelude.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ galah cluster --ani 95 --precluster-ani 90 --precluster-method finch --genome-fa
1313
# Example: cluster a set of genomes and then their representatives against a set of reference genomes (reduces memory usage against clustering all together)
1414
galah cluster --genome-fasta-directory input_genomes/ --output-representative-list genome_reps.txt
1515
galah cluster --genome-fasta-list genome_reps.txt --reference-genomes-list reference_genomes.txt --output-cluster-definition clusters.tsv
16+
# Example: cluster a large set of genomes using low-memory mode
17+
galah cluster --low-memory --genome-fasta-directory input_genomes/ --output-representative-fasta-directory output_directory/
1618
```
1719

1820
### Precluster ANI

src/cluster_argument_parsing.rs

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@ pub struct GalahClustererCommandDefinition {
128128
pub dereplication_large_contigs_argument: String,
129129
pub dereplication_fraglen_argument: String,
130130
pub dereplication_cluster_contigs_argument: String,
131+
pub dereplication_low_memory_argument: String,
131132
pub dereplication_reference_genomes_argument: String,
132133
pub dereplication_reference_genomes_list_argument: String,
133134
// pub dereplication_ani_method_argument: String,
@@ -153,6 +154,7 @@ lazy_static! {
153154
dereplication_large_contigs_argument: "large-contigs".to_string(),
154155
dereplication_fraglen_argument: "fragment-length".to_string(),
155156
dereplication_cluster_contigs_argument: "cluster-contigs".to_string(),
157+
dereplication_low_memory_argument: "low-memory".to_string(),
156158
dereplication_reference_genomes_argument: "reference-genomes".to_string(),
157159
dereplication_reference_genomes_list_argument: "reference-genomes-list".to_string(),
158160
// dereplication_ani_method_argument: "ani-method".to_string(),
@@ -430,6 +432,14 @@ pub fn add_dereplication_clustering_parameters_to_section(
430432
.help("Do not use small-genomes settings in skani when clustering contigs. \
431433
Recommended for contigs >= 20kb. Mutually exclusive with --small-contigs."),
432434
)
435+
.flag(
436+
Flag::new()
437+
.long(&format!(
438+
"--{}",
439+
definition.dereplication_low_memory_argument
440+
))
441+
.help("Reduce memory use by sketching to file and searching it instead."),
442+
)
433443
.option(
434444
Opt::new("PATH ...")
435445
.long(&format!(
@@ -1279,6 +1289,8 @@ pub fn generate_galah_clusterer<'a>(
12791289
}),
12801290
num_kmers: 1000,
12811291
kmer_length: 21,
1292+
low_memory: clap_matches
1293+
.get_flag(&argument_definition.dereplication_low_memory_argument),
12821294
}),
12831295
"skani" => Preclusterer::Skani(SkaniPreclusterer {
12841296
threshold: {
@@ -1350,6 +1362,8 @@ pub fn generate_galah_clusterer<'a>(
13501362
}),
13511363
small_genomes,
13521364
threads,
1365+
low_memory: clap_matches
1366+
.get_flag(&argument_definition.dereplication_low_memory_argument),
13531367
}),
13541368
_ => panic!("Programming error"),
13551369
},
@@ -1663,15 +1677,23 @@ pub fn add_cluster_subcommand(app: clap::Command) -> clap::Command {
16631677
.action(clap::ArgAction::SetTrue)
16641678
.requires(&*GALAH_COMMAND_DEFINITION.dereplication_cluster_contigs_argument)
16651679
.conflicts_with(&*GALAH_COMMAND_DEFINITION.dereplication_small_contigs_argument))
1680+
.arg(Arg::new(&*GALAH_COMMAND_DEFINITION.dereplication_low_memory_argument)
1681+
.long(&*GALAH_COMMAND_DEFINITION.dereplication_low_memory_argument)
1682+
.help("Reduce memory by sketching all genomes and searching instead of triangle")
1683+
.action(clap::ArgAction::SetTrue)
1684+
.conflicts_with(&*GALAH_COMMAND_DEFINITION.dereplication_reference_genomes_argument)
1685+
.conflicts_with(&*GALAH_COMMAND_DEFINITION.dereplication_reference_genomes_list_argument))
16661686
.arg(Arg::new(&*GALAH_COMMAND_DEFINITION.dereplication_reference_genomes_argument)
16671687
.long("reference-genomes")
16681688
.help("Reference genomes to cluster against. These should be representatives already clustered. Galah will only form clusters across the two groups, never within. Uses less memory than clustering together.")
16691689
.value_delimiter(' ')
16701690
.num_args(1..)
1691+
.conflicts_with(&*GALAH_COMMAND_DEFINITION.dereplication_low_memory_argument)
16711692
.conflicts_with(&*GALAH_COMMAND_DEFINITION.dereplication_reference_genomes_list_argument))
16721693
.arg(Arg::new(&*GALAH_COMMAND_DEFINITION.dereplication_reference_genomes_list_argument)
16731694
.long("reference-genomes-list")
16741695
.help("File containing paths to reference genomes (one per line). These should be representatives already clustered. Galah will only form clusters across the two groups, never within. Uses less memory than clustering together.")
1696+
.conflicts_with(&*GALAH_COMMAND_DEFINITION.dereplication_low_memory_argument)
16751697
.conflicts_with(&*GALAH_COMMAND_DEFINITION.dereplication_reference_genomes_argument))
16761698
.arg(Arg::new("threads")
16771699
.short('t')

src/clusterer.rs

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -548,6 +548,7 @@ mod tests {
548548
min_ani: 0.9,
549549
num_kmers: 1000,
550550
kmer_length: 21,
551+
low_memory: false,
551552
},
552553
&crate::fastani::FastaniClusterer {
553554
threshold: 95.0,
@@ -578,6 +579,7 @@ mod tests {
578579
min_ani: 0.9,
579580
num_kmers: 1000,
580581
kmer_length: 21,
582+
low_memory: false,
581583
},
582584
&crate::fastani::FastaniClusterer {
583585
threshold: 98.0,
@@ -608,6 +610,7 @@ mod tests {
608610
min_ani: 0.9,
609611
num_kmers: 1000,
610612
kmer_length: 21,
613+
low_memory: false,
611614
},
612615
&crate::fastani::FastaniClusterer {
613616
threshold: 98.0,
@@ -638,6 +641,7 @@ mod tests {
638641
min_ani: 0.9,
639642
num_kmers: 1000,
640643
kmer_length: 21,
644+
low_memory: false,
641645
},
642646
&crate::skani::SkaniClusterer {
643647
threshold: 95.0,
@@ -668,6 +672,7 @@ mod tests {
668672
min_ani: 0.9,
669673
num_kmers: 1000,
670674
kmer_length: 21,
675+
low_memory: false,
671676
},
672677
&crate::skani::SkaniClusterer {
673678
threshold: 99.0,
@@ -699,6 +704,7 @@ mod tests {
699704
min_aligned_threshold: 0.2,
700705
small_genomes: false,
701706
threads: 1,
707+
low_memory: false,
702708
},
703709
&crate::skani::SkaniClusterer {
704710
threshold: 99.0,
@@ -731,6 +737,41 @@ mod tests {
731737
min_aligned_threshold: 0.2,
732738
small_genomes: false,
733739
threads: 1,
740+
low_memory: false,
741+
},
742+
&crate::skani::SkaniClusterer {
743+
threshold: 99.0,
744+
min_aligned_threshold: 0.2,
745+
small_genomes: false,
746+
},
747+
false,
748+
None,
749+
None,
750+
);
751+
for cluster in clusters.iter_mut() {
752+
cluster.sort_unstable();
753+
}
754+
clusters.sort_unstable();
755+
assert_eq!(vec![vec![0, 1, 3], vec![2], vec![4]], clusters)
756+
}
757+
758+
#[test]
759+
fn test_skani_skani_low_memory() {
760+
init();
761+
let mut clusters = cluster(
762+
&[
763+
"tests/data/abisko4/73.20120800_S1X.13.fna",
764+
"tests/data/abisko4/73.20120600_S2D.19.fna",
765+
"tests/data/abisko4/73.20120700_S3X.12.fna",
766+
"tests/data/abisko4/73.20110800_S2D.13.fna",
767+
"tests/data/antonio_mags/BE_RX_R2_MAG52.fna",
768+
],
769+
&crate::skani::SkaniPreclusterer {
770+
threshold: 90.0,
771+
min_aligned_threshold: 0.2,
772+
small_genomes: false,
773+
threads: 1,
774+
low_memory: true,
734775
},
735776
&crate::skani::SkaniClusterer {
736777
threshold: 99.0,
@@ -758,6 +799,7 @@ mod tests {
758799
min_aligned_threshold: 0.2,
759800
small_genomes: false,
760801
threads: 1,
802+
low_memory: false,
761803
},
762804
&crate::skani::SkaniClusterer {
763805
threshold: 99.0,

src/finch.rs

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,21 @@ pub struct FinchPreclusterer {
66
pub min_ani: f32,
77
pub num_kmers: usize,
88
pub kmer_length: u8,
9+
pub low_memory: bool,
910
}
1011

1112
impl PreclusterDistanceFinder for FinchPreclusterer {
1213
fn distances(&self, genome_fasta_paths: &[&str]) -> SortedPairGenomeDistanceCache {
13-
distances(
14-
genome_fasta_paths,
15-
self.min_ani,
16-
self.num_kmers,
17-
self.kmer_length,
18-
)
14+
if self.low_memory {
15+
panic!("Low-memory clustering currently only supported with skani preclusterer");
16+
} else {
17+
distances(
18+
genome_fasta_paths,
19+
self.min_ani,
20+
self.num_kmers,
21+
self.kmer_length,
22+
)
23+
}
1924
}
2025

2126
fn distances_contigs(

src/process_argument_parsing.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ lazy_static! {
2626
dereplication_small_contigs_argument: "small-contigs".to_string(),
2727
dereplication_large_contigs_argument: "large-contigs".to_string(),
2828
dereplication_fraglen_argument: "fragment-length".to_string(),
29+
dereplication_low_memory_argument: "low-memory".to_string(),
2930
dereplication_reference_genomes_argument: "reference-genomes".to_string(),
3031
dereplication_reference_genomes_list_argument: "reference-genomes-list".to_string(),
3132
dereplication_output_cluster_definition_file: "output-cluster-definition"
@@ -203,15 +204,23 @@ pub fn add_process_subcommand(app: clap::Command) -> clap::Command {
203204
.action(clap::ArgAction::SetTrue)
204205
.requires(&*PROCESS_CLUSTER_COMMAND_DEFINITION.dereplication_cluster_contigs_argument)
205206
.conflicts_with(&*PROCESS_CLUSTER_COMMAND_DEFINITION.dereplication_small_contigs_argument))
207+
.arg(Arg::new(&*PROCESS_CLUSTER_COMMAND_DEFINITION.dereplication_low_memory_argument)
208+
.long(&*PROCESS_CLUSTER_COMMAND_DEFINITION.dereplication_low_memory_argument)
209+
.help("Reduce memory by sketching all genomes and searching instead of triangle")
210+
.action(clap::ArgAction::SetTrue)
211+
.conflicts_with(&*PROCESS_CLUSTER_COMMAND_DEFINITION.dereplication_reference_genomes_argument)
212+
.conflicts_with(&*PROCESS_CLUSTER_COMMAND_DEFINITION.dereplication_reference_genomes_list_argument))
206213
.arg(Arg::new(&*PROCESS_CLUSTER_COMMAND_DEFINITION.dereplication_reference_genomes_argument)
207214
.long("reference-genomes")
208215
.help("Reference genomes to cluster against. These should be representatives already clustered. Galah will only form clusters across the two groups, never within. Uses less memory than clustering together.")
209216
.value_delimiter(' ')
210217
.num_args(1..)
218+
.conflicts_with(&*PROCESS_CLUSTER_COMMAND_DEFINITION.dereplication_low_memory_argument)
211219
.conflicts_with(&*PROCESS_CLUSTER_COMMAND_DEFINITION.dereplication_reference_genomes_list_argument))
212220
.arg(Arg::new(&*PROCESS_CLUSTER_COMMAND_DEFINITION.dereplication_reference_genomes_list_argument)
213221
.long("reference-genomes-list")
214222
.help("File containing paths to reference genomes (one per line). These should be representatives already clustered. Galah will only form clusters across the two groups, never within. Uses less memory than clustering together.")
223+
.conflicts_with(&*PROCESS_CLUSTER_COMMAND_DEFINITION.dereplication_low_memory_argument)
215224
.conflicts_with(&*PROCESS_CLUSTER_COMMAND_DEFINITION.dereplication_reference_genomes_argument))
216225
.arg(Arg::new("threads")
217226
.short('t')
@@ -385,6 +394,7 @@ pub fn process_full_help(program_basename: &str, program_version: &str) -> Manua
385394
dereplication_large_contigs_argument: "large-contigs".to_string(),
386395
dereplication_fraglen_argument: "fragment-length".to_string(),
387396
dereplication_cluster_contigs_argument: "cluster-contigs".to_string(),
397+
dereplication_low_memory_argument: "low-memory".to_string(),
388398
dereplication_reference_genomes_argument: "reference-genomes".to_string(),
389399
dereplication_reference_genomes_list_argument: "reference-genomes-list".to_string(),
390400
dereplication_output_cluster_definition_file: "output-cluster-definition".to_string(),

0 commit comments

Comments
 (0)