Skip to content

Commit dc852a2

Browse files
Split metawrap tool (#7365)
* split metawrap * update requirements * latest updates and polishing * bump * remove commented code * A is required * get small test data by supporting gz
1 parent 54a29e1 commit dc852a2

32 files changed

+196
-96
lines changed

tools/metawrapmg/macros.xml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
<macros>
22
<token name="@TOOL_VERSION@">1.3.0</token>
3-
<token name="@VERSION_SUFFIX@">2</token>
4-
<token name="@PROFILE@">22.05</token>
3+
<token name="@VERSION_SUFFIX@">3</token>
4+
<token name="@PROFILE@">24.0</token>
55
<xml name="requirements">
66
<requirements>
77
<requirement type="package" version="@TOOL_VERSION@">metawrap-mg</requirement>

tools/metawrapmg/metawrapmg_binning.xml

Lines changed: 23 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,14 @@
11
<tool id="metawrapmg_binning" name="MetaWRAP" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@" license="MIT">
2-
<description>metagenome binning pipeline</description>
2+
<description>metagenome binning</description>
33
<macros>
44
<import>macros.xml</import>
55
</macros>
66
<expand macro="xrefs"/>
7-
<expand macro="requirements"/>
7+
8+
<requirements>
9+
<requirement type="package" version="@TOOL_VERSION@">metawrap-binning</requirement>
10+
</requirements>
11+
812
<command detect_errors="exit_code"><![CDATA[
913
## set memory usage
1014
if [ -n "\${GALAXY_MEMORY_MB}" ] ; then
@@ -33,66 +37,20 @@
3337
-t "\${GALAXY_SLOTS:-4}"
3438
reads_1.fastq
3539
reads_2.fastq
36-
&&
37-
38-
## Check which binning programs produced bins
39-
bin_dirs=(INITIAL_BINNING/concoct_bins INITIAL_BINNING/maxbin2_bins INITIAL_BINNING/metabat2_bins) &&
40-
switches=('-A' '-B' '-C') &&
41-
42-
i=0 &&
43-
bin_string="" &&
44-
45-
for dir in "\${bin_dirs[@]}" ; do
46-
if [ "\$(find "\$dir" -mindepth 1 -maxdepth 1 -exec echo found \;)" ]; then
47-
bin_string+=" \${switches[\$i]} \$dir" ;
48-
((i++)) ;
49-
fi
50-
done &&
51-
52-
####################
53-
## BIN REFINEMENT ##
54-
####################
55-
56-
## The checkm database is in the conda package, see
57-
## https://github.com/bioconda/bioconda-recipes/pull/38299.
58-
59-
metawrap bin_refinement
60-
-t "\${GALAXY_SLOTS:-4}"
61-
-m "\${GALAXY_MEMORY_GB:-16}"
62-
'$hidden_quick'
63-
-c '${binning.c}'
64-
-x '${binning.x}'
65-
-o BIN_REFINEMENT
66-
## Only run bin_refinement on bins with contigs
67-
"\${bin_string}"
6840
]]></command>
6941
<inputs>
7042
<param name="metagenome" format="fasta" type="data" label="Metagenome" help="Metagenome co-assembly for binning"/>
7143
<param name="input" type="data_collection" collection_type="paired" label="Reads" help="Original reads that were used for the assembly"/>
72-
<section name="binning" title="Binning parameters" expanded="false">
73-
<param argument="-c" type="integer" value="70" min="50" max="100" label="Percent completion" help="Minimum % completion of bins"/>
74-
<param argument="-x" type="integer" value="10" min="0" max="100" label="Percent contamination" help="Maximum % contamination of bins that is acceptable"/>
75-
</section>
76-
<!-- the pplacer component requires 40 GB per thread. Skip pplacer for
77-
testing by setting this to "quick" -->
78-
<param name="hidden_quick" type="hidden" value=""/>
7944
</inputs>
8045
<outputs>
81-
<!-- contigs binned into fasta files -->
82-
<collection name="metawrap_bins" type="list" label="MetaWRAP on ${on_string}: bins">
83-
<discover_datasets pattern="metawrap_\d+_\d+_bins/(?P&lt;designation&gt;.+)\.fa" format="fasta" directory="BIN_REFINEMENT" recurse="true" match_relative_path="true"/>
46+
<collection name="concoct_bins" type="list" label="${tool.name} on ${on_string}: concoct bins">
47+
<discover_datasets pattern="(?P&lt;designation&gt;.+)\.fa" format="fasta" directory="INITIAL_BINNING/concoct_bins" recurse="true" match_relative_path="true"/>
8448
</collection>
85-
<!-- summary figures -->
86-
<collection name="metawrap_figures" type="list" label="MetaWRAP on ${on_string}: summary figures">
87-
<discover_datasets pattern="__designation_and_ext__" directory="BIN_REFINEMENT/figures"/>
49+
<collection name="maxbin2_bins" type="list" label="${tool.name} on ${on_string}: maxbin2 bins">
50+
<discover_datasets pattern="(?P&lt;designation&gt;.+)\.fa" format="fasta" directory="INITIAL_BINNING/maxbin2_bins" recurse="true" match_relative_path="true"/>
8851
</collection>
89-
<!-- statistics on binning -->
90-
<collection name="metawrap_stats" type="list" label="MetaWRAP on ${on_string}: stat files">
91-
<discover_datasets pattern="(?P&lt;designation&gt;.+)\.stats" format="tabular" directory="BIN_REFINEMENT"/>
92-
</collection>
93-
<!-- which contig went into which bin -->
94-
<collection name="metawrap_contigs" type="list" label="MetaWRAP on ${on_string}: contig assignments">
95-
<discover_datasets pattern="(?P&lt;designation&gt;.+)\.contigs" format="tabular" directory="BIN_REFINEMENT"/>
52+
<collection name="metabat_bins" type="list" label="${tool.name} on ${on_string}: metabat2 bins">
53+
<discover_datasets pattern="(?P&lt;designation&gt;.+)\.fa" format="fasta" directory="INITIAL_BINNING/metabat2_bins" recurse="true" match_relative_path="true"/>
9654
</collection>
9755
</outputs>
9856
<tests>
@@ -105,24 +63,21 @@
10563
<element name="reverse" value="mapped_reads.r2.fastq.gz"/>
10664
</collection>
10765
</param>
108-
<section name="binning">
109-
<param name="c" value="60"/>
110-
<param name="x" value="15"/>
111-
</section>
112-
<param name="hidden_quick" value="--quick"/>
113-
<output_collection name="metawrap_bins" type="list">
66+
<output_collection name="concoct_bins" type="list" count="27">
11467
<element name="bin.1" ftype="fasta">
11568
<assert_contents>
116-
<has_text text="NODE_2_length_"/>
69+
<has_text text=">NODE_"/>
11770
</assert_contents>
11871
</element>
11972
</output_collection>
120-
<output_collection name="metawrap_stats" type="list">
121-
<element name="metawrap_60_15_bins" file="test02.stats" ftype="tabular"/>
122-
</output_collection>
123-
<output_collection name="metawrap_contigs" type="list">
124-
<element name="metawrap_60_15_bins" file="test02.contigs" ftype="tabular"/>
73+
<output_collection name="maxbin2_bins" type="list" count="2">
74+
<element name="bin.1" ftype="fasta">
75+
<assert_contents>
76+
<has_text text="NODE_"/>
77+
</assert_contents>
78+
</element>
12579
</output_collection>
80+
<output_collection name="metabat_bins" type="list" count="0"/>
12681
</test>
12782
</tests>
12883
<help><![CDATA[
@@ -146,41 +101,15 @@ First the metagenomic assembly is indexed with bwa-index, and then
146101
paired end reads from any number of samples are aligned to it. The
147102
alignments are sorted and compressed with samtools, and library insert
148103
size statistics are also gathered at the same time (insert size average
149-
and standard deviation). metaBAT2s jgi_summarize_bam_contig_depths
104+
and standard deviation). metaBAT2's jgi_summarize_bam_contig_depths
150105
function is used to generate a contig abundance table, and it is then
151106
converted into the correct format for each of the three binners to take
152107
as input. After MaxBin2, metaBAT2, and CONCOCT finish binning the
153108
contigs with default settings, the final bins folders are created with
154-
formatted bin fasta files. CheckMs lineage_wf function is used to
109+
formatted bin fasta files. CheckM's lineage_wf function is used to
155110
predict essential genes and estimate the completion and contamination of
156111
each bin.
157112
158-
MetaWRAP bin refinement
159-
~~~~~~~~~~~~~~~~~~~~~~~
160-
161-
The metaWRAP::Bin_refinement module utilizes a hybrid approach to take
162-
in two or three bin sets that were obtained with different software and
163-
produces a consolidated, improved bin set. First, binning_refiner is
164-
used to create hybridized bins from every possible combination of sets.
165-
If there were three bin sets: A, B, and C, then the following hybrid
166-
sets will be produced with binning_refiner: AB, BC, AC, and ABC. CheckM
167-
is then run to evaluate the completion and contamination of the bins in
168-
each of the 7 bin sets (3 originals, 4 hybridized). The bins sets are
169-
then iteratively compared to each other, and each pair is consolidated
170-
into an improved bin set. To do this, the same bin is identified within
171-
the two bin sets based on a minimum of 80% overlap in genome length, and
172-
the better bin is determined based on which bin has the higher score.
173-
The scoring function is S=Completion-5*Contamination. After all bin sets
174-
are incorporated into the consolidated bin collection, a de-replication
175-
function removes any duplicate contigs. If a contig is present in more
176-
than one bin, it is removed from all but the best bin (based on scoring
177-
function). CheckM is then run on the final bin set and a final report
178-
file is generated showing the completion, contamination, and other
179-
statistics generated by CheckM for each bin. Completion and
180-
contamination rank plots are also generated to evaluate the success of
181-
the Bin_refinement module, and compare its output to the quality of the
182-
original bins.
183-
184113
--------------
185114
186115
MetaWRAP’s home page is
Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
<tool id="metawrapmg_bin_refinement" name="MetaWRAP" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@" license="MIT">
2+
<description>metagenome bin refinement</description>
3+
<macros>
4+
<import>macros.xml</import>
5+
</macros>
6+
<expand macro="xrefs"/>
7+
<requirements>
8+
<requirement type="package" version="@TOOL_VERSION@">metawrap-refinement</requirement>
9+
</requirements>
10+
<command detect_errors="exit_code"><![CDATA[
11+
## set memory usage
12+
if [ -n "\${GALAXY_MEMORY_MB}" ] ; then
13+
export GALAXY_MEMORY_GB="\$((GALAXY_MEMORY_MB / 1024))" ;
14+
fi ;
15+
16+
mkdir -p INITIAL_BINNING/A &&
17+
#for i, a in enumerate($A)
18+
#if $a.ext.endswith(".gz")
19+
gunzip -c '$a' > INITIAL_BINNING/A/bin.${i}.fa &&
20+
#else
21+
cp '$a' INITIAL_BINNING/A/bin.${i}.fa &&
22+
#end if
23+
#end for
24+
25+
#if $B
26+
mkdir -p INITIAL_BINNING/B &&
27+
#for i, b in enumerate($B)
28+
#if $b.ext.endswith(".gz")
29+
gunzip -c '$b' > INITIAL_BINNING/B/bin.${i}.fa &&
30+
#else
31+
cp '$b' INITIAL_BINNING/B/bin.${i}.fa &&
32+
#end if
33+
#end for
34+
#end if
35+
36+
#if $C
37+
mkdir -p INITIAL_BINNING/C &&
38+
#for i, c in enumerate($C)
39+
#if $c.ext.endswith(".gz")
40+
gunzip -c '$c' > INITIAL_BINNING/C/bin.${i}.fa &&
41+
#else
42+
cp '$c' INITIAL_BINNING/C/bin.${i}.fa &&
43+
#end if
44+
#end for
45+
#end if
46+
47+
####################
48+
## BIN REFINEMENT ##
49+
####################
50+
51+
## The checkm database is in the conda package, see
52+
## https://github.com/bioconda/bioconda-recipes/pull/38299.
53+
54+
metawrap bin_refinement
55+
-t "\${GALAXY_SLOTS:-4}"
56+
-m "\${GALAXY_MEMORY_GB:-16}"
57+
'$hidden_quick'
58+
-c '${binning.c}'
59+
-x '${binning.x}'
60+
-o BIN_REFINEMENT
61+
## -A is always passed; -B and -C are added only when those optional bin sets were supplied
62+
-A INITIAL_BINNING/A/
63+
#if $B and len($B)
64+
-B INITIAL_BINNING/B/
65+
#end if
66+
#if $C and len($C)
67+
-C INITIAL_BINNING/C/
68+
#end if
69+
]]></command>
70+
<inputs>
71+
<param argument="-A" type="data" multiple="true" format="fasta,fasta.gz" label="Metagenomic bins"/>
72+
<param argument="-B" type="data" multiple="true" optional="true" format="fasta,fasta.gz" label="Another set of metagenomic bins"/>
73+
<param argument="-C" type="data" multiple="true" optional="true" format="fasta,fasta.gz" label="Another set of metagenomic bins"/>
74+
<section name="binning" title="Binning parameters" expanded="false">
75+
<param argument="-c" type="integer" value="70" min="50" max="100" label="Percent completion" help="Minimum % completion of bins"/>
76+
<param argument="-x" type="integer" value="10" min="0" max="100" label="Percent contamination" help="Maximum % contamination of bins that is acceptable"/>
77+
</section>
78+
<!-- the pplacer component requires 40 GB per thread. Skip pplacer for
79+
testing by setting this to "quick" -->
80+
<param name="hidden_quick" type="hidden" value=""/>
81+
</inputs>
82+
<outputs>
83+
<!-- contigs binned into fasta files -->
84+
<collection name="metawrap_bins" type="list" label="MetaWRAP on ${on_string}: bins">
85+
<discover_datasets pattern="metawrap_\d+_\d+_bins/(?P&lt;designation&gt;.+)\.fa" format="fasta" directory="BIN_REFINEMENT" recurse="true" match_relative_path="true"/>
86+
</collection>
87+
<!-- summary figures -->
88+
<collection name="metawrap_figures" type="list" label="MetaWRAP on ${on_string}: summary figures">
89+
<discover_datasets pattern="__designation_and_ext__" directory="BIN_REFINEMENT/figures"/>
90+
</collection>
91+
<!-- statistics on binning -->
92+
<collection name="metawrap_stats" type="list" label="MetaWRAP on ${on_string}: stat files">
93+
<discover_datasets pattern="(?P&lt;designation&gt;.+)\.stats" format="tabular" directory="BIN_REFINEMENT"/>
94+
</collection>
95+
<!-- which contig went into which bin -->
96+
<collection name="metawrap_contigs" type="list" label="MetaWRAP on ${on_string}: contig assignments">
97+
<discover_datasets pattern="(?P&lt;designation&gt;.+)\.contigs" format="tabular" directory="BIN_REFINEMENT"/>
98+
</collection>
99+
</outputs>
100+
<tests>
101+
<!-- 01: basic function -->
102+
<test>
103+
<param name="A" ftype="fasta.gz" value="concoct_bins/bin.0.fa.gz,concoct_bins/bin.1.fa.gz,concoct_bins/bin.2.fa.gz,concoct_bins/bin.3.fa.gz,concoct_bins/bin.4.fa.gz,concoct_bins/bin.5.fa.gz,concoct_bins/bin.6.fa.gz,concoct_bins/bin.7.fa.gz,concoct_bins/bin.8.fa.gz,concoct_bins/bin.9.fa.gz,concoct_bins/bin.10.fa.gz,concoct_bins/bin.11.fa.gz,concoct_bins/bin.12.fa.gz,concoct_bins/bin.13.fa.gz,concoct_bins/bin.14.fa.gz,concoct_bins/bin.15.fa.gz,concoct_bins/bin.16.fa.gz,concoct_bins/bin.17.fa.gz,concoct_bins/bin.18.fa.gz,concoct_bins/bin.19.fa.gz,concoct_bins/bin.20.fa.gz,concoct_bins/bin.21.fa.gz,concoct_bins/bin.22.fa.gz,concoct_bins/bin.23.fa.gz,concoct_bins/bin.24.fa.gz,concoct_bins/bin.25.fa.gz,concoct_bins/bin.26.fa.gz"/>
104+
<param name="B" ftype="fasta.gz" value="maxbin2_bins/bin.0.fa.gz,maxbin2_bins/bin.1.fa.gz"/>
105+
<section name="binning">
106+
<param name="c" value="60"/>
107+
<param name="x" value="15"/>
108+
</section>
109+
<param name="hidden_quick" value="--quick"/>
110+
<output_collection name="metawrap_bins" type="list">
111+
<element name="bin.1" ftype="fasta">
112+
<assert_contents>
113+
<has_text text="NODE_2_length_"/>
114+
</assert_contents>
115+
</element>
116+
</output_collection>
117+
<output_collection name="metawrap_stats" type="list">
118+
<element name="metawrap_60_15_bins" file="test02.stats" ftype="tabular"/>
119+
</output_collection>
120+
<output_collection name="metawrap_contigs" type="list">
121+
<element name="metawrap_60_15_bins" file="test02.contigs" ftype="tabular"/>
122+
</output_collection>
123+
</test>
124+
</tests>
125+
<help><![CDATA[
126+
MetaWRAP
127+
--------
128+
129+
MetaWRAP aims to be an easy-to-use metagenomic wrapper suite that
130+
accomplishes the core tasks of metagenomic analysis. Additionally,
131+
metaWRAP takes bin extraction and analysis to the next level. metaWRAP
132+
is meant to be a fast and simple approach before you delve deeper into
133+
parameterization of your analysis. MetaWRAP can be applied to a variety
134+
of environments, including gut, water, and soil microbiomes (see
135+
metaWRAP paper for benchmarks).
136+
137+
MetaWRAP bin refinement
138+
~~~~~~~~~~~~~~~~~~~~~~~
139+
140+
The metaWRAP::Bin_refinement module utilizes a hybrid approach to take
141+
in two or three bin sets that were obtained with different software and
142+
produces a consolidated, improved bin set. First, binning_refiner is
143+
used to create hybridized bins from every possible combination of sets.
144+
If there were three bin sets: A, B, and C, then the following hybrid
145+
sets will be produced with binning_refiner: AB, BC, AC, and ABC. CheckM
146+
is then run to evaluate the completion and contamination of the bins in
147+
each of the 7 bin sets (3 originals, 4 hybridized). The bin sets are
148+
then iteratively compared to each other, and each pair is consolidated
149+
into an improved bin set. To do this, the same bin is identified within
150+
the two bin sets based on a minimum of 80% overlap in genome length, and
151+
the better bin is determined based on which bin has the higher score.
152+
The scoring function is S=Completion-5*Contamination. After all bin sets
153+
are incorporated into the consolidated bin collection, a de-replication
154+
function removes any duplicate contigs. If a contig is present in more
155+
than one bin, it is removed from all but the best bin (based on scoring
156+
function). CheckM is then run on the final bin set and a final report
157+
file is generated showing the completion, contamination, and other
158+
statistics generated by CheckM for each bin. Completion and
159+
contamination rank plots are also generated to evaluate the success of
160+
the Bin_refinement module, and compare its output to the quality of the
161+
original bins.
162+
163+
--------------
164+
165+
MetaWRAP’s home page is
166+
`bxlab/metaWRAP <https://github.com/bxlab/metaWRAP>`__.
167+
168+
This tool was wrapped by the Galaxy Australia team.
169+
]]></help>
170+
<expand macro="citations"/>
171+
</tool>
457 Bytes
Binary file not shown.
1.13 KB
Binary file not shown.
454 Bytes
Binary file not shown.
704 Bytes
Binary file not shown.
576 Bytes
Binary file not shown.
766 KB
Binary file not shown.
446 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)