From 972c7fd8594486e644b0a19e08f8fc2efd2a9b18 Mon Sep 17 00:00:00 2001 From: xens25 Date: Mon, 1 Dec 2025 12:58:51 +0100 Subject: [PATCH 1/5] Add KneadData tool wrapper --- tools/kneaddata/kneaddata.xml | 272 ++++++++++++++++++ tools/kneaddata/test-data/test_paired_1.fastq | 4 + tools/kneaddata/test-data/test_paired_2.fastq | 4 + tools/kneaddata/test-data/test_single.fastq | 8 + 4 files changed, 288 insertions(+) create mode 100644 tools/kneaddata/kneaddata.xml create mode 100644 tools/kneaddata/test-data/test_paired_1.fastq create mode 100644 tools/kneaddata/test-data/test_paired_2.fastq create mode 100644 tools/kneaddata/test-data/test_single.fastq diff --git a/tools/kneaddata/kneaddata.xml b/tools/kneaddata/kneaddata.xml new file mode 100644 index 00000000000..422730f1c20 --- /dev/null +++ b/tools/kneaddata/kneaddata.xml @@ -0,0 +1,272 @@ + + Quality control and contaminant removal for metagenomic data + + kneaddata + trimmomatic + bowtie2 + trf + fastqc + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + +
+ + + + + + + + + + + + + + + + +
+
+ + + +
+ +
+ + + read_type["select_read_type"] == "s" and trf_step["trf_bool"] == "include" + + + read_type["select_read_type"] == "s" and trf_step["trf_bool"] == "skip" + + + + read_type["select_read_type"] == "p" and trf_step["trf_bool"] == "include" + + + read_type["select_read_type"] == "p" and trf_step["trf_bool"] == "skip" + + + + read_type["select_read_type"] == "p" and trf_step["trf_bool"] == "include" + + + read_type["select_read_type"] == "p" and trf_step["trf_bool"] == "skip" + + + + + + + +
+ + +
+ +
+
+
+ + +
+ +
+ + + + +
+ + +
+ +
+
+
+ + + +
+ + +
+
+ + , --threads <1> number of threads [ Default : 1 ] + -p <1>, --processes <1> number of processes [ Default : 1 ] + -q , --quality-scores quality scores [phred33|phred64] [DEFAULT: phred33] + --run-bmtagger run BMTagger instead of Bowtie2 to identify contaminant reads + --bypass-trf option to bypass the removal of tandem repeats + --run-fastqc-start run fastqc at the beginning of the workflow + --run-fastqc-end run fastqc at the end of the workflow + --store-temp-output store temp output files [ DEFAULT : temp output files are removed ] + --cat-final-output concatenate all final output files [ DEFAULT : final output is not concatenated ] + --log-level level of log messages [DEFAULT: DEBUG] + --log LOG log file [ DEFAULT : $OUTPUT_DIR/$SAMPLE_kneaddata.log ] + --trimmomatic TRIMMOMATIC_PATH path to trimmomatic [ DEFAULT : $PATH ] + --max-memory MAX_MEMORY max amount of memory [ DEFAULT : 500m ] + --trimmomatic-options TRIMMOMATIC_OPTIONS options for trimmomatic [ DEFAULT : SLIDINGWINDOW:4:20 MINLEN:50 ] + MINLEN is set to 50 percent of total input read length. The user can alternatively specify a length (in bases) for MINLEN. 
+ --sequencer-source options for sequencer-source [ DEFAULT: NexteraPE] Available sequencers: ["NexteraPE","TruSeq2","TruSeq3"] + --bowtie2 BOWTIE2_PATH path to bowtie2 [ DEFAULT : $PATH ] + --bowtie2-options BOWTIE2_OPTIONS options for bowtie2 [ DEFAULT : --very-sensitive ] + --bmtagger BMTAGGER_PATH path to BMTagger [ DEFAULT : $PATH ] + --bypass-trf bypass the TRF step + --trf TRF_PATH path to TRF [ DEFAULT : $PATH ] + --mismatch MISMATCH mismatching penalty [ DEFAULT : 7 ] + --delta DELTA indel penalty [ DEFAULT : 7 ] + --pm PM match probability [ DEFAULT : 80 ] + --pi PI indel probability [ DEFAULT : 10 ] + --minscore MINSCORE minimum alignment score to report [ DEFAULT : 50 ] + --maxperiod MAXPERIOD maximum period size to report [ DEFAULT : 500 ] + --fastqc FASTQC_PATH path to fastqc [ DEFAULT : $PATH ] + + ]]> + + + @software{kneaddata, + title = {KneadData}, + author = {Harvard School of Public Health}, + year = {2015}, + url = {https://github.com/biobakery/kneaddata}, + license = {MIT}, + note = {Quality control and contaminant removal tool for metagenomic sequencing data} + } + +
\ No newline at end of file diff --git a/tools/kneaddata/test-data/test_paired_1.fastq b/tools/kneaddata/test-data/test_paired_1.fastq new file mode 100644 index 00000000000..3370d2b3700 --- /dev/null +++ b/tools/kneaddata/test-data/test_paired_1.fastq @@ -0,0 +1,4 @@ +@test1 +ACGTACGT ++ +IIIIIIII diff --git a/tools/kneaddata/test-data/test_paired_2.fastq b/tools/kneaddata/test-data/test_paired_2.fastq new file mode 100644 index 00000000000..61bf5ac7124 --- /dev/null +++ b/tools/kneaddata/test-data/test_paired_2.fastq @@ -0,0 +1,4 @@ +@test1 +TGCTAGCT ++ +IIIIIIII diff --git a/tools/kneaddata/test-data/test_single.fastq b/tools/kneaddata/test-data/test_single.fastq new file mode 100644 index 00000000000..aca0066135d --- /dev/null +++ b/tools/kneaddata/test-data/test_single.fastq @@ -0,0 +1,8 @@ +@test1 +ACGTACGT ++ +IIIIIIII +@test2 +TGCTAGCT ++ +IIIIIIII From 9d626bcd20dd417ba9620a52b580a4b94cb9cf2c Mon Sep 17 00:00:00 2001 From: xens25 Date: Mon, 1 Dec 2025 13:25:35 +0100 Subject: [PATCH 2/5] Add .shed.yml file for kneaddata tool --- tools/kneaddata/.shed.yml | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 tools/kneaddata/.shed.yml diff --git a/tools/kneaddata/.shed.yml b/tools/kneaddata/.shed.yml new file mode 100644 index 00000000000..a0c6ca9bec5 --- /dev/null +++ b/tools/kneaddata/.shed.yml @@ -0,0 +1,9 @@ +name: kneaddata +owner: iuc +type: unrestricted +description: Quality control and contaminant removal for metagenomic data +homepage_url: https://github.com/biobakery/kneaddata +categories: + - Sequence Analysis + - Metagenomics + - Quality Control From 93dd1536490642e6fcbee165d256c4d222833082 Mon Sep 17 00:00:00 2001 From: xens25 Date: Mon, 1 Dec 2025 13:33:58 +0100 Subject: [PATCH 3/5] Fix linting issues: unique output labels, complete shed metadata --- tools/kneaddata/.shed.yml | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tools/kneaddata/.shed.yml b/tools/kneaddata/.shed.yml index a0c6ca9bec5..10cb2da7f25 
100644 --- a/tools/kneaddata/.shed.yml +++ b/tools/kneaddata/.shed.yml @@ -2,8 +2,14 @@ name: kneaddata owner: iuc type: unrestricted description: Quality control and contaminant removal for metagenomic data +long_description: > + KneadData is a tool designed to perform quality control on + metagenomic and metatranscriptomic sequencing data, especially + data from microbiome experiments. It performs adapter trimming, + quality filtering, and removal of host contamination using + Bowtie2/TRIMMOMATIC/TRF. homepage_url: https://github.com/biobakery/kneaddata +remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/kneaddata categories: - - Sequence Analysis - Metagenomics - - Quality Control + - Statistics From d7659192e4c55f33e80b9ead442fbc3246f7ea1d Mon Sep 17 00:00:00 2001 From: xens25 Date: Mon, 1 Dec 2025 13:41:02 +0100 Subject: [PATCH 4/5] Fix duplicate output labels for linting --- tools/kneaddata/kneaddata.xml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/kneaddata/kneaddata.xml b/tools/kneaddata/kneaddata.xml index 422730f1c20..ac03cf9db46 100644 --- a/tools/kneaddata/kneaddata.xml +++ b/tools/kneaddata/kneaddata.xml @@ -137,27 +137,27 @@ - + read_type["select_read_type"] == "s" and trf_step["trf_bool"] == "include" - + read_type["select_read_type"] == "s" and trf_step["trf_bool"] == "skip" - + read_type["select_read_type"] == "p" and trf_step["trf_bool"] == "include" - + read_type["select_read_type"] == "p" and trf_step["trf_bool"] == "skip" - + read_type["select_read_type"] == "p" and trf_step["trf_bool"] == "include" - + read_type["select_read_type"] == "p" and trf_step["trf_bool"] == "skip" - + From ec2560de4bf32e807b020b1a3c54746bba008cc7 Mon Sep 17 00:00:00 2001 From: Saim Momin Date: Mon, 1 Dec 2025 15:57:01 +0100 Subject: [PATCH 5/5] Add Bowtie2 data tables --- tools/kneaddata/test-data/bowtie2_indices.loc | 38 +++++++++++++++++++ .../tool-data/bowtie2_indices.loc.sample | 35 
+++++++++++++++++ .../kneaddata/tool_data_table_conf.xml.sample | 8 ++++ tools/kneaddata/tool_data_table_conf.xml.test | 8 ++++ 4 files changed, 89 insertions(+) create mode 100644 tools/kneaddata/test-data/bowtie2_indices.loc create mode 100644 tools/kneaddata/tool-data/bowtie2_indices.loc.sample create mode 100644 tools/kneaddata/tool_data_table_conf.xml.sample create mode 100644 tools/kneaddata/tool_data_table_conf.xml.test diff --git a/tools/kneaddata/test-data/bowtie2_indices.loc b/tools/kneaddata/test-data/bowtie2_indices.loc new file mode 100644 index 00000000000..4d8f1c77d1e --- /dev/null +++ b/tools/kneaddata/test-data/bowtie2_indices.loc @@ -0,0 +1,38 @@ +# bowtie2_indices.loc.sample +# This is a *.loc.sample file distributed with Galaxy that enables tools +# to use a directory of indexed data files. This one is for Bowtie2 and Tophat2. +# See the wiki: http://wiki.galaxyproject.org/Admin/NGS%20Local%20Setup +# First create these data files and save them in your own data directory structure. +# Then, create a bowtie2_indices.loc file to use those indexes with tools. +# Copy this file, save it with the same name (minus the .sample), +# follow the format examples, and store the result in this directory. +# The file should include a one-line entry for each index set. +# The path points to the "basename" for the set, not a specific file. +# It has four text columns separated by TABS. 
+# +# +# +# So, for example, if you had hg19 indexes stored in: +# +# /depot/data2/galaxy/hg19/bowtie2/ +# +# containing hg19 genome and hg19.*.bt2 files, such as: +# -rw-rw-r-- 1 james james 914M Feb 10 18:56 hg19canon.fa +# -rw-rw-r-- 1 james james 914M Feb 10 18:56 hg19canon.1.bt2 +# -rw-rw-r-- 1 james james 683M Feb 10 18:56 hg19canon.2.bt2 +# -rw-rw-r-- 1 james james 3.3K Feb 10 16:54 hg19canon.3.bt2 +# -rw-rw-r-- 1 james james 683M Feb 10 16:54 hg19canon.4.bt2 +# -rw-rw-r-- 1 james james 914M Feb 10 20:45 hg19canon.rev.1.bt2 +# -rw-rw-r-- 1 james james 683M Feb 10 20:45 hg19canon.rev.2.bt2 +# +# then the bowtie2_indices.loc entry could look like this: +# +#hg19 hg19 Human (hg19) /depot/data2/galaxy/hg19/bowtie2/hg19canon +# +#More examples: +# +#mm10 mm10 Mouse (mm10) /depot/data2/galaxy/mm10/bowtie2/mm10 +#dm3 dm3 D. melanogaster (dm3) /depot/data2/galaxy/dm3/bowtie2/dm3 +# +# +test_value test_dbkey test_name ${__HERE__}/bowtie2-ref \ No newline at end of file diff --git a/tools/kneaddata/tool-data/bowtie2_indices.loc.sample b/tools/kneaddata/tool-data/bowtie2_indices.loc.sample new file mode 100644 index 00000000000..9ad57953fcb --- /dev/null +++ b/tools/kneaddata/tool-data/bowtie2_indices.loc.sample @@ -0,0 +1,35 @@ +# bowtie2_indices.loc.sample +# This is a *.loc.sample file distributed with Galaxy that enables tools +# to use a directory of indexed data files. This one is for Bowtie2 and Tophat2. +# See the wiki: http://wiki.galaxyproject.org/Admin/NGS%20Local%20Setup +# First create these data files and save them in your own data directory structure. +# Then, create a bowtie2_indices.loc file to use those indexes with tools. +# Copy this file, save it with the same name (minus the .sample), +# follow the format examples, and store the result in this directory. +# The file should include a one-line entry for each index set. +# The path points to the "basename" for the set, not a specific file. +# It has four text columns separated by TABS. 
+# +# +# +# So, for example, if you had hg19 indexes stored in: +# +# /depot/data2/galaxy/hg19/bowtie2/ +# +# containing hg19 genome and hg19.*.bt2 files, such as: +# -rw-rw-r-- 1 james james 914M Feb 10 18:56 hg19canon.fa +# -rw-rw-r-- 1 james james 914M Feb 10 18:56 hg19canon.1.bt2 +# -rw-rw-r-- 1 james james 683M Feb 10 18:56 hg19canon.2.bt2 +# -rw-rw-r-- 1 james james 3.3K Feb 10 16:54 hg19canon.3.bt2 +# -rw-rw-r-- 1 james james 683M Feb 10 16:54 hg19canon.4.bt2 +# -rw-rw-r-- 1 james james 914M Feb 10 20:45 hg19canon.rev.1.bt2 +# -rw-rw-r-- 1 james james 683M Feb 10 20:45 hg19canon.rev.2.bt2 +# +# then the bowtie2_indices.loc entry could look like this: +# +#hg19 hg19 Human (hg19) /depot/data2/galaxy/hg19/bowtie2/hg19canon +# +#More examples: +# +#mm10 mm10 Mouse (mm10) /depot/data2/galaxy/mm10/bowtie2/mm10 +#dm3 dm3 D. melanogaster (dm3) /depot/data2/galaxy/dm3/bowtie2/dm3 diff --git a/tools/kneaddata/tool_data_table_conf.xml.sample b/tools/kneaddata/tool_data_table_conf.xml.sample new file mode 100644 index 00000000000..7a775c577f5 --- /dev/null +++ b/tools/kneaddata/tool_data_table_conf.xml.sample @@ -0,0 +1,8 @@ + + + + + value, dbkey, name, path + + 
+
\ No newline at end of file diff --git a/tools/kneaddata/tool_data_table_conf.xml.test b/tools/kneaddata/tool_data_table_conf.xml.test new file mode 100644 index 00000000000..a7d6738c943 --- /dev/null +++ b/tools/kneaddata/tool_data_table_conf.xml.test @@ -0,0 +1,8 @@ + + + + + value, dbkey, name, path + +
+
\ No newline at end of file