Skip to content

Commit 5dff96e

Browse files
authored
Merge pull request #113 from EBI-Metagenomics/update/detect_rna
Update detect_rna swf, remove rrna_extraction swf
2 parents 40e6082 + 503c03e commit 5dff96e

File tree

17 files changed

+617
-13034
lines changed

17 files changed

+617
-13034
lines changed

.github/workflows/nf-test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ jobs:
9595
${{ runner.os }}-pip-pdiff
9696
9797
- name: Install Python dependencies
98-
run: python -m pip install --upgrade pip pdiff cryptography
98+
run: python -m pip install --upgrade pip pdiff cryptography nf-core
9999

100100
# Test the module
101101
- name: Run nf-test

nf-test.config

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,5 +17,6 @@ config {
1717
plugins {
1818
load "nft-fasta@1.0.0"
1919
load "nft-bam@0.5.0"
20+
load "nft-utils@0.0.7"
2021
}
2122
}

subworkflows/ebi-metagenomics/detect_rna/main.nf

Lines changed: 59 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -5,38 +5,56 @@
55
// Important note: .cm file should be cmpress-ed before execution
66
// Use cmsearch mode if input fasta is massive and models file contains chosen set of models (usecase: ASA)
77

8+
/* NF-CORE */
9+
include { SEQKIT_SPLIT2 } from '../../../modules/nf-core/seqkit/split2/main'
10+
include { CAT_CAT as CONCATENATE_CMSEARCH_DEOVERLAP } from '../../../modules/nf-core/cat/cat/main'
811

9-
include { INFERNAL_CMSEARCH } from '../../../modules/ebi-metagenomics/infernal/cmsearch/main'
10-
include { INFERNAL_CMSCAN } from '../../../modules/ebi-metagenomics/infernal/cmscan/main'
11-
include { CONVERTCMSCANTOCMSEARCH } from '../../../modules/ebi-metagenomics/convertcmscantocmsearch/main'
12-
include { CMSEARCHTBLOUTDEOVERLAP } from '../../../modules/ebi-metagenomics/cmsearchtbloutdeoverlap/main'
13-
include { EASEL_ESLSFETCH } from '../../../modules/ebi-metagenomics/easel/eslsfetch/main'
12+
/* EBI-METAGENOMICS */
13+
include { INFERNAL_CMSEARCH } from '../../../modules/ebi-metagenomics/infernal/cmsearch/main'
14+
include { INFERNAL_CMSCAN } from '../../../modules/ebi-metagenomics/infernal/cmscan/main'
15+
include { CONVERTCMSCANTOCMSEARCH } from '../../../modules/ebi-metagenomics/convertcmscantocmsearch/main'
16+
include { CMSEARCHTBLOUTDEOVERLAP } from '../../../modules/ebi-metagenomics/cmsearchtbloutdeoverlap/main'
17+
include { EASEL_ESLSFETCH } from '../../../modules/ebi-metagenomics/easel/eslsfetch/main'
18+
include { EXTRACTCOORDS } from '../../../modules/ebi-metagenomics/extractcoords/main'
1419

1520

1621
workflow DETECT_RNA {
1722

1823
take:
19-
ch_fasta // channel: [ val(meta), [ fasta ] ]
20-
rfam // folder: rfam for cmsearch/cmscan
21-
claninfo // file: claninfo for cmsearchtbloutdeoverlap
22-
mode // cmsearch/cmscan
24+
ch_fasta // channel: [ val(meta), [ fasta ] ]
25+
rfam // folder: rfam for cmsearch/cmscan
26+
claninfo // file: claninfo for cmsearchtbloutdeoverlap
27+
mode // cmsearch/cmscan
28+
separate_subunits // val: boolean (true: separate subunits (for Amplicon), false: don't separate (for ASA))
29+
chunk_flag // val: boolean (true: chunk (for ASA), false: no chunk (for Amplicon))
2330

2431
main:
2532

2633
ch_versions = Channel.empty()
2734
cmsearch_ch = Channel.empty()
2835

36+
ch_sequences = ch_fasta
37+
if (chunk_flag){
38+
// Chunk the fasta into smaller files; the chunk size is set via the SEQKIT_SPLIT2 `ext.args` (e.g. `--by-length 50M`)
39+
SEQKIT_SPLIT2(
40+
ch_fasta
41+
)
42+
ch_versions = ch_versions.mix(SEQKIT_SPLIT2.out.versions)
43+
44+
ch_sequences = SEQKIT_SPLIT2.out.reads.transpose()
45+
}
46+
2947
if ( mode == 'cmsearch' ) {
3048
INFERNAL_CMSEARCH(
31-
ch_fasta,
49+
ch_sequences,
3250
rfam
3351
)
3452
ch_versions = ch_versions.mix(INFERNAL_CMSEARCH.out.versions.first())
3553
cmsearch_ch = INFERNAL_CMSEARCH.out.cmsearch_tbl
3654
}
3755
else if (mode == 'cmscan') {
3856
INFERNAL_CMSCAN(
39-
ch_fasta,
57+
ch_sequences,
4058
rfam
4159
)
4260
ch_versions = ch_versions.mix(INFERNAL_CMSCAN.out.versions.first())
@@ -53,16 +71,42 @@ workflow DETECT_RNA {
5371
)
5472
ch_versions = ch_versions.mix(CMSEARCHTBLOUTDEOVERLAP.out.versions.first())
5573

74+
ch_cmsearchdeoverlap = CMSEARCHTBLOUTDEOVERLAP.out.cmsearch_tblout_deoverlapped
75+
76+
if (chunk_flag){
77+
CONCATENATE_CMSEARCH_DEOVERLAP(
78+
CMSEARCHTBLOUTDEOVERLAP.out.cmsearch_tblout_deoverlapped.groupTuple()
79+
)
80+
ch_versions = ch_versions.mix(CONCATENATE_CMSEARCH_DEOVERLAP.out.versions.first())
81+
ch_cmsearchdeoverlap = CONCATENATE_CMSEARCH_DEOVERLAP.out.file_out
82+
}
83+
5684
ch_easel = ch_fasta
57-
.join(CMSEARCHTBLOUTDEOVERLAP.out.cmsearch_tblout_deoverlapped)
85+
.join(ch_cmsearchdeoverlap)
5886
EASEL_ESLSFETCH(
5987
ch_easel
6088
)
6189
ch_versions = ch_versions.mix(EASEL_ESLSFETCH.out.versions.first())
6290

91+
EXTRACTCOORDS(
92+
EASEL_ESLSFETCH.out.easel_coords,
93+
EASEL_ESLSFETCH.out.matched_seqs_with_coords,
94+
separate_subunits
95+
)
96+
ch_versions = ch_versions.mix(EXTRACTCOORDS.out.versions.first())
97+
6398
emit:
64-
cmsearch_deoverlap_out = CMSEARCHTBLOUTDEOVERLAP.out.cmsearch_tblout_deoverlapped // channel: [ val(meta), [ deoverlapped ] ]
65-
easel_out = EASEL_ESLSFETCH.out.easel_coords // channel: [ val(meta), [ fasta ] ]
66-
versions = ch_versions // channel: [ versions.yml ]
99+
cmsearch_deoverlap_coords = CMSEARCHTBLOUTDEOVERLAP.out.cmsearch_tblout_deoverlapped // channel: [ val(meta), [ deoverlapped ] ]
100+
easel_coords = EASEL_ESLSFETCH.out.easel_coords // channel: [ val(meta), [ fasta ] ]
101+
ssu_fasta = EXTRACTCOORDS.out.ssu_fasta // channel: [ val(meta), [ fasta ] ]
102+
lsu_fasta = EXTRACTCOORDS.out.lsu_fasta // channel: [ val(meta), [ fasta ] ]
103+
rrna_bacteria = EXTRACTCOORDS.out.rrna_bacteria // channel: [ val(meta), [ fasta ] ]
104+
rrna_archaea = EXTRACTCOORDS.out.rrna_archaea // channel: [ val(meta), [ fasta ] ]
105+
eukarya = EXTRACTCOORDS.out.eukarya // channel: [ val(meta), [ fasta ] ]
106+
fiveS_fasta = EXTRACTCOORDS.out.fiveS_fasta // channel: [ val(meta), [ fasta ] ]
107+
five_eightS_fasta = EXTRACTCOORDS.out.five_eightS_fasta // channel: [ val(meta), [ fasta ] ]
108+
ncrna_fasta = EXTRACTCOORDS.out.ncrna_fasta // channel: [ val(meta), [ fasta ] ]
109+
concat_ssu_lsu_coords = EXTRACTCOORDS.out.concat_ssu_lsu_coords // channel: [ val(meta), [ txt ] ]
110+
versions = ch_versions // channel: [ versions.yml ]
67111
}
68112

subworkflows/ebi-metagenomics/detect_rna/meta.yml

Lines changed: 157 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json
1+
# yaml-language-server: $schema=https://raw.githubusercontent.com/ebi-metagenomics/nf-modules/master/subworkflows/yaml-schema.json
22
name: "detect_rna"
33
description: Extraction of specific cmsearch-identified RNA sequences from a fasta
44
file using EASEL
@@ -10,13 +10,23 @@ keywords:
1010
- cmscan
1111
- covariance models
1212
components:
13+
- seqkit/split2:
14+
git_remote: https://github.com/nf-core/modules.git
15+
- cat/cat:
16+
git_remote: https://github.com/nf-core/modules.git
1317
- infernal/cmsearch
1418
- infernal/cmscan
1519
- convertcmscantocmsearch
1620
- cmsearchtbloutdeoverlap
1721
- easel/eslsfetch
22+
- extractcoords
1823
input:
19-
- ch_fasta:
24+
- meta:
25+
type: map
26+
description: |
27+
Groovy Map containing sample information
28+
e.g. `[ id:'sample1', single_end:false ]`
29+
ch_fasta:
2030
type: file
2131
description: |
2232
The input channel containing the fasta files
@@ -36,16 +46,157 @@ input:
3646
- mode:
3747
type: value
3848
description: choose cmsearch or cmscan method to use
49+
- separate_subunits:
50+
type: boolean
51+
description: Specify true to separate hits into the different RNA subunits
52+
- chunk_flag:
53+
type: boolean
54+
description: |
55+
Specify true to use seqkit/split2 to chunk contigs into sequences of specific length e.g. 50M.
56+
IMPORTANT NOTE, YOU HAVE TO SPECIFY CHUNK LENGTH USING `ext.args`, e.g. `--by-length 50M`.
57+
See the nextflow.config of the unit test for a full example
3958
output:
4059
- versions:
4160
type: file
4261
description: |
4362
File containing software versions
4463
Structure: [ path(versions.yml) ]
4564
pattern: "versions.yml"
46-
- cmsearch_deoverlap_out:
47-
description: ""
48-
- easel_out:
49-
description: ""
65+
- cmsearch_deoverlap_coords:
66+
description: |
67+
Channel containing deoverlapped cmsearch .tblout files
68+
Structure: [ val(meta), path("*.tblout.deoverlapped") ]
69+
meta:
70+
type: map
71+
description: |
72+
Groovy Map containing sample information
73+
e.g. `[ id:'sample1', single_end:false ]`
74+
"*.tblout.deoverlapped":
75+
type: file
76+
description: Deoverlapped .tblout file
77+
pattern: "*.tblout.deoverlapped"
78+
- easel_coords:
79+
description: |
80+
Channel containing fasta output from esl-sfetch
81+
Structure: [ val(meta), path("*.fasta") ]
82+
meta:
83+
type: map
84+
description: |
85+
Groovy Map containing sample information
86+
e.g. `[ id:'sample1', single_end:false ]`
87+
"*.fasta":
88+
type: file
89+
description: Fasta file output from running esl-sfetch to extract sequences by name
90+
pattern: "*.{fasta}"
91+
- ssu_fasta:
92+
description: |
93+
Channel containing SSU fasta sequences
94+
Structure: [ val(meta), path("sequence-categorisation/*SSU.fasta") ]
95+
meta:
96+
type: map
97+
description: |
98+
Groovy Map containing sample information
99+
e.g. `[ id:'sample1', single_end:false ]`
100+
"sequence-categorisation/*SSU.fasta":
101+
type: file
102+
description: Fasta file containing the SSU sequences
103+
pattern: "*.fasta"
104+
ontologies: []
105+
- lsu_fasta:
106+
description: |
107+
Channel containing LSU fasta sequences
108+
Structure: [ val(meta), path("sequence-categorisation/*LSU.fasta") ]
109+
meta:
110+
type: map
111+
description: |
112+
Groovy Map containing sample information
113+
e.g. `[ id:'sample1', single_end:false ]`
114+
"sequence-categorisation/*LSU.fasta":
115+
type: file
116+
description: Fasta file containing the LSU sequences
117+
pattern: "*.fasta"
118+
ontologies: []
119+
- rrna_bacteria:
120+
description: |
121+
Channel containing bacterial rRNA sequences
122+
Structure: [ val(meta), path("sequence-categorisation/*rRNA_bacteria*.fasta") ]
123+
meta:
124+
type: map
125+
description: |
126+
Groovy Map containing sample information
127+
e.g. `[ id:'sample1', single_end:false ]`
128+
"sequence-categorisation/*rRNA_bacteria*.fasta":
129+
type: file
130+
description: Fasta file containing bacterial rRNA
131+
pattern: "*.fasta"
132+
ontologies: []
133+
- rrna_archaea:
134+
description: |
135+
Channel containing archaeal rRNA sequences
136+
Structure: [ val(meta), path("sequence-categorisation/*rRNA_archaea*.fasta") ]
137+
meta:
138+
type: map
139+
description: |
140+
Groovy Map containing sample information
141+
e.g. `[ id:'sample1', single_end:false ]`
142+
"sequence-categorisation/*rRNA_archaea*.fasta":
143+
type: file
144+
description: Fasta file containing archaeal rRNA
145+
pattern: "*.fasta"
146+
ontologies: []
147+
- eukarya:
148+
description: |
149+
Channel containing eukaryotic rRNA sequences
150+
Structure: [ val(meta), path("sequence-categorisation/*rRNA_eukarya*.fasta") ]
151+
meta:
152+
type: map
153+
description: |
154+
Groovy Map containing sample information
155+
e.g. `[ id:'sample1', single_end:false ]`
156+
"sequence-categorisation/*rRNA_eukarya*.fasta":
157+
type: file
158+
description: Fasta file containing eukaryotic rRNA
159+
pattern: "*.fasta"
160+
ontologies: []
161+
- fiveS_fasta:
162+
description: |
163+
Channel containing 5S rRNA sequences
164+
Structure: [ val(meta), path("sequence-categorisation/*5S.fasta") ]
165+
meta:
166+
type: map
167+
description: |
168+
Groovy Map containing sample information
169+
e.g. `[ id:'sample1', single_end:false ]`
170+
"sequence-categorisation/*5S.fasta":
171+
type: file
172+
description: "5S rRNA nucleotide sequences"
173+
ontologies: []
174+
- five_eightS_fasta:
175+
description: |
176+
Channel containing 5.8S rRNA sequences
177+
Structure: [ val(meta), path("sequence-categorisation/*5_8S.fasta") ]
178+
meta:
179+
type: map
180+
description: |
181+
Groovy Map containing sample information
182+
e.g. `[ id:'sample1', single_end:false ]`
183+
"sequence-categorisation/*5_8S.fasta":
184+
type: file
185+
description: "5.8S rRNA nucleotide sequences"
186+
ontologies: []
187+
- ncrna_fasta:
188+
description: |
189+
Channel containing non-coding RNA sequences
190+
Structure: [ val(meta), path("sequence-categorisation/*other_ncRNA.fasta") ]
191+
meta:
192+
type: map
193+
description: |
194+
Groovy Map containing sample information
195+
e.g. `[ id:'sample1', single_end:false ]`
196+
"sequence-categorisation/*other_ncRNA.fasta":
197+
type: file
198+
description: "non-coding RNA nucleotide sequences"
199+
ontologies: []
200+
50201
authors:
51202
- "@Kate_Sakharova"

0 commit comments

Comments
 (0)