EBI-Metagenomics
diff --git a/‎.github/workflows/nf-test.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/nf-test.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎nf-test.config‎
Lines changed: 1 addition & 0 deletions b/‎nf-test.config‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎subworkflows/ebi-metagenomics/detect_rna/main.nf‎
Lines changed: 59 additions & 15 deletions b/‎subworkflows/ebi-metagenomics/detect_rna/main.nf‎
Lines changed: 59 additions & 15 deletions
diff --git a/‎subworkflows/ebi-metagenomics/detect_rna/meta.yml‎
Lines changed: 157 additions & 6 deletions b/‎subworkflows/ebi-metagenomics/detect_rna/meta.yml‎
Lines changed: 157 additions & 6 deletions
@@ -95,7 +95,7 @@ jobs:
             ${{ runner.os }}-pip-pdiff
 
       - name: Install Python dependencies
-        run: python -m pip install --upgrade pip pdiff cryptography
+        run: python -m pip install --upgrade pip pdiff cryptography nf-core
 
       # Test the module
       - name: Run nf-test
 
@@ -17,5 +17,6 @@ config {
     plugins {
         load "nft-fasta@1.0.0"
         load "nft-bam@0.5.0"
+        load "nft-utils@0.0.7"
     }
 }
@@ -5,38 +5,56 @@
 // Important note: .cm file should be cmpress-ed before execution
 // Use cmsearch mode if input fasta is massive and models file contains chosen set of models (usecase: ASA)
 
+/* NF-CORE */
+include { SEQKIT_SPLIT2                             } from '../../../modules/nf-core/seqkit/split2/main'
+include { CAT_CAT as CONCATENATE_CMSEARCH_DEOVERLAP } from '../../../modules/nf-core/cat/cat/main'
 
-include { INFERNAL_CMSEARCH           } from '../../../modules/ebi-metagenomics/infernal/cmsearch/main'
-include { INFERNAL_CMSCAN             } from '../../../modules/ebi-metagenomics/infernal/cmscan/main'
-include { CONVERTCMSCANTOCMSEARCH     } from '../../../modules/ebi-metagenomics/convertcmscantocmsearch/main'
-include { CMSEARCHTBLOUTDEOVERLAP     } from '../../../modules/ebi-metagenomics/cmsearchtbloutdeoverlap/main'
-include { EASEL_ESLSFETCH             } from '../../../modules/ebi-metagenomics/easel/eslsfetch/main'
+/* EBI-METAGENOMICS */
+include { INFERNAL_CMSEARCH                         } from '../../../modules/ebi-metagenomics/infernal/cmsearch/main'
+include { INFERNAL_CMSCAN                           } from '../../../modules/ebi-metagenomics/infernal/cmscan/main'
+include { CONVERTCMSCANTOCMSEARCH                   } from '../../../modules/ebi-metagenomics/convertcmscantocmsearch/main'
+include { CMSEARCHTBLOUTDEOVERLAP                   } from '../../../modules/ebi-metagenomics/cmsearchtbloutdeoverlap/main'
+include { EASEL_ESLSFETCH                           } from '../../../modules/ebi-metagenomics/easel/eslsfetch/main'
+include { EXTRACTCOORDS                             } from '../../../modules/ebi-metagenomics/extractcoords/main'
 
 
 workflow DETECT_RNA {
 
     take:
-    ch_fasta     // channel: [ val(meta), [ fasta ] ]
-    rfam         // folder: rfam for cmsearch/cmscan
-    claninfo     // file: claninfo for cmsearchtbloutdeoverlap
-    mode         // cmsearch/cmscan
+    ch_fasta          // channel: [ val(meta), [ fasta ] ]
+    rfam              // folder: rfam for cmsearch/cmscan
+    claninfo          // file: claninfo for cmsearchtbloutdeoverlap
+    mode              // cmsearch/cmscan
+    separate_subunits // val: boolean (true: separate subunits (for Amplicon), false: don't separate (for ASA))
+    chunk_flag        // val: boolean (true: chunk (for ASA), false: no chunk (for Amplicon))
 
     main:
 
     ch_versions = Channel.empty()
     cmsearch_ch = Channel.empty()
 
+    ch_sequences = ch_fasta
+    if (chunk_flag){
+        // Chunk the fasta with seqkit split2; the chunk size must be set via `ext.args` (e.g. `--by-length 50M`)
+        SEQKIT_SPLIT2(
+            ch_fasta
+        )
+        ch_versions = ch_versions.mix(SEQKIT_SPLIT2.out.versions)
+
+        ch_sequences = SEQKIT_SPLIT2.out.reads.transpose()
+    }
+
     if ( mode == 'cmsearch' ) {
         INFERNAL_CMSEARCH(
-            ch_fasta,
+            ch_sequences,
             rfam
         )
         ch_versions = ch_versions.mix(INFERNAL_CMSEARCH.out.versions.first())
         cmsearch_ch = INFERNAL_CMSEARCH.out.cmsearch_tbl
     }
     else if (mode == 'cmscan') {
        INFERNAL_CMSCAN(
-            ch_fasta,
+            ch_sequences,
             rfam
        )
        ch_versions = ch_versions.mix(INFERNAL_CMSCAN.out.versions.first())
@@ -53,16 +71,42 @@ workflow DETECT_RNA {
     )
     ch_versions = ch_versions.mix(CMSEARCHTBLOUTDEOVERLAP.out.versions.first())
 
+    ch_cmsearchdeoverlap = CMSEARCHTBLOUTDEOVERLAP.out.cmsearch_tblout_deoverlapped
+
+    if (chunk_flag){
+        CONCATENATE_CMSEARCH_DEOVERLAP(
+            CMSEARCHTBLOUTDEOVERLAP.out.cmsearch_tblout_deoverlapped.groupTuple()
+        )
+        ch_versions = ch_versions.mix(CONCATENATE_CMSEARCH_DEOVERLAP.out.versions.first())
+        ch_cmsearchdeoverlap = CONCATENATE_CMSEARCH_DEOVERLAP.out.file_out
+    }
+
     ch_easel = ch_fasta
-                .join(CMSEARCHTBLOUTDEOVERLAP.out.cmsearch_tblout_deoverlapped)
+                .join(ch_cmsearchdeoverlap)
     EASEL_ESLSFETCH(
         ch_easel
     )
     ch_versions = ch_versions.mix(EASEL_ESLSFETCH.out.versions.first())
 
+    EXTRACTCOORDS(
+        EASEL_ESLSFETCH.out.easel_coords,
+        EASEL_ESLSFETCH.out.matched_seqs_with_coords,
+        separate_subunits
+    )
+    ch_versions = ch_versions.mix(EXTRACTCOORDS.out.versions.first())
+
     emit:
-    cmsearch_deoverlap_out = CMSEARCHTBLOUTDEOVERLAP.out.cmsearch_tblout_deoverlapped   // channel: [ val(meta), [ deoverlapped ] ]
-    easel_out              = EASEL_ESLSFETCH.out.easel_coords                           // channel: [ val(meta), [ fasta ] ]
-    versions               = ch_versions                                                // channel: [ versions.yml ]
+    cmsearch_deoverlap_coords = CMSEARCHTBLOUTDEOVERLAP.out.cmsearch_tblout_deoverlapped   // channel: [ val(meta), [ deoverlapped ] ]
+    easel_coords              = EASEL_ESLSFETCH.out.easel_coords                           // channel: [ val(meta), [ fasta ] ]
+    ssu_fasta                 = EXTRACTCOORDS.out.ssu_fasta                                // channel: [ val(meta), [ fasta ] ]
+    lsu_fasta                 = EXTRACTCOORDS.out.lsu_fasta                                // channel: [ val(meta), [ fasta ] ]
+    rrna_bacteria             = EXTRACTCOORDS.out.rrna_bacteria                            // channel: [ val(meta), [ fasta ] ]
+    rrna_archaea              = EXTRACTCOORDS.out.rrna_archaea                             // channel: [ val(meta), [ fasta ] ]
+    eukarya                   = EXTRACTCOORDS.out.eukarya                                  // channel: [ val(meta), [ fasta ] ]
+    fiveS_fasta               = EXTRACTCOORDS.out.fiveS_fasta                              // channel: [ val(meta), [ fasta ] ]
+    five_eightS_fasta         = EXTRACTCOORDS.out.five_eightS_fasta                        // channel: [ val(meta), [ fasta ] ]
+    ncrna_fasta               = EXTRACTCOORDS.out.ncrna_fasta                              // channel: [ val(meta), [ fasta ] ]
+    concat_ssu_lsu_coords     = EXTRACTCOORDS.out.concat_ssu_lsu_coords                    // channel: [ val(meta), [ txt ] ]
+    versions                  = ch_versions                                                // channel: [ versions.yml ]
 }
 
@@ -1,4 +1,4 @@
-# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json
+# yaml-language-server: $schema=https://raw.githubusercontent.com/ebi-metagenomics/nf-modules/master/subworkflows/yaml-schema.json
 name: "detect_rna"
 description: Extraction of specific cmsearch-identified RNA sequences from a fasta
   file using EASEL
@@ -10,13 +10,23 @@ keywords:
   - cmscan
   - covariance models
 components:
+  - seqkit/split2:
+    git_remote: https://github.com/nf-core/modules.git
+  - cat/cat:
+    git_remote: https://github.com/nf-core/modules.git
   - infernal/cmsearch
   - infernal/cmscan
   - convertcmscantocmsearch
   - cmsearchtbloutdeoverlap
   - easel/eslsfetch
+  - extractcoords
 input:
-  - ch_fasta:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. `[ id:'sample1', single_end:false ]`
+    ch_fasta:
       type: file
       description: |
         The input channel containing the fasta files
@@ -36,16 +46,157 @@ input:
   - mode:
       type: value
       description: choose cmsearch or cmscan method to use
+  - separate_subunits:
+      type: boolean
+      description: Specify true to separate hits into the different RNA subunits
+  - chunk_flag:
+      type: boolean
+      description: |
+        Specify true to use seqkit/split2 to split the input fasta into chunks of a given size, e.g. 50M.
+        IMPORTANT: the chunk size must be specified using `ext.args`, e.g. `--by-length 50M`.
+        See the nextflow.config of the unit tests for a full example.
 output:
   - versions:
       type: file
       description: |
         File containing software versions
         Structure: [ path(versions.yml) ]
       pattern: "versions.yml"
-  - cmsearch_deoverlap_out:
-      description: ""
-  - easel_out:
-      description: ""
+  - cmsearch_deoverlap_coords:
+      description: |
+        Channel containing deoverlapped cmsearch .tblout files
+        Structure: [ val(meta), path("*.tblout.deoverlapped") ]
+      meta:
+        type: map
+        description: |
+          Groovy Map containing sample information
+          e.g. `[ id:'sample1', single_end:false ]`
+      "*.tblout.deoverlapped":
+        type: file
+        description: Deoverlapped .tblout file
+        pattern: "*.tblout.deoverlapped"
+  - easel_coords:
+      description: |
+        Channel containing fasta output from esl-sfetch
+        Structure: [ val(meta), path("*.fasta") ]
+      meta:
+        type: map
+        description: |
+          Groovy Map containing sample information
+          e.g. `[ id:'sample1', single_end:false ]`
+      "*.fasta":
+        type: file
+        description: Fasta file output from running esl-sfetch to extract sequences by name
+        pattern: "*.{fasta}"
+  - ssu_fasta:
+      description: |
+        Channel containing SSU fasta sequences
+        Structure: [ val(meta), path("sequence-categorisation/*SSU.fasta") ]
+      meta:
+        type: map
+        description: |
+          Groovy Map containing sample information
+          e.g. `[ id:'sample1', single_end:false ]`
+      "sequence-categorisation/*SSU.fasta":
+        type: file
+        description: Fasta file containing the SSU sequences
+        pattern: "*.fasta"
+        ontologies: []
+  - lsu_fasta:
+      description: |
+        Channel containing LSU fasta sequences
+        Structure: [ val(meta), path("sequence-categorisation/*LSU.fasta") ]
+      meta:
+        type: map
+        description: |
+          Groovy Map containing sample information
+          e.g. `[ id:'sample1', single_end:false ]`
+      "sequence-categorisation/*LSU.fasta":
+        type: file
+        description: Fasta file containing the LSU sequences
+        pattern: "*.fasta"
+        ontologies: []
+  - rrna_bacteria:
+      description: |
+        Channel containing bacterial rRNA sequences
+        Structure: [ val(meta), path("sequence-categorisation/*rRNA_bacteria*.fasta") ]
+      meta:
+        type: map
+        description: |
+          Groovy Map containing sample information
+          e.g. `[ id:'sample1', single_end:false ]`
+      "sequence-categorisation/*rRNA_bacteria*.fasta":
+        type: file
+        description: Fasta file containing bacterial rRNA
+        pattern: "*.fasta"
+        ontologies: []
+  - rrna_archaea:
+      description: |
+        Channel containing archaeal rRNA sequences
+        Structure: [ val(meta), path("sequence-categorisation/*rRNA_archaea*.fasta") ]
+      meta:
+        type: map
+        description: |
+          Groovy Map containing sample information
+          e.g. `[ id:'sample1', single_end:false ]`
+      "sequence-categorisation/*rRNA_archaea*.fasta":
+        type: file
+        description: Fasta file containing archaeal rRNA
+        pattern: "*.fasta"
+        ontologies: []
+  - eukarya:
+      description: |
+        Channel containing eukaryotic rRNA sequences
+        Structure: [ val(meta), path("sequence-categorisation/*rRNA_eukarya*.fasta") ]
+      meta:
+        type: map
+        description: |
+          Groovy Map containing sample information
+          e.g. `[ id:'sample1', single_end:false ]`
+      "sequence-categorisation/*rRNA_eukarya*.fasta":
+        type: file
+        description: Fasta file containing eukaryotic rRNA
+        pattern: "*.fasta"
+        ontologies: []
+  - fiveS_fasta:
+      description: |
+        Channel containing 5S rRNA sequences
+        Structure: [ val(meta), path("sequence-categorisation/*5S.fasta") ]
+      meta:
+        type: map
+        description: |
+          Groovy Map containing sample information
+          e.g. `[ id:'sample1', single_end:false ]`
+      "sequence-categorisation/*5S.fasta":
+        type: file
+        description: "5S rRNA nucleotide sequences"
+        ontologies: []
+  - five_eightS_fasta:
+      description: |
+        Channel containing 5.8S rRNA sequences
+        Structure: [ val(meta), path("sequence-categorisation/*5_8S.fasta") ]
+      meta:
+        type: map
+        description: |
+          Groovy Map containing sample information
+          e.g. `[ id:'sample1', single_end:false ]`
+      "sequence-categorisation/*5_8S.fasta":
+        type: file
+        description: "5.8S rRNA nucleotide sequences"
+        ontologies: []
+  - ncrna_fasta:
+      description: |
+        Channel containing non-coding RNA sequences
+        Structure: [ val(meta), path("sequence-categorisation/*other_ncRNA.fasta") ]
+      meta:
+        type: map
+        description: |
+          Groovy Map containing sample information
+          e.g. `[ id:'sample1', single_end:false ]`
+      "sequence-categorisation/*other_ncRNA.fasta":
+        type: file
+        description: "non-coding RNA nucleotide sequences"
+        ontologies: []
+
 authors:
   - "@Kate_Sakharova"
Original file line number	Diff line number	Diff line change
`@@ -17,5 +17,6 @@ config {`
`17`	`17`	`plugins {`
`18`	`18`	`load "nft-fasta@1.0.0"`
`19`	`19`	`load "nft-bam@0.5.0"`
	`20`	`+ load "nft-utils@0.0.7"`
`20`	`21`	`}`
`21`	`22`	`}`