Merge pull request #908 from drpatelh/updates

drpatelh · web-flow · commit 59a964867554 · 2022-12-19T10:09:46.000Z
Closing #896 #897 #900 #902 #907
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,12 +9,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Bump minimum Nextflow version from `21.10.3` -> `22.10.1`
 - Updated pipeline template to [nf-core/tools 2.7.1](https://github.com/nf-core/tools/releases/tag/2.7.1)
+- [[#896](https://github.com/nf-core/rnaseq/issues/896)] - Remove `copyTo` call for iGenomes README
+- [[#897](https://github.com/nf-core/rnaseq/issues/897)] - Use `--skip_preseq` by default
+- [[#900](https://github.com/nf-core/rnaseq/issues/900)] - Add `--recursive` option to `fastq_dir_to_samplesheet.py` script
+- [[#902](https://github.com/nf-core/rnaseq/issues/902)] - `check_samplesheet.py` script doesn't output optional columns in samplesheet
+- [[#907](https://github.com/nf-core/rnaseq/issues/907)] - Add `--extra_star_align_args` and `--extra_salmon_quant_args` parameters
 
 ### Parameters
 
-| Old parameter    | New parameter |
-| ---------------- | ------------- |
-| `--enable_conda` |               |
+| Old parameter    | New parameter               |
+| ---------------- | --------------------------- |
+| `--enable_conda` |                             |
+|                  | `--extra_star_align_args`   |
+|                  | `--extra_salmon_quant_args` |
 
 > **NB:** Parameter has been **updated** if both old and new parameter information is present.
 > **NB:** Parameter has been **added** if just the new parameter information is present.
diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
@@ -70,7 +70,7 @@ def check_samplesheet(file_in, file_out):
                         line,
                     )
 
-                num_cols = len([x for x in lspl if x])
+                num_cols = len([x for x in lspl[: len(HEADER)] if x])
                 if num_cols < MIN_COLS:
                     print_error(
                         f"Invalid number of populated columns (minimum = {MIN_COLS})!",
@@ -124,6 +124,7 @@ def check_samplesheet(file_in, file_out):
                     print_error("Invalid combination of columns provided!", "Line", line)
 
                 ## Create sample mapping dictionary = {sample: [[ single_end, fastq_1, fastq_2, strandedness ]]}
+                sample_info = sample_info + lspl[len(HEADER) :]
                 if sample not in sample_mapping_dict:
                     sample_mapping_dict[sample] = [sample_info]
                 else:
@@ -137,7 +138,9 @@ def check_samplesheet(file_in, file_out):
         out_dir = os.path.dirname(file_out)
         make_dir(out_dir)
         with open(file_out, "w") as fout:
-            fout.write(",".join(["sample", "single_end", "fastq_1", "fastq_2", "strandedness"]) + "\n")
+            fout.write(
+                ",".join(["sample", "single_end", "fastq_1", "fastq_2", "strandedness"] + header[len(HEADER) :]) + "\n"
+            )
             for sample in sorted(sample_mapping_dict.keys()):
 
                 ## Check that multiple runs of the same sample are of the same datatype i.e. single-end / paired-end
@@ -149,7 +152,7 @@ def check_samplesheet(file_in, file_out):
                     )
 
                 ## Check that multiple runs of the same sample are of the same strandedness
-                if not all(x[-1] == sample_mapping_dict[sample][0][-1] for x in sample_mapping_dict[sample]):
+                if not all(x[3] == sample_mapping_dict[sample][0][3] for x in sample_mapping_dict[sample]):
                     print_error(
                         f"Multiple runs of a sample must have the same strandedness!",
                         "Sample",
diff --git a/bin/fastq_dir_to_samplesheet.py b/bin/fastq_dir_to_samplesheet.py
@@ -67,6 +67,13 @@ def parse_args(args=None):
         default=1,
         help="After splitting FastQ file name by --sanitise_name_delimiter all elements before this index (1-based) will be joined to create final sample name.",
     )
+    parser.add_argument(
+        "-re",
+        "--recursive",
+        dest="RECURSIVE",
+        action="store_true",
+        help="Whether or not to search for FastQ files recursively in <FASTQ_DIR>.",
+    )
     return parser.parse_args(args)
 
 
@@ -80,6 +87,7 @@ def fastq_dir_to_samplesheet(
     sanitise_name=False,
     sanitise_name_delimiter="_",
     sanitise_name_index=1,
+    recursive=False,
 ):
     def sanitize_sample(path, extension):
         """Retrieve sample id from filename"""
@@ -90,27 +98,30 @@ def sanitize_sample(path, extension):
             )
         return sample
 
-    def get_fastqs(extension):
+    def get_fastqs(extension, recursive=False):
         """
         Needs to be sorted to ensure R1 and R2 are in the same order
         when merging technical replicates. Glob is not guaranteed to produce
         sorted results.
         See also https://stackoverflow.com/questions/6773584/how-is-pythons-glob-glob-ordered
         """
-        return sorted(glob.glob(os.path.join(fastq_dir, f"*{extension}"), recursive=False))
+        search_path = f"*{extension}"
+        if recursive:
+            search_path = f"**/*{extension}"
+        return sorted(glob.glob(os.path.join(fastq_dir, search_path), recursive=recursive))
 
     read_dict = {}
 
     ## Get read 1 files
-    for read1_file in get_fastqs(read1_extension):
+    for read1_file in get_fastqs(read1_extension, recursive):
         sample = sanitize_sample(read1_file, read1_extension)
         if sample not in read_dict:
             read_dict[sample] = {"R1": [], "R2": []}
         read_dict[sample]["R1"].append(read1_file)
 
     ## Get read 2 files
     if not single_end:
-        for read2_file in get_fastqs(read2_extension):
+        for read2_file in get_fastqs(read2_extension, recursive):
             sample = sanitize_sample(read2_file, read2_extension)
             read_dict[sample]["R2"].append(read2_file)
 
@@ -157,6 +168,7 @@ def main(args=None):
         sanitise_name=args.SANITISE_NAME,
         sanitise_name_delimiter=args.SANITISE_NAME_DELIMITER,
         sanitise_name_index=args.SANITISE_NAME_INDEX,
+        recursive=args.RECURSIVE,
     )
 
 
diff --git a/conf/modules.config b/conf/modules.config
@@ -517,7 +517,8 @@ if (!params.skip_alignment && params.aligner == 'star_salmon') {
                 '--outSAMattributes NH HI AS NM MD',
                 '--quantTranscriptomeBan Singleend',
                 '--outSAMstrandField intronMotif',
-                params.save_unaligned ? '--outReadsUnmapped Fastx' : ''
+                params.save_unaligned ? '--outReadsUnmapped Fastx' : '',
+                params.extra_star_align_args ?: ''
             ].join(' ').trim()
             publishDir = [
                 [
@@ -541,6 +542,7 @@ if (!params.skip_alignment && params.aligner == 'star_salmon') {
         }
 
         withName: '.*:QUANTIFY_STAR_SALMON:SALMON_QUANT' {
+            ext.args   = params.extra_salmon_quant_args ?: ''
             publishDir = [
                 path: { "${params.outdir}/${params.aligner}" },
                 mode: params.publish_dir_mode,
@@ -1045,6 +1047,7 @@ if (!params.skip_multiqc) {
 if (params.pseudo_aligner == 'salmon') {
     process {
         withName: '.*:QUANTIFY_SALMON:SALMON_QUANT' {
+            ext.args   = params.extra_salmon_quant_args ?: ''
             publishDir = [
                 path: { "${params.outdir}/${params.pseudo_aligner}" },
                 mode: params.publish_dir_mode,
diff --git a/nextflow.config b/nextflow.config
@@ -68,6 +68,8 @@ params {
     hisat2_build_memory        = '200.GB'  // Amount of memory required to build HISAT2 index with splice sites
     stringtie_ignore_gtf       = false
     min_mapped_reads           = 5
+    extra_star_align_args      = null
+    extra_salmon_quant_args    = null
     save_merged_fastq          = false
     save_unaligned             = false
     save_align_intermeds       = false
@@ -79,7 +81,7 @@ params {
     skip_bigwig                = false
     skip_stringtie             = false
     skip_fastqc                = false
-    skip_preseq                = false
+    skip_preseq                = true
     skip_dupradar              = false
     skip_qualimap              = false
     skip_rseqc                 = false
diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -424,6 +424,16 @@
                     "description": "Perform reference-guided de novo assembly of transcripts using StringTie i.e. dont restrict to those in GTF file.",
                     "fa_icon": "fas fa-ban"
                 },
+                "extra_star_align_args": {
+                    "type": "string",
+                    "description": "Extra arguments to pass to STAR alignment command in addition to defaults defined by the pipeline.",
+                    "fa_icon": "fas fa-plus"
+                },
+                "extra_salmon_quant_args": {
+                    "type": "string",
+                    "description": "Extra arguments to pass to Salmon quant command in addition to defaults defined by the pipeline.",
+                    "fa_icon": "fas fa-plus"
+                },
                 "save_unaligned": {
                     "type": "boolean",
                     "fa_icon": "fas fa-save",
@@ -485,7 +495,8 @@
                 "skip_preseq": {
                     "type": "boolean",
                     "description": "Skip Preseq.",
-                    "fa_icon": "fas fa-fast-forward"
+                    "fa_icon": "fas fa-fast-forward",
+                    "default": true
                 },
                 "skip_dupradar": {
                     "type": "boolean",
diff --git a/workflows/rnaseq.nf b/workflows/rnaseq.nf
@@ -56,13 +56,6 @@ if (params.bam_csi_index) {
     }
 }
 
-// Save AWS IGenomes file containing annotation version
-def anno_readme = params.genomes[ params.genome ]?.readme
-if (anno_readme && file(anno_readme).exists()) {
-    file("${params.outdir}/genome/").mkdirs()
-    file(anno_readme).copyTo("${params.outdir}/genome/")
-}
-
 // Stage dummy file to be used as an optional input where required
 ch_dummy_file = file("$projectDir/assets/dummy_file.txt", checkIfExists: true)
 

Original file line number	Diff line number	Diff line change
`@@ -56,13 +56,6 @@ if (params.bam_csi_index) {`
`56`	`56`	`}`
`57`	`57`	`}`
`58`	`58`
`59`		`-// Save AWS IGenomes file containing annotation version`
`60`		`-def anno_readme = params.genomes[ params.genome ]?.readme`
`61`		`-if (anno_readme && file(anno_readme).exists()) {`
`62`		`- file("${params.outdir}/genome/").mkdirs()`
`63`		`- file(anno_readme).copyTo("${params.outdir}/genome/")`
`64`		`-}`
`65`		`-`
`66`	`59`	`// Stage dummy file to be used as an optional input where required`
`67`	`60`	`ch_dummy_file = file("$projectDir/assets/dummy_file.txt", checkIfExists: true)`
`68`	`61`