diff --git a/RNAseq/Workflow_Documentation/NF_RCP/CHANGELOG.md b/RNAseq/Workflow_Documentation/NF_RCP/CHANGELOG.md
index 0cff3c98..7ecfd7f1 100644
--- a/RNAseq/Workflow_Documentation/NF_RCP/CHANGELOG.md
+++ b/RNAseq/Workflow_Documentation/NF_RCP/CHANGELOG.md
@@ -5,6 +5,14 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## [2.0.1](https://github.com/nasa/GeneLab_Data_Processing/tree/NF_RCP_2.0.1/RNAseq/Workflow_Documentation/NF_RCP) - 2025-07-02
+
+### Fixed
+
+- Fixed fastqc metrics extraction in `parse_multiqc.py` script
+ - Added qc file validation output listing missing entries
+ - Updated multiqc parsing for fastqc metrics
+
## [2.0.0](https://github.com/nasa/GeneLab_Data_Processing/tree/NF_RCP_2.0.0/RNAseq/Workflow_Documentation/NF_RCP) - 2025-04-10
### Added
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/QC_metrics_README.md b/RNAseq/Workflow_Documentation/NF_RCP/QC_metrics_README.md
new file mode 100644
index 00000000..7b826adc
--- /dev/null
+++ b/RNAseq/Workflow_Documentation/NF_RCP/QC_metrics_README.md
@@ -0,0 +1,213 @@
+# QC Metrics Summary Specification
+
+## Description
+
+The qc_metrics summary file is a comma-separated file that lists a summary of metadata and qc
+metrics for a dataset created after processing is complete. [Metadata](#metadata-fields) for the
+sample and assay are pulled from the ISA.zip file. QC metrics include
+[raw and trimmed read metrics](#read-qc-metrics), [alignment metrics](#alignment-metrics),
+[gene count metrics](#gene-count-metrics), and [RSeQC metrics](#rseqc-metrics) are pulled from the
+MultiQC reports generated during processing.
+
+> **NOTE:** Eukaryotic and prokaryotic data use different tools for alignment and gene counting, so
+the metrics reported will differ for those data types. Similarly, paired-end data will include
+metrics for the reverse read, while single-end data will not. All data columns will be present in
+the output files regardless of data type. Any fields that are not relevant for a particular data
+type will be left empty.
+
+
+
+
+-----------
+### Metadata fields
+*Source: ISA.zip*
+List selected metadata fields for each sample
+
+| Column Name | Data Source | Description | Example(s) |
+|:----------------------|:--------------------|:----------------------------------------------------------------------------------------------------------|:-----------------------------------------|
+| osd_num | investigation table | Study identifier | "OSD-515" |
+| sample | sample table | Sample Identifier | "RR23_TMS_FLT_F1" |
+| organism | sample table | Genus and Species of primary organism under study | "Mus musculus" |
+| tissue | sample table | 'Material Type' listed in sample table (e.g. organ, tissue, cell culture, whole organism) | "Cells, cultured", "thymus" |
+| sequencing_instrument | assay table | Sequencer used to sequence the samples | "Illumina NovaSeq 6000" |
+| library_selection | assay table | Biomolecule selection (rRNA depletion, mRNA enrichment, target amplification) | "Ribo-depletion", "polyA enrichment" |
+| library_layout | assay table | RNA library construction parameter indicating whether the library was sequenced single- or paired-end | "SINGLE", "PAIRED" |
+| strandedness | assay table | RNAseq library construction parameter indicating whether the library preserves the transcript orientation | "STRANDED", "UNSTRANDED" |
+| read_depth | assay table | Total number of sequenced fragments | 82186286 |
+| read_length | assay table | Raw data read length in bases | 151 |
+| rrna_contamination | assay table | Percent rRNA found in the data | 2.74 |
+| rin | assay table | RNA integrity number | 7.4 |
+| organism_part | sample table | For plant samples, the part of the plant from which the sample was derived | "Plant Roots" |
+| cell_line | sample table | Cell line identifier | "IMR90 hiPSC" |
+| cell_type | sample table | The type of cell(s) used in the study | "Myocytes, Cardiac" |
+| secondary_organism | sample table | Additional organism(s) which may confound assay measurements (e.g. food source, host, symbiote) | "Vibrio fischeri ES114" |
+| strain | sample table | Strain, breed, ecotype, etc. | "C57BL/6J" |
+| animal_source | sample table | The source of the animal(s) used in the study | "Jackson Laboratory" |
+| seed_source | sample table | The source of the seed(s) used in the study | "Arabidopsis Biological Resource Center" |
+| source_accession | sample table | The source accession number for the animal(s)/seed(s)/cell(s) used in the study | "SALK_027956C" |
+| mix | assay table | The ERCC spike-in mix number used | "Mix 1" |
+
+
+
+
+-----------
+### Read QC metrics
+*Source: Raw and Trimmed MultiQC*
+QC metrics describing the read quality.
+> *NOTE:* the same fields are extracted for both raw and trimmed reads, the prefixes "raw_" and
+"trimmed_" indicate the source of the metric. Similarly, the same fields are also extracted for both
+forward and reverse reads with the suffixes "_f" and "_r" denoting the read type. The table below
+lists each metric only once.
+
+| Column Name | Type | Description | Example(s) |
+|:-----------------------|:------|:------------------------------------------------------------------------------------------------------------------|:------------|
+| total_sequences | int | Total number of sequences for this read type | 82186286 |
+| avg_sequence_length | float | Average sequence length for this read type | 138.7187431 |
+| median_sequence_length | float | Median sequence length for this read type | 151 |
+| quality_score_mean | float | Average quality score | 35.48372691 |
+| quality_score_median | float | Median quality score | 35.61967608 |
+| percent_duplicates | float | Percentage estimated sequence duplication, measured based on first 50bp of the first 100,000 reads in the dataset | 67.01577646 |
+| percent_gc | float | Overall %GC of all bases in all sequences for this read type | 48 |
+| gc_min_1pct | int | Minimum %GC value reached by at least 1% of the total reads for this read type | 31 |
+| gc_max_1pct | int | Maximum %GC value reached by at least 1% of the total reads for this read type | 65 |
+| gc_auc_25pct | int | %GC value at the 25th quartile of total reads for this read type | 42 |
+| gc_auc_50pct | int | %GC value at the 50th quartile of total reads for this read type | 49 |
+| gc_auc_75pct | int | %GC value at the 75th quartile of total reads for this read type | 57 |
+| n_content_sum | float | %N base calls summed across all base positions in the read for this read type | 1.510487284 |
+
+
+
+
+-----------
+### Alignment metrics
+
+#### STAR alignment metrics
+*Source: STAR MultiQC*
+Alignment metrics generated by STAR.
+> *Present only for data processed with the eukaryotic pipeline.*
+
+| Column Name | Type | Description | Example |
+|:----------------------------|:------|:------------------------------------------------------------------------------------|:--------|
+| uniquely_mapped_percent | float | Percent of reads (or fragments) mapped uniquely in the genome | 79.95 |
+| multimapped_percent | float | Percent of reads (or fragments) mapped to multiple loci in the genome | 13.14 |
+| multimapped_toomany_percent | float | Percent of reads (or fragments) mapped to 20 or more loci in the genome | 0.13 |
+| unmapped_tooshort_percent | float | Percent of reads (or fragments) where the best alignment is shorter than the allowed mapped length | 6.53 |
+| unmapped_other_percent | float | Percent of reads (or fragments) that couldn't be mapped at all | 0.25 |
+
+#### Bowtie2 alignment metrics
+*Source: bowtie2 MultiQC*
+Alignment metrics generated by bowtie2.
+> *Present only for data processed with the prokaryotic pipeline.*
+
+| Column Name | Type | Description | Example |
+|:-----------------------|:------|:-----------------------------------------------------------------------------------------------------------------|:---------|
+| total_reads | int | total input reads (or read-pairs for paired-end data) | 15066949 |
+| overall_alignment_rate | float | percentage of input reads that mapped to the genome (includes discordant and partial alignments). | 98.03 |
+| aligned_none | int | number of reads (or read-pairs) that aligned 0 times to the reference genome (concordantly if paired-end) | 516325 |
+| aligned_one | int | number of reads (or read-pairs) that aligned exactly 1 time to the reference genome (concordantly if paired-end) | 11294617 |
+| aligned_multi | int | number of reads (or read-pairs) that aligned > 1 times to the reference genome (concordantly if paired-end) | 3256007 |
+
+
+
+
+-----------
+### Gene count metrics
+
+#### Common gene count metrics
+*Source: RSEM_Unnormalized_Counts_GLbulkRNAseq.csv or FeatureCounts_Unnormalized_Counts_GLbulkRNAseq.csv*
+Summarizes the data in the Unnormalized_Counts_GLbulkRNASeq.csv files.
+> *Present for data processed with either the eukaryotic or prokaryotic pipelines.*
+
+| Column Name | Type | Description | Example |
+|:-----------------------|:------|:--------------------------------------------------|:-----------|
+| gene_total | int | total number of genes detected | 57186 |
+| gene_detected_gt10 | int | number of genes detected with > 10 counts | 19207 |
+| gene_detected_gt10_pct | float | percentage of genes detected with > 10 counts | 33.5868919 |
+
+
+#### RSEM gene count metrics
+*Source: RSEM MultiQC*
+Summarizes the alignment rates generated by RSEM during gene expression estimation.
+> *Present only for data processed with the eukaryotic pipeline.*
+
+| Column Name | Type | Description | Example |
+|:-----------------------|:------|:--------------------------------------------------------|:------------|
+| num_uniquely_aligned | int | number of reads (or fragments) aligned uniquely to a gene | 41152050 |
+| pct_uniquely_aligned | float | percentage of reads (or fragments) aligned uniquely to a gene | 74.48868329 |
+| pct_multi_aligned | float | percentage of reads (or fragments) aligned to multiple genes | 15.27982918 |
+| pct_filtered | float | percentage of reads (or fragments) filtered due to too many alignments | 0 |
+| pct_unalignable | float | percentage of reads (or fragments) unalignable to any gene | 10.23148753 |
+
+#### FeatureCounts gene count metrics
+*Source: FeatureCounts MultiQC*
+Summarizes the feature counts assigned to genome features.
+> *Present only for data processed with the prokaryotic pipeline.*
+
+| Column Name | Type | Description | Example |
+|:--------------------------|:------|:---------------------------------------------------------|:------------|
+| total_count | int | total number of input alignments | 496 |
+| num_assigned | int | number of alignments assigned to a feature | 431 |
+| pct_assigned | float | percentage of alignments assigned to a feature | 86.89516129 |
+| num_unassigned_nofeatures | int | number of alignments that overlap no features | 19 |
+| num_unassigned_ambiguity | int | number of alignments that overlap 2 or more features | 43 |
+| pct_unassigned_nofeatures | float | percentage of alignments that overlap no features | 3.830645161 |
+| pct_unassigned_ambiguity | float | percentage of alignments that overlap 2 or more features | 8.669354839 |
+
+
+
+
+-----------
+### RSeQC metrics
+
+#### Genebody coverage metrics
+*Source: RSeQC Gene Body Coverage MultiQC*
+Summarizes the read coverage over gene bodies. Used to check if the coverage is uniform and if
+there is any 5' or 3' bias.
+
+| Column Name | Type | Description | Example |
+|:--------------------------|:------|:-------------------------------------------------------------------------------------|:------------|
+| mean_genebody_cov_5_20 | float | average read (or fragment) coverage between the 5th and 20th gene body percentile from the 5' end | 70.60419705 |
+| mean_genebody_cov_40_60 | float | average read (or fragment) coverage between the 40th and 60th gene body percentile from the 5' end | 99.38086546 |
+| mean_genebody_cov_80_95 | float | average read (or fragment) coverage between the 80th and 95th gene body percentile from the 5' end | 87.1712315 |
+| ratio_genebody_cov_3_to_5 | float | ratio of the 3' (mean_genebody_cov_80_95) to 5' (mean_genebody_cov_5_20) gene body coverage | 1.234646595 |
+
+#### Infer experiment metrics
+*Source: RSeQC Infer Experiment MultiQC*
+Summarizes the percentage of reads and read pairs that match the strandedness of the overlapping
+transcripts. Can be used to infer whether the RNAseq library prep was stranded or unstranded.
+
+| Column Name | Type | Description | Example |
+|:--------------------------|:------|:-------------------------------------------------------------|:--------|
+| pct_sense | float | percentage of reads (or fragments) aligned to the sense strand | 91.16 |
+| pct_antisense | float | percentage of reads (or fragments) aligned to the antisense strand | 1.76 |
+| pct_undetermined | float | percentage of reads (or fragments) where the strand alignment could not be determined | 7.08 |
+
+#### Inner distance metrics
+*Source: RSeQC Inner Distance MultiQC*
+Summarizes the inner distance (or insert size) between two paired RNA reads. Note that this can be
+negative if the two reads overlap.
+> *Present only for paired-end data.*
+
+| Column Name | Type | Description | Example |
+|:--------------------------|:------|:----------------------------------------------------|:------------|
+| peak_inner_dist | float | The inner distance (distance between the 3' end of the forward mapped read and the 5' end of the reverse mapped read) at the peak of the distribution, a negative value indicates overlapping reads | -123.5 |
+| peak_inner_dist_pct_reads | float | percentage of reads at the peak of the distribution | 5.112384053 |
+
+#### Read distribution metrics
+*Source: RSeQC Read Distribution MultiQC*
+Summarizes the distribution of reads over genome features.
+
+| Column Name | Type | Description | Example. |
+|:--------------------------|:------|:---------------------------------------------------------------------------|:------------|
+| cds_exons_pct | float | percentage of reads (or fragments) aligned to CDS exons | 43.5637117 |
+| 5_utr_exons_pct | float | percentage of reads (or fragments) aligned to 5' UTRs | 8.886329966 |
+| 3_utr_exons_pct | float | percentage of reads (or fragments) aligned to 3' UTRs | 21.41150152 |
+| introns_pct | float | percentage of reads (or fragments) aligned to introns | 20.55254242 |
+| tss_up_1kb_pct | float | percentage of reads (or fragments) aligned 1 kb upstream of a transcription start site | 0.086612962 |
+| tss_up_1kb_5kb_pct | float | percentage of reads (or fragments) aligned 1-5 kb upstream of a transcription start site | 0.389163859 |
+| tss_up_5kb_10kb_pct | float | percentage of reads (or fragments) aligned 5-10 kb upstream of a transcription start site | 0.116168783 |
+| tes_down_1kb_pct | float | percentage of reads (or fragments) aligned 1 kb downstream of a transcription end site | 0.276327172 |
+| tes_down_1kb_5kb_pct | float | percentage of reads (or fragments) aligned 1-5 kb downstream of a transcription end site | 0.419487364 |
+| tes_down_5kb_10kb_pct | float | percentage of reads (or fragments) aligned 5-10 kb downstream of a transcription end site | 0.137565531 |
+| other_intergenic_pct | float | percentage of reads (or fragments) aligned to other intergenic regions | 4.160588723 |
+
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/README.md b/RNAseq/Workflow_Documentation/NF_RCP/README.md
index ba280393..81df0ea5 100644
--- a/RNAseq/Workflow_Documentation/NF_RCP/README.md
+++ b/RNAseq/Workflow_Documentation/NF_RCP/README.md
@@ -128,9 +128,9 @@ All files required for utilizing the NF_RCP GeneLab workflow for processing RNAs
copy of latest NF_RCP version on to your system, the code can be downloaded as a zip file from the release page then unzipped after downloading by running the following commands:
```bash
-wget https://github.com/nasa/GeneLab_Data_Processing/releases/download/NF_RCP_2.0.0/NF_RCP_2.0.0.zip
+wget https://github.com/nasa/GeneLab_Data_Processing/releases/download/NF_RCP_2.0.1/NF_RCP_2.0.1.zip
-unzip NF_RCP_2.0.0.zip
+unzip NF_RCP_2.0.1.zip
```
@@ -142,10 +142,10 @@ unzip NF_RCP_2.0.0.zip
Although Nextflow can fetch Singularity images from a url, doing so may cause issues as detailed [here](https://github.com/nextflow-io/nextflow/issues/1210).
To avoid this issue, run the following command to fetch the Singularity images prior to running the NF_RCP workflow:
-> Note: This command should be run in the location containing the `NF_RCP_2.0.0` directory that was downloaded in [step 2](#2-download-the-workflow-files) above. Depending on your network speed, fetching the images will take ~20 minutes. Approximately 8GB of RAM is needed to download and build the Singularity images.
+> Note: This command should be run in the location containing the `NF_RCP_2.0.1` directory that was downloaded in [step 2](#2-download-the-workflow-files) above. Depending on your network speed, fetching the images will take ~20 minutes. Approximately 8GB of RAM is needed to download and build the Singularity images.
```bash
-bash NF_RCP_2.0.0/bin/prepull_singularity.sh NF_RCP_2.0.0/config/software/by_docker_image.config
+bash NF_RCP_2.0.1/bin/prepull_singularity.sh NF_RCP_2.0.1/config/software/by_docker_image.config
```
@@ -161,7 +161,7 @@ export NXF_SINGULARITY_CACHEDIR=$(pwd)/singularity
### 4. Run the Workflow
-While in the location containing the `NF_RCP_2.0.0` directory that was downloaded in [step 2](#2-download-the-workflow-files), you are now able to run the workflow.
+While in the location containing the `NF_RCP_2.0.1` directory that was downloaded in [step 2](#2-download-the-workflow-files), you are now able to run the workflow.
Both workflows automatically load reference files and organism-specific gene annotation files from the [GeneLab annotations table](https://github.com/nasa/GeneLab_Data_Processing/blob/master/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv). For organisms not listed in the table or to use alternative reference files, additional workflow parameters can be specified.
@@ -175,7 +175,7 @@ Both workflows automatically load reference files and organism-specific gene ann
#### 4a. Approach 1: Run the workflow on a GeneLab RNAseq dataset with automatic retrieval of reference fasta and gtf files
```bash
-nextflow run NF_RCP_2.0.0/main.nf \
+nextflow run NF_RCP_2.0.1/main.nf \
-profile singularity,local \
--accession OSD-194
```
@@ -187,7 +187,7 @@ nextflow run NF_RCP_2.0.0/main.nf \
#### 4b. Approach 2: Run the workflow on a GeneLab RNAseq dataset with custom reference fasta and gtf files
```bash
-nextflow run NF_RCP_2.0.0/main.nf \
+nextflow run NF_RCP_2.0.1/main.nf \
-profile singularity,local \
--accession OSD-194 \
--reference_version 112 \
@@ -205,7 +205,7 @@ nextflow run NF_RCP_2.0.0/main.nf \
#### 4c. Approach 3: Run the workflow on a non-GeneLab dataset using a user-created runsheet with automatic retrieval of reference fasta and gtf files
```bash
-nextflow run NF_RCP_2.0.0/main.nf \
+nextflow run NF_RCP_2.0.1/main.nf \
-profile singularity,local \
--runsheet_path
```
@@ -217,7 +217,7 @@ nextflow run NF_RCP_2.0.0/main.nf \
#### 4d. Approach 4: Run the workflow on a non-GeneLab dataset using a user-created runsheet with custom reference fasta and gtf files
```bash
-nextflow run NF_RCP_2.0.0/main.nf \
+nextflow run NF_RCP_2.0.1/main.nf \
-profile singularity \
--accession OSD-194 \
--reference_version 112 \
@@ -235,7 +235,7 @@ nextflow run NF_RCP_2.0.0/main.nf \
#### Required Parameters For All Approaches:
-* `NF_RCP_2.0.0/main.nf` - Instructs Nextflow to run the NF_RCP workflow
+* `NF_RCP_2.0.1/main.nf` - Instructs Nextflow to run the NF_RCP workflow
* `-profile` - Specifies the configuration profile(s) to load, `singularity` instructs Nextflow to setup and use singularity for all software called in the workflow; use `local` for local execution ([local.config](workflow_code/conf/local.config)) or `slurm` for SLURM cluster execution ([slurm.config](workflow_code/conf/slurm.config))
> Note: The output directory will be named `GLDS-#` when using a OSD or GLDS accession as input, or `results` when running the workflow with only a runsheet as input.
@@ -313,7 +313,7 @@ nextflow run NF_RCP_2.0.0/main.nf \
All parameters listed above and additional optional arguments for the RCP workflow, including debug related options that may not be immediately useful for most users, can be viewed by running the following command:
```bash
-nextflow run NF_RCP_2.0.0/main.nf --help
+nextflow run NF_RCP_2.0.1/main.nf --help
```
See `nextflow run -h` and [Nextflow's CLI run command documentation](https://nextflow.io/docs/latest/cli.html#run) for more options and details common to all nextflow workflows.
@@ -354,6 +354,10 @@ The outputs from the Analysis Staging and V&V Pipeline Subworkflows are describe
- processing_info/nextflow_log_GLbulkRNAseq.txt (Nextflow execution logs captured via `nextflow log`)
- processing_info/nextflow_run_command_GLbulkRNAseq.txt (Exact command line used to initiate the workflow)
+**QC metrics summary**
+
+ - Output:
+ - GeneLab/qc_metrics_GLbulkRNAseq.csv (comma-separated text file containing a summary of qc metrics and metadata for the dataset, see the [QC metrics README](./QC_metrics_README.md) for a complete list of field definitions)
Standard Nextflow resource usage logs are also produced as follows:
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/parse_multiqc.py b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/parse_multiqc.py
index 7703e20a..f56019bf 100755
--- a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/parse_multiqc.py
+++ b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/parse_multiqc.py
@@ -14,7 +14,180 @@
import os
-def main(osd_num, paired_end, assay_suffix, mode):
+def get_runsheet_order(runsheet_path):
+ """Read runsheet and return ordered list of sample names"""
+ if not runsheet_path or not os.path.exists(runsheet_path):
+ return None
+ try:
+ df = pd.read_csv(runsheet_path)
+ if 'Sample Name' in df.columns:
+ return df['Sample Name'].tolist()
+ except Exception as e:
+ print(f"WARNING: Error reading runsheet: {str(e)}")
+ return None
+
+
+def generate_validation_report(fieldnames, populated_fields, mode, assay_suffix, paired_end):
+ """Generate a validation report showing which columns are missing data"""
+
+ # Define field categories
+ metadata_fields = [
+ 'osd_num', 'sample', 'organism', 'tissue', 'sequencing_instrument',
+ 'library_selection', 'library_layout', 'strandedness', 'read_depth',
+ 'read_length', 'rrna_contamination', 'rin', 'organism_part', 'cell_line',
+ 'cell_type', 'secondary_organism', 'strain', 'animal_source', 'seed_source',
+ 'source_accession', 'mix'
+ ]
+
+ gene_count_fields = ['gene_detected_gt10', 'gene_total', 'gene_detected_gt10_pct']
+
+ fastqc_raw_fields = [f for f in fieldnames if f.startswith('raw_')]
+ fastqc_trimmed_fields = [f for f in fieldnames if f.startswith('trimmed_')]
+
+ # For single-end data, exclude _r fields from FastQC sections
+ if not paired_end:
+ fastqc_raw_fields = [f for f in fastqc_raw_fields if not f.endswith('_r')]
+ fastqc_trimmed_fields = [f for f in fastqc_trimmed_fields if not f.endswith('_r')]
+
+ star_fields = [
+ 'uniquely_mapped_percent', 'multimapped_percent', 'multimapped_toomany_percent',
+ 'unmapped_tooshort_percent', 'unmapped_other_percent'
+ ]
+
+ bowtie2_fields = [
+ 'total_reads', 'overall_alignment_rate', 'aligned_none', 'aligned_one', 'aligned_multi'
+ ]
+
+ featurecounts_fields = [
+ 'total_count', 'num_assigned', 'pct_assigned', 'num_unassigned_nofeatures',
+ 'num_unassigned_ambiguity', 'pct_unassigned_nofeatures', 'pct_unassigned_ambiguity'
+ ]
+
+ rseqc_fields = [
+ 'mean_genebody_cov_5_20', 'mean_genebody_cov_40_60', 'mean_genebody_cov_80_95',
+ 'ratio_genebody_cov_3_to_5', 'pct_sense', 'pct_antisense', 'pct_undetermined',
+ 'cds_exons_pct', '5_utr_exons_pct', '3_utr_exons_pct', 'introns_pct',
+ 'tss_up_1kb_pct', 'tss_up_1kb_5kb_pct', 'tss_up_5kb_10kb_pct', 'tes_down_1kb_pct',
+ 'tes_down_1kb_5kb_pct', 'tes_down_5kb_10kb_pct', 'other_intergenic_pct'
+ ]
+
+ # Add inner distance fields only for paired-end data
+ if paired_end:
+ rseqc_fields.extend(['peak_inner_dist', 'peak_inner_dist_pct_reads'])
+
+ rsem_fields = [
+ 'num_uniquely_aligned', 'pct_uniquely_aligned', 'pct_multi_aligned',
+ 'pct_filtered', 'pct_unalignable'
+ ]
+
+ def get_missing_fields(field_list):
+ return [f for f in field_list if f in fieldnames and f not in populated_fields]
+
+ report_filename = f'qc_validation{assay_suffix}.txt'
+ with open(report_filename, 'w') as f:
+ # Header
+ f.write("QC metrics validation report\n\n")
+ mode_display = "Microbes" if mode == 'microbes' else "Default"
+ data_type = "Paired end" if paired_end else "Single end"
+ f.write(f"Mode: {mode_display}\n")
+ f.write(f"Data type: {data_type}\n\n")
+ f.write("-" * 40 + "\n\n")
+
+ # Calculate summary stats
+ missing_metadata_count = len([f for f in metadata_fields if f in fieldnames and f not in populated_fields])
+ total_metadata_count = len([f for f in metadata_fields if f in fieldnames])
+
+ # Count expected MultiQC fields based on mode (excluding metadata)
+ expected_multiqc_fields = gene_count_fields + fastqc_raw_fields + fastqc_trimmed_fields + rseqc_fields
+ if mode == 'microbes':
+ expected_multiqc_fields += bowtie2_fields + featurecounts_fields
+ else:
+ expected_multiqc_fields += star_fields + rsem_fields
+
+ missing_multiqc_count = len([f for f in expected_multiqc_fields if f in fieldnames and f not in populated_fields])
+ total_multiqc_count = len([f for f in expected_multiqc_fields if f in fieldnames])
+
+ # Summary section
+ f.write("Summary:\n")
+ f.write(f"* {missing_metadata_count}/{total_metadata_count} Metadata fields are empty\n")
+ f.write(f"* {missing_multiqc_count}/{total_multiqc_count} expected MultiQC fields are empty\n\n")
+
+ f.write("Categories missing entries:\n\n")
+
+ # Metadata
+ missing_metadata = get_missing_fields(metadata_fields)
+ if missing_metadata:
+ f.write("* Metadata:\n")
+ for field in missing_metadata:
+ f.write(f"** {field}\n")
+ f.write("\n")
+
+ # Gene count
+ missing_gene_count = get_missing_fields(gene_count_fields)
+ if missing_gene_count:
+ f.write("* Gene Count:\n")
+ for field in missing_gene_count:
+ f.write(f"** {field}\n")
+ f.write("\n")
+
+ # FastQC Raw
+ missing_fastqc_raw = get_missing_fields(fastqc_raw_fields)
+ if missing_fastqc_raw:
+ f.write("* FastQC Raw:\n")
+ for field in missing_fastqc_raw:
+ f.write(f"** {field}\n")
+ f.write("\n")
+
+ # FastQC Trimmed
+ missing_fastqc_trimmed = get_missing_fields(fastqc_trimmed_fields)
+ if missing_fastqc_trimmed:
+ f.write("* FastQC Trimmed:\n")
+ for field in missing_fastqc_trimmed:
+ f.write(f"** {field}\n")
+ f.write("\n")
+
+ # Mode-specific sections
+ if mode == 'microbes':
+ # For microbes mode, check Bowtie2 and FeatureCounts (STAR/RSEM expected to be empty)
+ missing_bowtie2 = get_missing_fields(bowtie2_fields)
+ if missing_bowtie2:
+ f.write("* Bowtie2 (Prokaryotes):\n")
+ for field in missing_bowtie2:
+ f.write(f"** {field}\n")
+ f.write("\n")
+
+ missing_featurecounts = get_missing_fields(featurecounts_fields)
+ if missing_featurecounts:
+ f.write("* FeatureCounts (Prokaryotes):\n")
+ for field in missing_featurecounts:
+ f.write(f"** {field}\n")
+ f.write("\n")
+ else:
+ # For eukaryotic mode, check STAR and RSEM (Bowtie2/FeatureCounts expected to be empty)
+ missing_star = get_missing_fields(star_fields)
+ if missing_star:
+ f.write("* STAR (Eukaryotes):\n")
+ for field in missing_star:
+ f.write(f"** {field}\n")
+ f.write("\n")
+
+ missing_rsem = get_missing_fields(rsem_fields)
+ if missing_rsem:
+ f.write("* RSEM (Eukaryotes):\n")
+ for field in missing_rsem:
+ f.write(f"** {field}\n")
+ f.write("\n")
+
+ # RSeQC (relevant for both modes)
+ missing_rseqc = get_missing_fields(rseqc_fields)
+ if missing_rseqc:
+ f.write("* RSeQC:\n")
+ for field in missing_rseqc:
+ f.write(f"** {field}\n")
+ f.write("\n")
+
+
+def main(osd_num, paired_end, assay_suffix, mode, runsheet=None):
osd_num = osd_num.split('-')[1]
@@ -47,6 +220,15 @@ def main(osd_num, paired_end, assay_suffix, mode):
samples = set([s for ss in multiqc_data for s in ss])
+ # Order samples according to runsheet if provided
+ runsheet_order = get_runsheet_order(runsheet)
+ if runsheet_order:
+ ordered_samples = [s for s in runsheet_order if s in samples]
+ extra_samples = [s for s in samples if s not in runsheet_order]
+ samples = ordered_samples + extra_samples
+ else:
+ samples = sorted(samples) # Fallback to alphabetical
+
metadata = get_metadata(osd_num)
fieldnames = [
@@ -80,7 +262,7 @@ def main(osd_num, paired_end, assay_suffix, mode):
'mean_genebody_cov_5_20', 'mean_genebody_cov_40_60', 'mean_genebody_cov_80_95', 'ratio_genebody_cov_3_to_5',
'pct_sense', 'pct_antisense', 'pct_undetermined',
'peak_inner_dist', 'peak_inner_dist_pct_reads',
- 'cds_exons_pct', '5_utr_exons_pct', '3_utr_exons_pct', 'introns_pct', 'tss_up_1kb_pct', 'tss_up_1kb_5kb_pct', 'tss_up_5kb_10kb_pct', 'tes_down_1kb_pct', 'tss_down_1kb_5kb_pct', 'tss_down_5kb_10kb_pct', 'other_intergenic_pct',
+ 'cds_exons_pct', '5_utr_exons_pct', '3_utr_exons_pct', 'introns_pct', 'tss_up_1kb_pct', 'tss_up_1kb_5kb_pct', 'tss_up_5kb_10kb_pct', 'tes_down_1kb_pct', 'tes_down_1kb_5kb_pct', 'tes_down_5kb_10kb_pct', 'other_intergenic_pct',
# RSEM
'num_uniquely_aligned', 'pct_uniquely_aligned', 'pct_multi_aligned', 'pct_filtered', 'pct_unalignable'
@@ -90,6 +272,10 @@ def main(osd_num, paired_end, assay_suffix, mode):
fieldnames_set = set(fieldnames)
output_filename = f'qc_metrics{assay_suffix}.csv'
+
+ # Track which fields have data for validation report
+ populated_fields = set()
+
with open(output_filename, mode='w', newline='') as f:
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
@@ -103,13 +289,30 @@ def main(osd_num, paired_end, assay_suffix, mode):
# Only keep fields that are in the fieldnames list
if k in fieldnames_set:
all_fields[k] = v
+ if v is not None and v != '': # Track populated fields
+ populated_fields.add(k)
else:
# Optionally add debug output to see which fields are being skipped
# print(f"Skipping field not in fieldnames: {k}")
pass
- # Write the row with filtered fields
+ # Track fields that are always populated
+ populated_fields.add('osd_num')
+ populated_fields.add('sample')
+
+ # Track populated metadata fields
+ for k, v in metadata.items():
+ if v is not None and v != '':
+ populated_fields.add(k)
+
+ # Write rows with osd_num and sample fields
writer.writerow({'osd_num': 'OSD-' + osd_num, 'sample': sample, **metadata, **all_fields})
+
+ # Generate validation report
+ try:
+ generate_validation_report(fieldnames, populated_fields, mode, assay_suffix, paired_end)
+ except Exception as e:
+ print(f"WARNING: Failed to generate validation report: {str(e)}")
def get_metadata(osd_num):
@@ -187,9 +390,20 @@ def parse_fastqc(prefix, assay_suffix):
with open(f'{prefix}_multiqc{assay_suffix}_data/multiqc_data.json') as f:
j = json.loads(f.read())
+ # Find FastQC section by looking for FastQC-specific fields
+ fastqc_section = None
+ for section in j['report_general_stats_data']:
+ sample_data = next(iter(section.values()), {})
+ if 'total_sequences' in sample_data and 'percent_gc' in sample_data:
+ fastqc_section = section
+ break
+
+ if not fastqc_section:
+ return {}
+
# Group the samples by base name for paired end data
sample_groups = {}
- for sample in j['report_general_stats_data'][-1].keys():
+ for sample in fastqc_section.keys():
# Handle various naming patterns
if ' Read 1' in sample:
base_name = sample.replace(' Read 1', '')
@@ -225,13 +439,13 @@ def parse_fastqc(prefix, assay_suffix):
# Process forward read
if reads['f']:
- for k, v in j['report_general_stats_data'][-1][reads['f']].items():
+ for k, v in fastqc_section[reads['f']].items():
if k != 'percent_fails':
data[base_name][prefix + '_' + k + '_f'] = v
# Process reverse read
if reads['r']:
- for k, v in j['report_general_stats_data'][-1][reads['r']].items():
+ for k, v in fastqc_section[reads['r']].items():
if k != 'percent_fails':
data[base_name][prefix + '_' + k + '_r'] = v
@@ -439,8 +653,8 @@ def parse_read_dist(assay_suffix):
data[sample]['tss_up_1kb_5kb_pct'] = read_data['tss_up_5kb_tag_pct'] - read_data['tss_up_1kb_tag_pct']
data[sample]['tss_up_5kb_10kb_pct'] = read_data['tss_up_10kb_tag_pct'] - read_data['tss_up_5kb_tag_pct']
- data[sample]['tss_down_1kb_5kb_pct'] = read_data['tes_down_5kb_tag_pct'] - read_data['tes_down_1kb_tag_pct']
- data[sample]['tss_down_5kb_10kb_pct'] = read_data['tes_down_10kb_tag_pct'] - read_data['tes_down_5kb_tag_pct']
+ data[sample]['tes_down_1kb_5kb_pct'] = read_data['tes_down_5kb_tag_pct'] - read_data['tes_down_1kb_tag_pct']
+ data[sample]['tes_down_5kb_10kb_pct'] = read_data['tes_down_10kb_tag_pct'] - read_data['tes_down_5kb_tag_pct']
data = {s.replace('_read_dist', ''):{k.replace('_tag', ''):v for k, v in d.items()} for s, d in data.items()}
@@ -539,6 +753,7 @@ def get_genecount(assay_suffix, mode):
parser.add_argument('--paired', action='store_true')
parser.add_argument('--assay_suffix', default='_GLbulkRNAseq')
parser.add_argument('--mode', default='default')
+ parser.add_argument('--runsheet', default=None)
args = parser.parse_args()
- main(args.osd_num, args.paired, args.assay_suffix, args.mode)
+ main(args.osd_num, args.paired, args.assay_suffix, args.mode, args.runsheet)
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/modules/parse_qc_metrics.nf b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/modules/parse_qc_metrics.nf
index ff3762b5..3e18642b 100644
--- a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/modules/parse_qc_metrics.nf
+++ b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/modules/parse_qc_metrics.nf
@@ -9,6 +9,7 @@ process PARSE_QC_METRICS {
path(isa_zip)
path(all_multiqc_output)
path(rsem_counts)
+ path(runsheet_path)
output:
path("qc_metrics${params.assay_suffix}.csv"), emit: file
@@ -17,6 +18,6 @@ process PARSE_QC_METRICS {
def assay_suffix = params.assay_suffix ? "--assay_suffix ${params.assay_suffix}" : ""
def mode_param = params.mode ? "--mode ${params.mode}" : ""
"""
- parse_multiqc.py --osd-num ${ osd_accession } ${ assay_suffix } ${ meta.paired_end ? '--paired' : '' } ${ mode_param }
+ parse_multiqc.py --osd-num ${ osd_accession } ${ assay_suffix } ${ meta.paired_end ? '--paired' : '' } ${ mode_param } --runsheet ${ runsheet_path }
"""
}
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/nextflow.config b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/nextflow.config
index aafdbf6d..cf13a5aa 100644
--- a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/nextflow.config
+++ b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/nextflow.config
@@ -166,5 +166,5 @@ manifest {
description = 'Nextflow workflow for Documents GL-DPPD-7101-G and GL-DPPD-7115.'
mainScript = 'main.nf'
nextflowVersion = '!>=24.10.5'
- version = '2.0.0'
+ version = '2.0.1'
}
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/workflows/rnaseq.nf b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/workflows/rnaseq.nf
index 01c7e3b8..faf5c7e1 100644
--- a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/workflows/rnaseq.nf
+++ b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/workflows/rnaseq.nf
@@ -281,7 +281,8 @@ workflow RNASEQ {
ch_meta,
isa_archive.ifEmpty(file("ISA.zip")), // Use a placeholder if isa_archive is empty
all_multiqc_output,
- QUANTIFY_RSEM_GENES.out.publishables
+ QUANTIFY_RSEM_GENES.out.publishables,
+ runsheet_path
)
VV_RAW_READS(
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/workflows/rnaseq_microbes.nf b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/workflows/rnaseq_microbes.nf
index 998f3753..29bb10a7 100644
--- a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/workflows/rnaseq_microbes.nf
+++ b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/workflows/rnaseq_microbes.nf
@@ -282,7 +282,8 @@ workflow RNASEQ_MICROBES {
ch_meta,
isa_archive.ifEmpty(file("ISA.zip")), // Use a placeholder if isa_archive is empty
all_multiqc_output,
- DGE_DESEQ2.out.norm_counts | map{ it -> it[1] }
+ DGE_DESEQ2.out.norm_counts | map{ it -> it[1] },
+ runsheet_path
)
VV_RAW_READS(
diff --git a/RNAseq/Workflow_Documentation/README.md b/RNAseq/Workflow_Documentation/README.md
index e6b9a20b..c2d1422e 100644
--- a/RNAseq/Workflow_Documentation/README.md
+++ b/RNAseq/Workflow_Documentation/README.md
@@ -8,8 +8,8 @@ GeneLab has wrapped each step of the pipeline into a workflow with validation an
|Pipeline Version|Current Workflow Version (for respective pipeline version)|Nextflow Version|
|:---------------|:---------------------------------------------------------|:---------------|
-|*[GL-DPPD-7101-G.md](../Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-G.md)|[NF_RCP_2.0.0](NF_RCP)|24.10.5|
-|*[GL-DPPD-7115.md](../Pipeline_GL-DPPD-7115_Versions/GL-DPPD-7115.md)|[NF_RCP_2.0.0](NF_RCP)|24.10.5|
+|*[GL-DPPD-7101-G.md](../Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-G.md)|[NF_RCP_2.0.1](NF_RCP)|24.10.5|
+|*[GL-DPPD-7115.md](../Pipeline_GL-DPPD-7115_Versions/GL-DPPD-7115.md)|[NF_RCP_2.0.1](NF_RCP)|24.10.5|
|[GL-DPPD-7101-F.md](../Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md)|[NF_RCP-F_1.0.4](NF_RCP-F)|22.10.1|
*Current GeneLab Pipeline/Workflow Implementation