diff --git a/.cirro/preprocess.py b/.cirro/preprocess.py index ede348a..45a4c95 100644 --- a/.cirro/preprocess.py +++ b/.cirro/preprocess.py @@ -29,8 +29,8 @@ # 3. Set workflow_level value based on form input ds.logger.info("Setting workflow_level") -levels = ['convert', 'sample', 'compare'] -flags = [ds.params['convert_lvl'], ds.params['sample_lvl'], ds.params['compare_lvl']] +levels = ['convert', 'sample', 'patient', 'compare'] +flags = [ds.params['convert_lvl'], ds.params['sample_lvl'], ds.params['patient_lvl'], ds.params['compare_lvl']] workflow_level = [lvl for lvl, flag in zip(levels, flags) if flag] ds.add_param('workflow_level', ','.join(workflow_level)) diff --git a/.cirro/process-form.json b/.cirro/process-form.json index f8dfb68..7201afd 100644 --- a/.cirro/process-form.json +++ b/.cirro/process-form.json @@ -10,6 +10,13 @@ "type": "boolean", "value": true }, + "patient_lvl": { + "default": true, + "description": "Run clustering and comparison modules on samples grouped by patient of origin", + "title": "Patient", + "type": "boolean", + "value": true + }, "compare_lvl": { "default": true, "description": "Comparative TCR repertoire analyses across samples and subjects", @@ -18,10 +25,10 @@ "value": true }, "olga_chunk_length": { - "default": 2000000, + "default": 100000, "description": "Divide total CDR3 list into chunks of n length for processing by OLGA. 
Larger length = reduced parallelization", "title": "olga_chunk_length", - "type": "int" + "type": "integer" }, "distance_metric": { "default": "tcrdist", diff --git a/.cirro/process-input.json b/.cirro/process-input.json index 57e325a..df5539b 100644 --- a/.cirro/process-input.json +++ b/.cirro/process-input.json @@ -2,6 +2,7 @@ "input_format": "airr", "convert_lvl": false, "sample_lvl": "$.params.dataset.paramJson.sample_lvl", + "patient_lvl": "$.params.dataset.paramJson.patient_lvl", "compare_lvl": "$.params.dataset.paramJson.compare_lvl", "olga_chunk_length": "$.params.dataset.paramJson.olga_chunk_length", "matrix_sparsity": "sparse", diff --git a/bin/patient_calc.py b/bin/patient_calc.py new file mode 100755 index 0000000..27564f2 --- /dev/null +++ b/bin/patient_calc.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python3 +""" +Optimized TCR repertoire overlap (single-file input, vectorized) +""" + +import argparse +import pandas as pd +import numpy as np + + +# ------------------------- +# Morisita–Horn (vectorized) +# ------------------------- +def morisita_horn_matrix(M): + """ + M: (n_samples x n_features) count matrix + Returns: (n x n) similarity matrix + """ + X = M.sum(axis=1, keepdims=True) # (n,1) + + # Avoid division by zero + X_safe = np.where(X == 0, 1, X) + + lambda_vals = (M ** 2).sum(axis=1, keepdims=True) / (X_safe ** 2) + + # Pairwise dot products + prod = M @ M.T # (n x n) + + denom = (lambda_vals + lambda_vals.T) * (X @ X.T) + + # Avoid division by zero + denom = np.where(denom == 0, 1, denom) + + return (2 * prod) / denom + + +# ------------------------- +# Main +# ------------------------- +if __name__ == "__main__": + + parser = argparse.ArgumentParser( + description="Optimized overlap metrics for TCR repertoires" + ) + parser.add_argument( + "-i", "--input", + required=True, + help="Concatenated TSV with sample, CDR3b, counts" + ) + parser.add_argument( + "-p", "--patient", + required=True, + help="String indicating patient of samples" + ) + args = 
parser.parse_args() + + # ------------------------- + # Load + clean + # ------------------------- + df = pd.read_csv( + args.input, + sep="\t", + usecols=["sample", "CDR3b", "counts"] + ) + + df = df.dropna(subset=["sample", "CDR3b"]) + + df["counts"] = pd.to_numeric( + df["counts"], errors="coerce" + ).fillna(0) + + # ------------------------- + # Pre-aggregate (critical optimization) + # ------------------------- + df = ( + df.groupby(["sample", "CDR3b"], as_index=False)["counts"] + .sum() + ) + + samples = df["sample"].unique() + n = len(samples) + + print(f"Loaded {n} samples") + + # ------------------------- + # Build count matrix + # ------------------------- + pivot = df.pivot( + index="sample", + columns="CDR3b", + values="counts" + ).fillna(0) + + # Ensure consistent ordering + pivot = pivot.loc[samples] + + M = pivot.to_numpy(dtype=float) + + # ------------------------- + # Presence/absence matrix + # ------------------------- + B = (M > 0).astype(np.int32) + + # ------------------------- + # Jaccard (vectorized) + # ------------------------- + intersection = B @ B.T # shared junction counts + + row_sums = B.sum(axis=1, keepdims=True) + union = row_sums + row_sums.T - intersection + + union = np.where(union == 0, 1, union) + + jaccard = intersection / union + + # ------------------------- + # Sørensen (Dice) + # ------------------------- + denom = row_sums + row_sums.T + denom = np.where(denom == 0, 1, denom) + + sorensen = (2 * intersection) / denom + + # ------------------------- + # Morisita–Horn (vectorized) + # ------------------------- + morisita = morisita_horn_matrix(M) + + # ------------------------- + # Fix diagonals + # ------------------------- + np.fill_diagonal(jaccard, 1.0) + np.fill_diagonal(sorensen, 1.0) + np.fill_diagonal(morisita, 1.0) + + # ------------------------- + # Write outputs + # ------------------------- + patient = args.patient + pd.DataFrame(jaccard, index=samples, columns=samples).to_csv(f"{patient}/jaccard_mat.csv") + 
pd.DataFrame(sorensen, index=samples, columns=samples).to_csv(f"{patient}/sorensen_mat.csv") + pd.DataFrame(morisita, index=samples, columns=samples).to_csv(f"{patient}/morisita_mat.csv") + + print("Finished writing all matrices") \ No newline at end of file diff --git a/modules/local/annotate/main.nf b/modules/local/annotate/main.nf index 2c65eb8..086fdbd 100644 --- a/modules/local/annotate/main.nf +++ b/modules/local/annotate/main.nf @@ -1,3 +1,57 @@ +process ANNOTATE_PROCESS { + tag "${sample_meta.sample}" + label 'process_low' + publishDir enabled: false + + input: + tuple val(sample_meta), path(count_table) + + output: + tuple val(sample_meta), path("${sample_meta.sample}_cdr3.tsv"), emit: "process" + + script: + """ + python - < giana.log 2>&1 + --Verbose + # > giana.log 2>&1 # Insert header after GIANA comments insert=\$(head -n 1 "${concat_cdr3}") @@ -40,8 +41,9 @@ process GIANA_CALC { /^##/ { print; next } !inserted { print insert; inserted=1 } { print } - ' giana_RotationEncodingBL62.txt > tmp && mv tmp giana_RotationEncodingBL62.txt + ' giana_RotationEncodingBL62.txt > ${patient}_giana.txt - mv giana_RotationEncodingBL62.txt_EncodingMatrix.txt giana_EncodingMatrix.txt + # mv giana_RotationEncodingBL62.txt_EncodingMatrix.txt giana_EncodingMatrix.txt + mv VgeneScores.txt ${patient}_VgeneScores.txt """ } diff --git a/modules/local/compare/gliph2.nf b/modules/local/compare/gliph2.nf index 68dd080..d1a4916 100644 --- a/modules/local/compare/gliph2.nf +++ b/modules/local/compare/gliph2.nf @@ -1,19 +1,20 @@ process GLIPH2_TURBOGLIPH { + tag "${patient}" label 'process_high' label 'process_high_compute' label 'process_high_memory' input: - path concat_cdr3 + tuple val(patient), path(concat_cdr3) output: - path "all_motifs.txt", emit: 'all_motifs' - path "clone_network.txt", emit: 'clone_network' - path "cluster_member_details.txt", emit: 'cluster_member_details' - path "convergence_groups.txt", emit: 'convergence_groups' - path "global_similarities.txt", emit: 
'global_similarities' - path "local_similarities.txt", emit: 'local_similarities' - path "parameter.txt", emit: 'gliph2_parameters' + path "${patient}/all_motifs.txt", emit: 'all_motifs' + path "${patient}/clone_network.txt", emit: 'clone_network' + path "${patient}/cluster_member_details.txt", emit: 'cluster_member_details' + path "${patient}/convergence_groups.txt", emit: 'convergence_groups' + path "${patient}/global_similarities.txt", emit: 'global_similarities' + path "${patient}/local_similarities.txt", emit: 'local_similarities' + path "${patient}/parameter.txt", emit: 'gliph2_parameters' script: """ @@ -28,7 +29,7 @@ process GLIPH2_TURBOGLIPH { result <- turboGliph::gliph2( cdr3_sequences = df, - result_folder = "./", + result_folder = "./${patient}", lcminp = ${params.local_min_pvalue}, sim_depth = ${params.simulation_depth}, kmer_mindepth = ${params.kmer_min_depth}, @@ -37,16 +38,16 @@ process GLIPH2_TURBOGLIPH { n_cores = ${task.cpus} ) - df3 <- read.csv('cluster_member_details.txt', sep = '\t', stringsAsFactors = FALSE, check.names = FALSE) + df3 <- read.csv('${patient}/cluster_member_details.txt', sep = '\t', stringsAsFactors = FALSE, check.names = FALSE) df3[,'sample'] <- df3[,'patient'] df3 <- merge(df3, df[, c("CDR3b", "TRBV", "sample", 'counts')], by = c("CDR3b", "TRBV", "sample", 'counts'), all.x = TRUE) df3 <- df3[, c('CDR3b', 'TRBV', 'TRBJ', 'counts', 'sample', 'tag', 'seq_ID', 'ultCDR3b')] - write.table(df3, "cluster_member_details.txt", sep = "\t", row.names = FALSE, quote = FALSE) + write.table(df3, "${patient}/cluster_member_details.txt", sep = "\t", row.names = FALSE, quote = FALSE) EOF # Rename local_similarities file to standardize output name - input_file="local_similarities_*.txt" - cat \$input_file > local_similarities.txt + input_file="${patient}/local_similarities_*.txt" + cat \$input_file > ${patient}/local_similarities.txt """ } diff --git a/modules/local/olga/main.nf b/modules/local/olga/main.nf index 93e608c..defe2d9 100644 --- 
a/modules/local/olga/main.nf +++ b/modules/local/olga/main.nf @@ -10,7 +10,21 @@ process OLGA_CALCULATE{ script: """ - olga-compute_pgen --humanTRB -i ${cdr3_chunk} -o "pgen_${cdr3_chunk}" + olga-compute_pgen --humanTRB -i ${cdr3_chunk} -o olga_pgen.tsv + cat olga_pgen.tsv \ + | awk -F'\t' -v OFS='\t' ' + BEGIN { log10_base = log(10) } + { + cdr3 = \$1 + pgen = \$2 + + if (pgen == "NA" || pgen == "" || pgen == 0) { + print cdr3, pgen, "NA" + } else { + print cdr3, pgen, log(pgen)/log10_base + } + } + ' > "pgen_${cdr3_chunk}" """ } @@ -22,23 +36,39 @@ process OLGA_CONCATENATE { output: path "olga_pgen.tsv", emit: cdr3_pgen + path "olga_pgen.stats.tsv", emit: cdr3_pgen_stats script: """ - printf "CDR3b\tpgen\tlog10_pgen\n" > olga_pgen.tsv - cat ${pgen_chunks} \ - | awk -F'\t' -v OFS='\t' ' - { + awk -F'\t' ' + BEGIN { + max_len = 0 + min_log = "" + max_log = "" + } + + { cdr3 = \$1 - pgen = \$2 - if (pgen == "NA" || pgen == "" || pgen == 0) { - print cdr3, pgen, "NA" - } else { - print cdr3, pgen, log(pgen)/log(10) - } + log10 = \$3 + + len = length(cdr3) + if (len > max_len) max_len = len + + if (log10 != "NA" && log10 != "") { + if (min_log == "" || log10 < min_log) min_log = log10 + if (max_log == "" || log10 > max_log) max_log = log10 } - ' \ - >> olga_pgen.tsv + } + + END { + printf "max_cdr3_length\t%d\\n", max_len > "olga_pgen.stats.tsv" + printf "min_log10_pgen\t%s\\n", min_log > "olga_pgen.stats.tsv" + printf "max_log10_pgen\t%s\\n", max_log > "olga_pgen.stats.tsv" + } + ' ${pgen_chunks} + + printf "CDR3b\tpgen\tlog10_pgen\n" > olga_pgen.tsv + cat ${pgen_chunks} >> olga_pgen.tsv """ } @@ -76,45 +106,126 @@ process OLGA_SAMPLE_MERGE { input: tuple val(sample_meta), path(count_table) path cdr3_pgen + val olga_stats output: tuple val(sample_meta), path("${sample_meta.sample}_tcr_pgen.tsv"), emit: olga_pgen - path "olga_xmin_value.txt", emit: olga_xmin - path "olga_xmax_value.txt", emit: olga_xmax script: """ # Extract vector of cdr3 aa, dropping null values 
python - < olga_xmin_value.txt - echo ${olga_global_xmax} > olga_xmax_value.txt - echo ${olga_global_ymax} > olga_ymax_value.txt - """ } \ No newline at end of file diff --git a/modules/local/patient/main.nf b/modules/local/patient/main.nf new file mode 100644 index 0000000..c5b9eb7 --- /dev/null +++ b/modules/local/patient/main.nf @@ -0,0 +1,37 @@ +process PATIENT_CONCATENATE { + tag "${patient}" + label "process_single" + + input: + tuple val(patient), path(files) + + output: + tuple val(patient), path("${patient}_cdr3.tsv"), emit: patient_cdr3 + + script: + """ + head -n 1 ${files[0]} > ${patient}_cdr3.tsv + tail -n +2 -q ${files.join(' ')} >> ${patient}_cdr3.tsv + """ +} + +process PATIENT_CALC { + tag "${patient}" + label 'process_single' + + input: + tuple val(patient), path(concat_cdr3) + + output: + path "${patient}/jaccard_mat.csv", emit: jaccard_mat + path "${patient}/sorensen_mat.csv", emit: sorensen_mat + path "${patient}/morisita_mat.csv", emit: morisita_mat + + script: + """ + mkdir -p "${patient}" + + patient_calc.py \ + -i ${concat_cdr3} -p "${patient}" \ + """ +} diff --git a/modules/local/sample/sample_calc.nf b/modules/local/sample/sample_calc.nf index 633ed7e..a971d56 100644 --- a/modules/local/sample/sample_calc.nf +++ b/modules/local/sample/sample_calc.nf @@ -17,12 +17,4 @@ process SAMPLE_CALC { """ sample_calc.py -s '${sample_meta.sample}' -c ${count_table} """ - - stub: - """ - touch sample_stats.csv - touch v_family.csv - touch d_family.csv - touch j_family.csv - """ } diff --git a/nextflow.config b/nextflow.config index d777310..32f2048 100644 --- a/nextflow.config +++ b/nextflow.config @@ -41,7 +41,7 @@ params { vgene_x_cols = 'origin,timepoint' // OLGA parameters - olga_chunk_length = 2000000 // larger chunk size = less parallelization + olga_chunk_length = 100000 // larger chunk size = less parallelization // GIANA parameters threshold = 7.0 diff --git a/nextflow_schema.json b/nextflow_schema.json index e81a2e5..907fa17 100644 --- 
a/nextflow_schema.json +++ b/nextflow_schema.json @@ -69,8 +69,8 @@ "workflow_level": { "type": "string", "default": "sample,compare", - "enum": ["sample,compare", "sample", "compare", "convert"], - "description": "Comma-separated workflow stages (sample, compare)." + "description": "Comma-separated workflow stages.", + "pattern": "^(sample|patient|compare|convert)(,(sample|patient|compare|convert))*$" }, "project_name": { "type": "string", diff --git a/subworkflows/local/annotate.nf b/subworkflows/local/annotate.nf index fe0a9d2..c61beea 100644 --- a/subworkflows/local/annotate.nf +++ b/subworkflows/local/annotate.nf @@ -5,8 +5,8 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { ANNOTATE_CONCATENATE; ANNOTATE_SORT_CDR3; ANNOTATE_DEDUPLICATE_CDR3_TRBV; ANNOTATE_DEDUPLICATE_CDR3 } from '../../modules/local/annotate' -include { OLGA_CONCATENATE; OLGA_CALCULATE } from '../../modules/local/olga' +include { ANNOTATE_PROCESS; ANNOTATE_SORT_CDR3; ANNOTATE_DEDUPLICATE_CDR3_TRBV; ANNOTATE_DEDUPLICATE_CDR3 } from '../../modules/local/annotate' +include { OLGA_CONCATENATE as ANNOTATE_OLGA_CONCATENATE; OLGA_CALCULATE as ANNOTATE_OLGA_CALCULATE} from '../../modules/local/olga' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -16,14 +16,19 @@ include { OLGA_CONCATENATE; OLGA_CALCULATE } from '../../modules/local/olga' workflow ANNOTATE { take: - samplesheet_resolved - all_sample_files + sample_map main: - ANNOTATE_CONCATENATE( samplesheet_resolved, - all_sample_files ) + ANNOTATE_PROCESS( sample_map ) + + processed_samples = ANNOTATE_PROCESS.out.process + + processed_samples + .map { _meta, file -> file } + .collectFile(name: 'concat_cdr3.tsv', keepHeader: true, skip: 1) + .set { concat_cdr3 } - ANNOTATE_SORT_CDR3( ANNOTATE_CONCATENATE.out.concat_cdr3 ) + ANNOTATE_SORT_CDR3( concat_cdr3 ) concat_cdr3_sorted = ANNOTATE_SORT_CDR3.out.concat_cdr3_sorted 
ANNOTATE_DEDUPLICATE_CDR3_TRBV( concat_cdr3_sorted ) @@ -32,13 +37,13 @@ workflow ANNOTATE { ANNOTATE_DEDUPLICATE_CDR3_TRBV.out.unique_cdr3_trbv ) - OLGA_CALCULATE( + ANNOTATE_OLGA_CALCULATE( ANNOTATE_DEDUPLICATE_CDR3.out.unique_cdr3 .splitText(by: params.olga_chunk_length, file: true) ) - OLGA_CONCATENATE ( - OLGA_CALCULATE.out.pgen_chunk + ANNOTATE_OLGA_CONCATENATE ( + ANNOTATE_OLGA_CALCULATE.out.pgen_chunk .collectFile( name: 'olga_pgen_body.tsv', sort: { f -> @@ -47,9 +52,18 @@ workflow ANNOTATE { } ) ) - cdr3_pgen = OLGA_CONCATENATE.out.cdr3_pgen + cdr3_pgen = ANNOTATE_OLGA_CONCATENATE.out.cdr3_pgen + olga_stats = ANNOTATE_OLGA_CONCATENATE.out.cdr3_pgen_stats + .map { f -> + def _m = f.readLines() + .collect{ stats -> stats.split('\t') } + .collectEntries{ stats -> [(stats[0]): stats[1]] } + } + .first() emit: + processed_samples concat_cdr3_sorted cdr3_pgen + olga_stats } \ No newline at end of file diff --git a/subworkflows/local/compare.nf b/subworkflows/local/compare.nf index d647098..9e2453e 100644 --- a/subworkflows/local/compare.nf +++ b/subworkflows/local/compare.nf @@ -5,12 +5,8 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { COMPARE_CALC } from '../../modules/local/compare/compare_calc' -include { COMPARE_PLOT } from '../../modules/local/compare/compare_plot' include { TCRSHARING_CALC; TCRSHARING_HISTOGRAM; TCRSHARING_SCATTERPLOT } from '../../modules/local/compare/tcrsharing' include { OLGA_MERGE as TCRSHARING_OLGA_MERGE } from '../../modules/local/olga' -include { GLIPH2_TURBOGLIPH; GLIPH2_PLOT } from '../../modules/local/compare/gliph2' -include { GIANA_CALC } from '../../modules/local/compare/giana' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -20,37 +16,10 @@ include { GIANA_CALC } from '../../modules/local/compare/giana' workflow COMPARE { take: - samplesheet_resolved - all_sample_files concat_cdr3_sorted cdr3_pgen main: - COMPARE_CALC( 
samplesheet_resolved, - all_sample_files ) - - COMPARE_PLOT( samplesheet_resolved, - COMPARE_CALC.out.jaccard_mat, - COMPARE_CALC.out.sorensen_mat, - COMPARE_CALC.out.morisita_mat, - file(params.compare_stats_template), - params.project_name, - all_sample_files - ) - - GIANA_CALC( - concat_cdr3_sorted, - params.threshold, - params.threshold_score, - params.threshold_vgene - ) - - if(params.use_gliph2) { - GLIPH2_TURBOGLIPH( - concat_cdr3_sorted - ) - } - TCRSHARING_OLGA_MERGE (concat_cdr3_sorted, cdr3_pgen) TCRSHARING_CALC( diff --git a/subworkflows/local/patient.nf b/subworkflows/local/patient.nf new file mode 100644 index 0000000..d8a55d3 --- /dev/null +++ b/subworkflows/local/patient.nf @@ -0,0 +1,54 @@ + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT LOCAL MODULES +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +include { PATIENT_CONCATENATE; PATIENT_CALC } from '../../modules/local/patient' +include { GLIPH2_TURBOGLIPH } from '../../modules/local/compare/gliph2' +include { GIANA_CALC } from '../../modules/local/compare/giana' + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + RUN MAIN WORKFLOW +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +workflow PATIENT { + take: + processed_samples + + main: + processed_samples + .map { meta, file -> [ meta.patient, file ] } + .groupTuple() + .set { patient_groups } + + PATIENT_CONCATENATE ( patient_groups ) + + PATIENT_CALC( PATIENT_CONCATENATE.out.patient_cdr3 ) + + // TODO: disabling plotting until notebook updated + // COMPARE_PLOT( samplesheet_resolved, + // COMPARE_CALC.out.jaccard_mat, + // COMPARE_CALC.out.sorensen_mat, + // COMPARE_CALC.out.morisita_mat, + // file(params.compare_stats_template), + // params.project_name, + // all_sample_files + // ) + + GIANA_CALC( + PATIENT_CONCATENATE.out.patient_cdr3, + 
params.threshold, + params.threshold_score, + params.threshold_vgene + ) + + if(params.use_gliph2) { + GLIPH2_TURBOGLIPH( + PATIENT_CONCATENATE.out.patient_cdr3 + ) + } +} \ No newline at end of file diff --git a/subworkflows/local/sample.nf b/subworkflows/local/sample.nf index d62b147..1454d1f 100644 --- a/subworkflows/local/sample.nf +++ b/subworkflows/local/sample.nf @@ -12,7 +12,7 @@ include { SAMPLE_AGGREGATE as SAMPLE_AGG_V } from '../../modules/local/sample/sa include { SAMPLE_AGGREGATE as SAMPLE_AGG_D } from '../../modules/local/sample/sample_aggregate' include { SAMPLE_AGGREGATE as SAMPLE_AGG_J } from '../../modules/local/sample/sample_aggregate' include { TCRDIST3_MATRIX; TCRDIST3_HISTOGRAM_CALC; TCRDIST3_HISTOGRAM_PLOT} from '../../modules/local/sample/tcrdist3' -include { OLGA_SAMPLE_MERGE; OLGA_HISTOGRAM_CALC; OLGA_HISTOGRAM_PLOT; OLGA_WRITE_MAX } from '../../modules/local/olga' +include { OLGA_SAMPLE_MERGE; OLGA_HISTOGRAM_CALC; OLGA_HISTOGRAM_PLOT } from '../../modules/local/olga' include { CONVERGENCE } from '../../modules/local/sample/convergence' include { TCRPHENO } from '../../modules/local/sample/tcrpheno' include { VDJDB_GET; VDJDB_VDJMATCH } from '../../modules/local/sample/tcrspecificity' @@ -28,6 +28,7 @@ workflow SAMPLE { take: sample_map cdr3_pgen + olga_stats main: @@ -87,25 +88,11 @@ workflow SAMPLE { global_y_max_value ) - cdr3_pgen_file = cdr3_pgen.first() OLGA_SAMPLE_MERGE ( sample_map, - cdr3_pgen_file ) + cdr3_pgen.first(), + olga_stats ) - OLGA_SAMPLE_MERGE.out.olga_xmin - .map { xmin -> xmin.text.trim().toDouble() } - .collect() - .map { values -> values.min() } - .set { olga_x_min_value } - olga_x_min_value.view { olga_xmin -> "Olga x min matrix value: $olga_xmin" } - - OLGA_SAMPLE_MERGE.out.olga_xmax - .map { xmax -> xmax.text.trim().toDouble() } - .collect() - .map { values -> values.max() } - .set { olga_x_max_value } - olga_x_max_value.view { olga_xmax -> "Olga x max matrix value: $olga_xmax" } - - OLGA_HISTOGRAM_CALC ( 
OLGA_SAMPLE_MERGE.out.olga_pgen, olga_x_min_value, olga_x_max_value ) + OLGA_HISTOGRAM_CALC ( OLGA_SAMPLE_MERGE.out.olga_pgen, olga_stats ) OLGA_HISTOGRAM_CALC.out.olga_ymax .map { ymax -> ymax.text.trim().toDouble() } @@ -116,12 +103,6 @@ workflow SAMPLE { OLGA_HISTOGRAM_PLOT( OLGA_HISTOGRAM_CALC.out.olga_histogram, olga_y_max_value ) - OLGA_WRITE_MAX( - olga_x_min_value, - olga_x_max_value, - olga_y_max_value - ) - CONVERGENCE ( sample_map ) TCRPHENO ( sample_map ) diff --git a/workflows/tcrtoolkit.nf b/workflows/tcrtoolkit.nf index f7bc758..9e3ac04 100644 --- a/workflows/tcrtoolkit.nf +++ b/workflows/tcrtoolkit.nf @@ -13,6 +13,7 @@ include { INPUT_CHECK } from '../subworkflows/local/input_check' include { CONVERT } from '../subworkflows/local/convert' include { RESOLVE_SAMPLESHEET } from '../subworkflows/local/resolve_samplesheet' include { SAMPLE } from '../subworkflows/local/sample' +include { PATIENT } from '../subworkflows/local/patient' include { COMPARE } from '../subworkflows/local/compare' include { VALIDATE_PARAMS } from '../subworkflows/local/validate_params' include { ANNOTATE } from '../subworkflows/local/annotate' @@ -43,6 +44,16 @@ workflow TCRTOOLKIT { } } + if (levels.contains('patient')) { + def samplesheeet_header = file(params.samplesheet).readLines().first().split(',') + def has_patient = samplesheeet_header.contains('patient') + + if (!has_patient) { + println("\u001B[33m[WARN]\u001B[0m Patient workflow was specified but metadata was not found in samplesheet; please specify patient IDs for samples using the 'patient' column.") + return + } + } + // Checking input tables INPUT_CHECK( file(params.samplesheet) ) ch_samplesheet_utf8 = INPUT_CHECK.out.samplesheet_utf8 @@ -80,26 +91,33 @@ workflow TCRTOOLKIT { // --- Main Analysis --- - RESOLVE_SAMPLESHEET( ch_samplesheet_utf8, - sample_map_final ) + // RESOLVE_SAMPLESHEET( ch_samplesheet_utf8, + // sample_map_final ) - ANNOTATE( RESOLVE_SAMPLESHEET.out.samplesheet_resolved, - 
RESOLVE_SAMPLESHEET.out.all_sample_files ) +    if (levels.intersect(['sample','patient','compare'])) { +        ANNOTATE( sample_map_final ) +    }       // Running sample level analysis     if (levels.contains('sample')) {         SAMPLE( sample_map_final, -                ANNOTATE.out.cdr3_pgen +                ANNOTATE.out.cdr3_pgen, +                ANNOTATE.out.olga_stats               ) }   +    // Running patient analysis +    if (levels.contains('patient')) { +        PATIENT( ANNOTATE.out.processed_samples ) +    } +     // Running comparison analysis     if (levels.contains('compare')) { -        COMPARE( RESOLVE_SAMPLESHEET.out.samplesheet_resolved, -                 RESOLVE_SAMPLESHEET.out.all_sample_files, +        COMPARE( ANNOTATE.out.concat_cdr3_sorted, -                ANNOTATE.out.cdr3_pgen) +                ANNOTATE.out.cdr3_pgen +                ) } }