4 changes: 2 additions & 2 deletions .cirro/preprocess.py
@@ -29,8 +29,8 @@
# 3. Set workflow_level value based on form input
ds.logger.info("Setting workflow_level")

-levels = ['convert', 'sample', 'compare']
-flags = [ds.params['convert_lvl'], ds.params['sample_lvl'], ds.params['compare_lvl']]
+levels = ['convert', 'sample', 'patient', 'compare']
+flags = [ds.params['convert_lvl'], ds.params['sample_lvl'], ds.params['patient_lvl'], ds.params['compare_lvl']]
workflow_level = [lvl for lvl, flag in zip(levels, flags) if flag]

ds.add_param('workflow_level', ','.join(workflow_level))
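
The preprocessing change extends the workflow_level selection from three levels to four. A minimal sketch of the zip-and-filter pattern with invented flag values (not part of the PR):

    levels = ['convert', 'sample', 'patient', 'compare']
    flags = [False, True, True, True]  # hypothetical form input
    workflow_level = [lvl for lvl, flag in zip(levels, flags) if flag]
    print(','.join(workflow_level))  # -> sample,patient,compare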
11 changes: 9 additions & 2 deletions .cirro/process-form.json
@@ -10,6 +10,13 @@
"type": "boolean",
"value": true
},
"patient_lvl": {
"default": true,
"description": "Run clustering and comparison modules on samples grouped by patient of origin",
"title": "Patient",
"type": "boolean",
"value": true
},
"compare_lvl": {
"default": true,
"description": "Comparative TCR repertoire analyses across samples and subjects",
@@ -18,10 +25,10 @@
"value": true
},
"olga_chunk_length": {
"default": 2000000,
"default": 100000,
"description": "Divide total CDR3 list into chunks of n length for processing by OLGA. Larger length = reduced parallelization",
"title": "olga_chunk_length",
"type": "int"
"type": "integer"
},
"distance_metric": {
"default": "tcrdist",
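Note the olga_chunk_length default drops from 2000000 to 100000, trading chunk size for parallelism. A minimal sketch of the chunking arithmetic (hypothetical helper, not code from this PR):

    def chunk_cdr3_list(cdr3_list, chunk_length=100000):
        """Yield successive slices of at most chunk_length sequences."""
        for start in range(0, len(cdr3_list), chunk_length):
            yield cdr3_list[start:start + chunk_length]

    # 1,000,000 CDR3s with the new default of 100000 -> 10 parallel chunks;
    # the old default of 2000000 would have produced a single chunk.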
1 change: 1 addition & 0 deletions .cirro/process-input.json
@@ -2,6 +2,7 @@
"input_format": "airr",
"convert_lvl": false,
"sample_lvl": "$.params.dataset.paramJson.sample_lvl",
"patient_lvl": "$.params.dataset.paramJson.patient_lvl",
"compare_lvl": "$.params.dataset.paramJson.compare_lvl",
"olga_chunk_length": "$.params.dataset.paramJson.olga_chunk_length",
"matrix_sparsity": "sparse",
144 changes: 144 additions & 0 deletions bin/patient_calc.py
@@ -0,0 +1,144 @@
#!/usr/bin/env python3
"""
Optimized TCR repertoire overlap (single-file input, vectorized)
"""

import argparse
import os

import numpy as np
import pandas as pd


# -------------------------
# Morisita–Horn (vectorized)
# -------------------------
def morisita_horn_matrix(M):
    """
    M: (n_samples x n_features) count matrix
    Returns: (n x n) similarity matrix
    """
    X = M.sum(axis=1, keepdims=True)  # (n,1)

    # Avoid division by zero
    X_safe = np.where(X == 0, 1, X)

    lambda_vals = (M ** 2).sum(axis=1, keepdims=True) / (X_safe ** 2)

    # Pairwise dot products
    prod = M @ M.T  # (n x n)

    denom = (lambda_vals + lambda_vals.T) * (X @ X.T)

    # Avoid division by zero
    denom = np.where(denom == 0, 1, denom)

    return (2 * prod) / denom
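
# For reference: morisita_horn_matrix implements the Morisita–Horn index
#   C_MH(x, y) = 2 * sum_i(x_i * y_i) / ((lambda_x + lambda_y) * X * Y)
# with X = sum_i x_i and lambda_x = sum_i(x_i ** 2) / X ** 2 (likewise for y);
# `prod` supplies the pairwise dot products and `denom` the scaled totals.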


# -------------------------
# Main
# -------------------------
if __name__ == "__main__":

parser = argparse.ArgumentParser(
description="Optimized overlap metrics for TCR repertoires"
)
parser.add_argument(
"-i", "--input",
required=True,
help="Concatenated TSV with sample, CDR3b, counts"
)
parser.add_argument(
"-p", "--patient",
required=True,
help="String indicating patient of samples"
)
args = parser.parse_args()

    # -------------------------
    # Load + clean
    # -------------------------
    df = pd.read_csv(
        args.input,
        sep="\t",
        usecols=["sample", "CDR3b", "counts"]
    )

    df = df.dropna(subset=["sample", "CDR3b"])

    df["counts"] = pd.to_numeric(
        df["counts"], errors="coerce"
    ).fillna(0)

    # -------------------------
    # Pre-aggregate (critical optimization)
    # -------------------------
    df = (
        df.groupby(["sample", "CDR3b"], as_index=False)["counts"]
        .sum()
    )

    samples = df["sample"].unique()
    n = len(samples)

    print(f"Loaded {n} samples")

    # -------------------------
    # Build count matrix
    # -------------------------
    pivot = df.pivot(
        index="sample",
        columns="CDR3b",
        values="counts"
    ).fillna(0)

    # Ensure consistent ordering
    pivot = pivot.loc[samples]

    M = pivot.to_numpy(dtype=float)

    # -------------------------
    # Presence/absence matrix
    # -------------------------
    B = (M > 0).astype(np.int32)

    # -------------------------
    # Jaccard (vectorized)
    # -------------------------
    intersection = B @ B.T  # shared junction counts

    row_sums = B.sum(axis=1, keepdims=True)
    union = row_sums + row_sums.T - intersection

    union = np.where(union == 0, 1, union)

    jaccard = intersection / union

    # -------------------------
    # Sørensen (Dice)
    # -------------------------
    denom = row_sums + row_sums.T
    denom = np.where(denom == 0, 1, denom)

    sorensen = (2 * intersection) / denom

    # -------------------------
    # Morisita–Horn (vectorized)
    # -------------------------
    morisita = morisita_horn_matrix(M)

    # -------------------------
    # Fix diagonals
    # -------------------------
    np.fill_diagonal(jaccard, 1.0)
    np.fill_diagonal(sorensen, 1.0)
    np.fill_diagonal(morisita, 1.0)

    # -------------------------
    # Write outputs
    # -------------------------
    patient = args.patient
    os.makedirs(patient, exist_ok=True)  # the per-patient output dir may not exist yet

    pd.DataFrame(jaccard, index=samples, columns=samples).to_csv(f"{patient}/jaccard_mat.csv")
    pd.DataFrame(sorensen, index=samples, columns=samples).to_csv(f"{patient}/sorensen_mat.csv")
    pd.DataFrame(morisita, index=samples, columns=samples).to_csv(f"{patient}/morisita_mat.csv")

    print("Finished writing all matrices")
54 changes: 54 additions & 0 deletions modules/local/annotate/main.nf
@@ -1,3 +1,57 @@
process ANNOTATE_PROCESS {
    tag "${sample_meta.sample}"
    label 'process_low'
    publishDir enabled: false

    input:
    tuple val(sample_meta), path(count_table)

    output:
    tuple val(sample_meta), path("${sample_meta.sample}_cdr3.tsv"), emit: "process"

    script:
    """
    python - <<EOF
    import pandas as pd

    USECOLS = [
        "junction_aa",
        "v_call",
        "j_call",
        "duplicate_count"
    ]

    COLMAP = {
        "junction_aa": "CDR3b",
        "v_call": "TRBV",
        "j_call": "TRBJ",
        "duplicate_count": "counts"
    }

    df = pd.read_csv(
        "${count_table}",
        sep='\t',
        usecols=USECOLS,
        dtype={
            "junction_aa": "string",
            "v_call": "string",
            "j_call": "string",
            "duplicate_count": "int"
        })

    df = (
        df[df.junction_aa.notna()]
        .rename(columns=COLMAP)
        [["CDR3b", "TRBV", "TRBJ", "counts"]]
    )
    df["sample"] = "${sample_meta.sample}"

    df.to_csv("${sample_meta.sample}_cdr3.tsv", sep="\t", index=False)
    EOF
    """
}

process ANNOTATE_CONCATENATE {
    label 'process_low'

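ANNOTATE_PROCESS reduces an AIRR rearrangement table to the four columns the downstream modules expect. A toy illustration of the column mapping (values invented, not from the PR):

    import pandas as pd

    # Hypothetical AIRR-style input rows.
    airr = pd.DataFrame({
        "junction_aa": ["CASSLGQAYEQYF", None],
        "v_call": ["TRBV7-9", "TRBV5-1"],
        "j_call": ["TRBJ2-7", "TRBJ1-2"],
        "duplicate_count": [12, 3],
    })
    out = (
        airr[airr.junction_aa.notna()]
        .rename(columns={"junction_aa": "CDR3b", "v_call": "TRBV",
                         "j_call": "TRBJ", "duplicate_count": "counts"})
        [["CDR3b", "TRBV", "TRBJ", "counts"]]
    )
    # out keeps one row: CASSLGQAYEQYF, TRBV7-9, TRBJ2-7, 12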
20 changes: 11 additions & 9 deletions modules/local/compare/giana.nf
@@ -1,17 +1,18 @@
process GIANA_CALC {
+    tag "${patient}"
    label 'process_medium'

    input:
-    path concat_cdr3
+    tuple val(patient), path(concat_cdr3)
    val threshold
    val threshold_score
    val threshold_vgene

    output:
-    path "VgeneScores.txt"
-    path "giana_RotationEncodingBL62.txt"
-    path "giana_EncodingMatrix.txt"
-    path "giana.log"
+    path "${patient}_VgeneScores.txt"
+    path "${patient}_giana.txt"
+    // path "giana_EncodingMatrix.txt"
+    // path "giana.log"

    script:
    """
@@ -23,8 +24,8 @@ process GIANA_CALC {
        --threshold_score ${threshold_score} \
        --threshold_vgene ${threshold_vgene} \
        --NumberOfThreads ${task.cpus} \
-        --Verbose \
-        > giana.log 2>&1
+        --Verbose
+    # > giana.log 2>&1

    # Insert header after GIANA comments
    insert=\$(head -n 1 "${concat_cdr3}")
@@ -40,8 +41,9 @@ process GIANA_CALC {
    /^##/ { print; next }
    !inserted { print insert; inserted=1 }
    { print }
-    ' giana_RotationEncodingBL62.txt > tmp && mv tmp giana_RotationEncodingBL62.txt
+    ' giana_RotationEncodingBL62.txt > ${patient}_giana.txt

-    mv giana_RotationEncodingBL62.txt_EncodingMatrix.txt giana_EncodingMatrix.txt
+    # mv giana_RotationEncodingBL62.txt_EncodingMatrix.txt giana_EncodingMatrix.txt
+    mv VgeneScores.txt ${patient}_VgeneScores.txt
    """
}
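
The awk block above re-inserts the original column header after GIANA's leading "##" comment lines. The same logic as a short Python sketch (illustrative only, not pipeline code):

    def insert_header(lines, header):
        """Yield leading ## comments, then the header, then everything else."""
        inserted = False
        for line in lines:
            if line.startswith("##"):
                yield line
                continue
            if not inserted:
                yield header
                inserted = True
            yield line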
27 changes: 14 additions & 13 deletions modules/local/compare/gliph2.nf
@@ -1,19 +1,20 @@
process GLIPH2_TURBOGLIPH {
+    tag "${patient}"
-    label 'process_high'
+    label 'process_high_compute'
+    label 'process_high_memory'

    input:
-    path concat_cdr3
+    tuple val(patient), path(concat_cdr3)

    output:
-    path "all_motifs.txt", emit: 'all_motifs'
-    path "clone_network.txt", emit: 'clone_network'
-    path "cluster_member_details.txt", emit: 'cluster_member_details'
-    path "convergence_groups.txt", emit: 'convergence_groups'
-    path "global_similarities.txt", emit: 'global_similarities'
-    path "local_similarities.txt", emit: 'local_similarities'
-    path "parameter.txt", emit: 'gliph2_parameters'
+    path "${patient}/all_motifs.txt", emit: 'all_motifs'
+    path "${patient}/clone_network.txt", emit: 'clone_network'
+    path "${patient}/cluster_member_details.txt", emit: 'cluster_member_details'
+    path "${patient}/convergence_groups.txt", emit: 'convergence_groups'
+    path "${patient}/global_similarities.txt", emit: 'global_similarities'
+    path "${patient}/local_similarities.txt", emit: 'local_similarities'
+    path "${patient}/parameter.txt", emit: 'gliph2_parameters'

    script:
    """
@@ -28,7 +29,7 @@ process GLIPH2_TURBOGLIPH {

    result <- turboGliph::gliph2(
        cdr3_sequences = df,
-        result_folder = "./",
+        result_folder = "./${patient}",
        lcminp = ${params.local_min_pvalue},
        sim_depth = ${params.simulation_depth},
        kmer_mindepth = ${params.kmer_min_depth},
@@ -37,16 +38,16 @@ process GLIPH2_TURBOGLIPH {
        n_cores = ${task.cpus}
    )

-    df3 <- read.csv('cluster_member_details.txt', sep = '\t', stringsAsFactors = FALSE, check.names = FALSE)
+    df3 <- read.csv('${patient}/cluster_member_details.txt', sep = '\t', stringsAsFactors = FALSE, check.names = FALSE)
    df3[,'sample'] <- df3[,'patient']
    df3 <- merge(df3, df[, c("CDR3b", "TRBV", "sample", 'counts')], by = c("CDR3b", "TRBV", "sample", 'counts'), all.x = TRUE)
    df3 <- df3[, c('CDR3b', 'TRBV', 'TRBJ', 'counts', 'sample', 'tag', 'seq_ID', 'ultCDR3b')]
-    write.table(df3, "cluster_member_details.txt", sep = "\t", row.names = FALSE, quote = FALSE)
+    write.table(df3, "${patient}/cluster_member_details.txt", sep = "\t", row.names = FALSE, quote = FALSE)
    EOF

    # Rename local_similarities file to standardize output name
-    input_file="local_similarities_*.txt"
-    cat \$input_file > local_similarities.txt
+    input_file="${patient}/local_similarities_*.txt"
+    cat \$input_file > ${patient}/local_similarities.txt
    """
}
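
With the input now keyed by patient, the upstream channel has to provide one concatenated CDR3 table per patient. A hypothetical sketch of that grouping step (file and column names invented; the real pipeline presumably does this in Nextflow):

    import pandas as pd

    # Hypothetical: split a combined CDR3 table into per-patient inputs.
    df = pd.read_csv("all_samples_cdr3.tsv", sep="\t")
    for patient, sub in df.groupby("patient"):
        sub.to_csv(f"{patient}_concat_cdr3.tsv", sep="\t", index=False)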
