Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .cirro/process-form.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@
"type": "boolean",
"value": true
},
"olga_chunk_length": {
"default": 2000000,
"description": "Divide total CDR3 list into chunks of n length for processing by OLGA. Larger length = reduced parallelization",
"title": "olga_chunk_length",
"type": "int"
},
"distance_metric": {
"default": "tcrdist",
"description": "Use default TCRdist3 or Levenshtein distance metric.",
Expand Down
1 change: 1 addition & 0 deletions .cirro/process-input.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"convert_lvl": false,
"sample_lvl": "$.params.dataset.paramJson.sample_lvl",
"compare_lvl": "$.params.dataset.paramJson.compare_lvl",
"olga_chunk_length": "$.params.dataset.paramJson.olga_chunk_length",
"matrix_sparsity": "sparse",
"distance_metric": "$.params.dataset.paramJson.distance_metric",
"kmer_min_depth": "$.params.dataset.paramJson.kmer_min_depth",
Expand Down
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
MIT License

Copyright (c)
Copyright (c) 2026 Karchin Lab, Break Through Cancer

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
33 changes: 21 additions & 12 deletions bin/compare_concatenate.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@

# Import modules
import argparse
import os
import pandas as pd

def main():
Expand All @@ -26,20 +25,31 @@ def main():
samplesheet = pd.read_csv(args.samplesheet, header=0)
dfs = []
for _, row in samplesheet.iterrows():
# Read the TSV file into a dataframe
file_path = str(row['file'])
df = pd.read_csv(file_path, sep="\t", header=0)
df = pd.read_csv(
row['file'],
sep="\t",
usecols=[
'junction_aa',
'v_call',
'j_call',
'duplicate_count',
'productive'
]
)

# Add patient column
df['sample'] = row['sample']
# Retain only productive CDR3 sequences
df = df[
(df['productive']) &
(df['junction_aa'].notna()) &
(df['v_call'].notna()) # also remove rows with a CDR3 sequence but no Vgene called
]

# Select relevant columns
df['sample'] = row['sample']
df = df[['junction_aa', 'v_call', 'j_call', 'duplicate_count', 'sample']]
dfs.append(df)

dfs.append(df)

# Concatenate all the dataframes into one
df_combined = pd.concat(dfs)
df_combined = pd.concat(dfs, ignore_index=True)

# Rename columns as required
df_combined = df_combined.rename(columns={
Expand All @@ -48,9 +58,8 @@ def main():
'j_call': 'TRBJ',
'duplicate_count': 'counts'
})
df_combined = df_combined[df_combined['CDR3b'].notna()]

df_combined.to_csv(f"concatenated_cdr3.txt", sep="\t", index=False, header=True)
df_combined.to_csv(f"concatenated_cdr3.tsv", sep="\t", index=False)

if __name__ == "__main__":
main()
12 changes: 6 additions & 6 deletions bin/sample_calc.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def main():
help='sample name')
parser.add_argument('-c', '--count_table',
metavar='count_table',
type=argparse.FileType('r'),
type=str,
help='counts file in TSV format')

args = parser.parse_args()
Expand All @@ -122,11 +122,11 @@ def main():
# Read in the counts file
counts = pd.read_csv(args.count_table, sep='\t')

calc_gene_family(sample, counts, 'v_call', 'TRBV', 30, f'vdj/v_family_{sample}.csv')
calc_gene_family(sample, counts, 'd_call', 'TRBD', 2, f'vdj/d_family_{sample}.csv')
calc_gene_family(sample, counts, 'j_call', 'TRBJ', 2, f'vdj/j_family_{sample}.csv')
calc_sample_stats(sample, counts, f'stats/sample_stats_{sample}.csv')
calc_gene_family(sample, counts, 'v_call', 'TRBV', 30, f'v_family_{sample}.csv')
calc_gene_family(sample, counts, 'd_call', 'TRBD', 2, f'd_family_{sample}.csv')
calc_gene_family(sample, counts, 'j_call', 'TRBJ', 2, f'j_family_{sample}.csv')

calc_sample_stats(sample, counts, f'sample_stats_{sample}.csv')

if __name__ == "__main__":
main()
78 changes: 78 additions & 0 deletions modules/local/annotate/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
// Concatenate all per-sample input files into a single combined CDR3 table.
// Emits concatenated_cdr3.tsv, produced by the compare_concatenate.py script
// driven by the provided samplesheet.
process ANNOTATE_CONCATENATE {
label 'process_low'

input:
// CSV samplesheet (UTF-8) listing one row per sample, including file paths.
path samplesheet_utf8
// All sample files staged into the work dir so the paths referenced by the
// samplesheet resolve at runtime — presumably; confirm against the caller.
path all_sample_files

output:
path "concatenated_cdr3.tsv", emit: concat_cdr3

script:
"""
# Concatenate input Adaptive files and process metadata
compare_concatenate.py $samplesheet_utf8
"""
}

// Sort the concatenated CDR3 table (body only, header preserved) so that
// downstream deduplication and grouping steps can operate on ordered input.
process ANNOTATE_SORT_CDR3 {
label 'process_medium'

input:
// Tab-separated table emitted by ANNOTATE_CONCATENATE; first line is a header.
path concat_cdr3

output:
path 'concatenated_cdr3_sorted.tsv', emit: concat_cdr3_sorted

script:
"""
# Keep the header line in place; only the data rows are sorted below.
head -n 1 ${concat_cdr3} > concatenated_cdr3_sorted.tsv

# Sort data rows on the first column, breaking ties on fields 2-5.
# LC_ALL=C forces plain byte ordering (faster and locale-independent);
# --parallel matches the allocated CPUs and -S caps the in-memory buffer
# at 50% of RAM before sort spills to temporary files.
tail -n +2 ${concat_cdr3} \
| LC_ALL=C sort \
-t \$'\t' \
-k1,1 -k2,5 \
--parallel=${task.cpus} \
-S 50% \
>> concatenated_cdr3_sorted.tsv
"""
}

// Reduce the concatenated table to unique (column 1, column 2) pairs —
// presumably CDR3 sequence and TRBV call (see the TRBV filter below) —
// uppercased for consistent matching. Also emits a second file restricted
// to rows with a usable TRBV call, for consumption by GIANA.
process ANNOTATE_DEDUPLICATE_CDR3_TRBV {
label 'process_low'

input:
// Tab-separated table with a header line (stripped by tail below).
path concat_cdr3

output:
// All unique uppercased (CDR3, TRBV) pairs, headerless.
path 'unique_cdr3_trbv.tsv', emit: unique_cdr3_trbv
// Subset of the above where column 2 is a non-blank TRBV* call.
path 'unique_cdr3_trbv_with_vcall.tsv', emit: unique_cdr3_trbv_with_vcall

script:
"""
# Drop the header, uppercase the first two columns, and deduplicate.
# LC_ALL=C gives locale-independent byte ordering for sort -u.
tail -n +2 ${concat_cdr3} \
| awk -F'\t' '{print toupper(\$1) "\t" toupper(\$2)}' \
| LC_ALL=C sort -u \
> unique_cdr3_trbv.tsv

# additional file with blank TRBV calls removed for GIANA
awk -F'\t' 'NF>=2 && \$2 ~ /^TRBV/' unique_cdr3_trbv.tsv > unique_cdr3_trbv_with_vcall.tsv
"""
}

// Collapse the unique (CDR3, TRBV) pair list down to unique CDR3 sequences
// alone (column 1), one per line.
process ANNOTATE_DEDUPLICATE_CDR3 {
label 'process_single'

input:
// Headerless two-column TSV from ANNOTATE_DEDUPLICATE_CDR3_TRBV.
path unique_cdr3_trbv

output:
path 'unique_cdr3.txt', emit: unique_cdr3

script:
"""
# Take only the first column and deduplicate; LC_ALL=C for byte ordering.
cut -f1 ${unique_cdr3_trbv} \
| LC_ALL=C sort -u \
> unique_cdr3.txt
"""
}
16 changes: 0 additions & 16 deletions modules/local/compare/compare_concatenate.nf

This file was deleted.

38 changes: 13 additions & 25 deletions modules/local/compare/tcrsharing.nf
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ process TCRSHARING_CALC {
path concat_cdr3

output:
path "cdr3_sharing_pgen.tsv", emit: "shared_cdr3"
path "cdr3_sharing.tsv", emit: "shared_cdr3"
path "sample_mapping.tsv", emit: "sample_mapping"

script:
Expand All @@ -18,6 +18,9 @@ process TCRSHARING_CALC {
# Load data
df = pd.read_csv("${concat_cdr3}", sep="\t")

# Remove rows where pgen = 0
df = df[df['pgen'] != 0]

# Map sample to integer codes
df['sample'] = df['sample'].astype('category')
df['sample_id'] = df['sample'].cat.codes + 1
Expand All @@ -31,9 +34,12 @@ process TCRSHARING_CALC {

# Get unique sample_ids per CDR3b — vectorized
grouped = (
df.groupby('CDR3b')['sample_id']
.unique() # UNIQUE — fast & vectorized
.apply(np.sort) # SORT — vectorized
df.groupby('CDR3b')
.agg(
sample_id=('sample_id', 'unique'),
pgen=('pgen', 'first'),
log10_pgen=('log10_pgen', 'first')
)
.reset_index()
)

Expand All @@ -44,29 +50,12 @@ process TCRSHARING_CALC {
)

# Drop raw list
final_df = grouped[['CDR3b', 'total_samples', 'samples_present']]
final_df = grouped[['CDR3b', 'pgen', 'log10_pgen', 'total_samples', 'samples_present']]
final_df = final_df.sort_values(by="total_samples", ascending=False)

# Export final list
final_df.to_csv("cdr3_sharing.tsv", sep="\t", index=False)
EOF
Comment on lines 52 to 58
Copy link

Copilot AI Jan 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In TCRSHARING_CALC you now load a table that is expected to already contain pgen and log10_pgen columns (via the OLGA merge step), but the output filename was changed from cdr3_sharing_pgen.tsv to cdr3_sharing.tsv here. Please make sure any documentation, downstream consumers, or plotting scripts do not still expect the old _pgen suffix; otherwise they will fail to find the file even though the schema still includes pgen fields.

Copilot uses AI. Check for mistakes.


olga-compute_pgen --humanTRB -i cdr3_sharing.tsv -o pgen_sharing.tsv


python - <<EOF
import pandas as pd

# Load TSVs for shared cdr3s and corresponding pgen values
left_df = pd.read_csv('pgen_sharing.tsv', sep='\t', header=None, usecols=[0, 1], names=['CDR3b', 'pgen'])
right_df = pd.read_csv('cdr3_sharing.tsv', sep='\t')

# Drop rows where pgen == 0 and merge
left_df = left_df[left_df['pgen'] != 0]
merged_df = pd.merge(left_df, right_df, on='CDR3b', how='left')
merged_df.to_csv('cdr3_sharing_pgen.tsv', sep='\t', index=False)
EOF
"""
}

Expand All @@ -86,7 +75,7 @@ process TCRSHARING_HISTOGRAM {
import pandas as pd
import matplotlib.pyplot as plt

merged_df = pd.read_csv('$shared_cdr3', sep='\t')
merged_df = pd.read_csv('${shared_cdr3}', sep='\t')

# Plot histogram
sharing = merged_df['total_samples'].values
Expand Down Expand Up @@ -127,10 +116,9 @@ process TCRSHARING_SCATTERPLOT {
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

merged_df = pd.read_csv('$shared_cdr3', sep='\t')
merged_df = pd.read_csv('${shared_cdr3}', sep='\t')

# Create scatter plot with log-transform pgen
merged_df["log10_pgen"] = np.log10(merged_df["pgen"])
plt.figure(figsize=(8, 6))
plt.grid(True)
plt.scatter(merged_df["log10_pgen"], merged_df["total_samples"], c='blue', alpha=0.7)
Expand Down
Loading
Loading