Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .cirro/process-form.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@
"type": "boolean",
"value": true
},
"olga_chunk_length": {
"default": 2000000,
"description": "Divide total CDR3 list into chunks of n length for processing by OLGA. Larger length = reduced parallelization",
"title": "olga_chunk_length",
"type": "int"
},
"distance_metric": {
"default": "tcrdist",
"description": "Use default TCRdist3 or Levenshtein distance metric.",
Expand Down
1 change: 1 addition & 0 deletions .cirro/process-input.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"convert_lvl": false,
"sample_lvl": "$.params.dataset.paramJson.sample_lvl",
"compare_lvl": "$.params.dataset.paramJson.compare_lvl",
"olga_chunk_length": "$.params.dataset.paramJson.olga_chunk_length",
"matrix_sparsity": "sparse",
"distance_metric": "$.params.dataset.paramJson.distance_metric",
"kmer_min_depth": "$.params.dataset.paramJson.kmer_min_depth",
Expand Down
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
MIT License

Copyright (c)
Copyright (c) 2026 Karchin Lab, Break Through Cancer

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
33 changes: 21 additions & 12 deletions bin/compare_concatenate.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@

# Import modules
import argparse
import os
import pandas as pd

def main():
Expand All @@ -26,20 +25,31 @@ def main():
samplesheet = pd.read_csv(args.samplesheet, header=0)
dfs = []
for _, row in samplesheet.iterrows():
# Read the TSV file into a dataframe
file_path = str(row['file'])
df = pd.read_csv(file_path, sep="\t", header=0)
df = pd.read_csv(
row['file'],
sep="\t",
usecols=[
'junction_aa',
'v_call',
'j_call',
'duplicate_count',
'productive'
]
)

# Add patient column
df['sample'] = row['sample']
# Retain only productive CDR3 sequences
df = df[
(df['productive']) &
(df['junction_aa'].notna()) &
(df['v_call'].notna()) # also remove rows with a CDR3 sequence but no Vgene called
]

# Select relevant columns
df['sample'] = row['sample']
df = df[['junction_aa', 'v_call', 'j_call', 'duplicate_count', 'sample']]
dfs.append(df)

dfs.append(df)

# Concatenate all the dataframes into one
df_combined = pd.concat(dfs)
df_combined = pd.concat(dfs, ignore_index=True)

# Rename columns as required
df_combined = df_combined.rename(columns={
Expand All @@ -48,9 +58,8 @@ def main():
'j_call': 'TRBJ',
'duplicate_count': 'counts'
})
df_combined = df_combined[df_combined['CDR3b'].notna()]

df_combined.to_csv(f"concatenated_cdr3.txt", sep="\t", index=False, header=True)
df_combined.to_csv(f"concatenated_cdr3.tsv", sep="\t", index=False)

if __name__ == "__main__":
main()
12 changes: 6 additions & 6 deletions bin/sample_calc.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def main():
help='sample name')
parser.add_argument('-c', '--count_table',
metavar='count_table',
type=argparse.FileType('r'),
type=str,
help='counts file in TSV format')

args = parser.parse_args()
Expand All @@ -122,11 +122,11 @@ def main():
# Read in the counts file
counts = pd.read_csv(args.count_table, sep='\t')

calc_gene_family(sample, counts, 'v_call', 'TRBV', 30, f'vdj/v_family_{sample}.csv')
calc_gene_family(sample, counts, 'd_call', 'TRBD', 2, f'vdj/d_family_{sample}.csv')
calc_gene_family(sample, counts, 'j_call', 'TRBJ', 2, f'vdj/j_family_{sample}.csv')
calc_sample_stats(sample, counts, f'stats/sample_stats_{sample}.csv')
calc_gene_family(sample, counts, 'v_call', 'TRBV', 30, f'v_family_{sample}.csv')
calc_gene_family(sample, counts, 'd_call', 'TRBD', 2, f'd_family_{sample}.csv')
calc_gene_family(sample, counts, 'j_call', 'TRBJ', 2, f'j_family_{sample}.csv')

calc_sample_stats(sample, counts, f'sample_stats_{sample}.csv')

if __name__ == "__main__":
main()
78 changes: 78 additions & 0 deletions modules/local/annotate/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
// Concatenate all per-sample input files into a single combined CDR3 table.
// Emits concatenated_cdr3.tsv, produced by the compare_concatenate.py script
// driven by the provided samplesheet.
process ANNOTATE_CONCATENATE {
label 'process_low'

input:
// CSV samplesheet (UTF-8) listing one row per sample, including file paths.
path samplesheet_utf8
// All sample files staged into the work dir so the paths referenced by the
// samplesheet resolve at runtime — presumably; confirm against the caller.
path all_sample_files

output:
path "concatenated_cdr3.tsv", emit: concat_cdr3

script:
"""
# Concatenate input Adaptive files and process metadata
compare_concatenate.py $samplesheet_utf8
"""
}

// Sort the concatenated CDR3 table (body only, header preserved) so that
// downstream deduplication and grouping steps can operate on ordered input.
process ANNOTATE_SORT_CDR3 {
label 'process_medium'

input:
// Tab-separated table emitted by ANNOTATE_CONCATENATE; first line is a header.
path concat_cdr3

output:
path 'concatenated_cdr3_sorted.tsv', emit: concat_cdr3_sorted

script:
"""
# Keep the header line in place; only the data rows are sorted below.
head -n 1 ${concat_cdr3} > concatenated_cdr3_sorted.tsv

# Sort data rows on the first column, breaking ties on fields 2-5.
# LC_ALL=C forces plain byte ordering (faster and locale-independent);
# --parallel matches the allocated CPUs and -S caps the in-memory buffer
# at 50% of RAM before sort spills to temporary files.
tail -n +2 ${concat_cdr3} \
| LC_ALL=C sort \
-t \$'\t' \
-k1,1 -k2,5 \
--parallel=${task.cpus} \
-S 50% \
>> concatenated_cdr3_sorted.tsv
"""
}

// Reduce the concatenated table to unique (column 1, column 2) pairs —
// presumably CDR3 sequence and TRBV call (see the TRBV filter below) —
// uppercased for consistent matching. Also emits a second file restricted
// to rows with a usable TRBV call, for consumption by GIANA.
process ANNOTATE_DEDUPLICATE_CDR3_TRBV {
label 'process_low'

input:
// Tab-separated table with a header line (stripped by tail below).
path concat_cdr3

output:
// All unique uppercased (CDR3, TRBV) pairs, headerless.
path 'unique_cdr3_trbv.tsv', emit: unique_cdr3_trbv
// Subset of the above where column 2 is a non-blank TRBV* call.
path 'unique_cdr3_trbv_with_vcall.tsv', emit: unique_cdr3_trbv_with_vcall

script:
"""
# Drop the header, uppercase the first two columns, and deduplicate.
# LC_ALL=C gives locale-independent byte ordering for sort -u.
tail -n +2 ${concat_cdr3} \
| awk -F'\t' '{print toupper(\$1) "\t" toupper(\$2)}' \
| LC_ALL=C sort -u \
> unique_cdr3_trbv.tsv

# additional file with blank TRBV calls removed for GIANA
awk -F'\t' 'NF>=2 && \$2 ~ /^TRBV/' unique_cdr3_trbv.tsv > unique_cdr3_trbv_with_vcall.tsv
"""
}

// Collapse the unique (CDR3, TRBV) pair list down to unique CDR3 sequences
// alone (column 1), one per line.
process ANNOTATE_DEDUPLICATE_CDR3 {
label 'process_single'

input:
// Headerless two-column TSV from ANNOTATE_DEDUPLICATE_CDR3_TRBV.
path unique_cdr3_trbv

output:
path 'unique_cdr3.txt', emit: unique_cdr3

script:
"""
# Take only the first column and deduplicate; LC_ALL=C for byte ordering.
cut -f1 ${unique_cdr3_trbv} \
| LC_ALL=C sort -u \
> unique_cdr3.txt
"""
}
16 changes: 0 additions & 16 deletions modules/local/compare/compare_concatenate.nf

This file was deleted.

38 changes: 13 additions & 25 deletions modules/local/compare/tcrsharing.nf
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ process TCRSHARING_CALC {
path concat_cdr3

output:
path "cdr3_sharing_pgen.tsv", emit: "shared_cdr3"
path "cdr3_sharing.tsv", emit: "shared_cdr3"
path "sample_mapping.tsv", emit: "sample_mapping"

script:
Expand All @@ -18,6 +18,9 @@ process TCRSHARING_CALC {
# Load data
df = pd.read_csv("${concat_cdr3}", sep="\t")

# Remove rows where pgen = 0
df = df[df['pgen'] != 0]

# Map sample to integer codes
df['sample'] = df['sample'].astype('category')
df['sample_id'] = df['sample'].cat.codes + 1
Expand All @@ -31,9 +34,12 @@ process TCRSHARING_CALC {

# Get unique sample_ids per CDR3b — vectorized
grouped = (
df.groupby('CDR3b')['sample_id']
.unique() # UNIQUE — fast & vectorized
.apply(np.sort) # SORT — vectorized
df.groupby('CDR3b')
.agg(
sample_id=('sample_id', 'unique'),
pgen=('pgen', 'first'),
log10_pgen=('log10_pgen', 'first')
)
.reset_index()
)

Expand All @@ -44,29 +50,12 @@ process TCRSHARING_CALC {
)

# Drop raw list
final_df = grouped[['CDR3b', 'total_samples', 'samples_present']]
final_df = grouped[['CDR3b', 'pgen', 'log10_pgen', 'total_samples', 'samples_present']]
final_df = final_df.sort_values(by="total_samples", ascending=False)

# Export final list
final_df.to_csv("cdr3_sharing.tsv", sep="\t", index=False)
EOF
Comment on lines 52 to 58
Copy link

Copilot AI Jan 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In TCRSHARING_CALC you now load a table that is expected to already contain pgen and log10_pgen columns (via the OLGA merge step), but the output filename was changed from cdr3_sharing_pgen.tsv to cdr3_sharing.tsv here. Please make sure any documentation, downstream consumers, or plotting scripts do not still expect the old _pgen suffix; otherwise they will fail to find the file even though the schema still includes pgen fields.

Copilot uses AI. Check for mistakes.


olga-compute_pgen --humanTRB -i cdr3_sharing.tsv -o pgen_sharing.tsv


python - <<EOF
import pandas as pd

# Load TSVs for shared cdr3s and corresponding pgen values
left_df = pd.read_csv('pgen_sharing.tsv', sep='\t', header=None, usecols=[0, 1], names=['CDR3b', 'pgen'])
right_df = pd.read_csv('cdr3_sharing.tsv', sep='\t')

# Drop rows where pgen == 0 and merge
left_df = left_df[left_df['pgen'] != 0]
merged_df = pd.merge(left_df, right_df, on='CDR3b', how='left')
merged_df.to_csv('cdr3_sharing_pgen.tsv', sep='\t', index=False)
EOF
"""
}

Expand All @@ -86,7 +75,7 @@ process TCRSHARING_HISTOGRAM {
import pandas as pd
import matplotlib.pyplot as plt

merged_df = pd.read_csv('$shared_cdr3', sep='\t')
merged_df = pd.read_csv('${shared_cdr3}', sep='\t')

# Plot histogram
sharing = merged_df['total_samples'].values
Expand Down Expand Up @@ -127,10 +116,9 @@ process TCRSHARING_SCATTERPLOT {
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

merged_df = pd.read_csv('$shared_cdr3', sep='\t')
merged_df = pd.read_csv('${shared_cdr3}', sep='\t')

# Create scatter plot with log-transform pgen
merged_df["log10_pgen"] = np.log10(merged_df["pgen"])
plt.figure(figsize=(8, 6))
plt.grid(True)
plt.scatter(merged_df["log10_pgen"], merged_df["total_samples"], c='blue', alpha=0.7)
Expand Down
Loading
Loading