4 changes: 2 additions & 2 deletions .cirro/preprocess.py
@@ -29,8 +29,8 @@
# 3. Set workflow_level value based on form input
ds.logger.info("Setting workflow_level")

-levels = ['convert', 'sample', 'compare']
-flags = [ds.params['convert_lvl'], ds.params['sample_lvl'], ds.params['compare_lvl']]
+levels = ['convert', 'sample', 'patient', 'compare']
+flags = [ds.params['convert_lvl'], ds.params['sample_lvl'], ds.params['patient_lvl'], ds.params['compare_lvl']]
workflow_level = [lvl for lvl, flag in zip(levels, flags) if flag]

ds.add_param('workflow_level', ','.join(workflow_level))
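
The preprocessing change extends the workflow_level selection from three levels to four. A minimal sketch of the zip-and-filter pattern with invented flag values (not part of the PR):

    levels = ['convert', 'sample', 'patient', 'compare']
    flags = [False, True, True, True]  # hypothetical form input
    workflow_level = [lvl for lvl, flag in zip(levels, flags) if flag]
    print(','.join(workflow_level))  # -> sample,patient,compare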
11 changes: 9 additions & 2 deletions .cirro/process-form.json
@@ -10,6 +10,13 @@
"type": "boolean",
"value": true
},
"patient_lvl": {
"default": true,
"description": "Run clustering and comparison modules on samples grouped by patient of origin",
"title": "Patient",
"type": "boolean",
"value": true
},
"compare_lvl": {
"default": true,
"description": "Comparative TCR repertoire analyses across samples and subjects",
@@ -18,10 +25,10 @@
"value": true
},
"olga_chunk_length": {
"default": 2000000,
"default": 100000,
"description": "Divide total CDR3 list into chunks of n length for processing by OLGA. Larger length = reduced parallelization",
"title": "olga_chunk_length",
"type": "int"
"type": "integer"
},
"distance_metric": {
"default": "tcrdist",
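Note the olga_chunk_length default drops from 2000000 to 100000, trading chunk size for parallelism. A minimal sketch of the chunking arithmetic (hypothetical helper, not code from this PR):

    def chunk_cdr3_list(cdr3_list, chunk_length=100000):
        """Yield successive slices of at most chunk_length sequences."""
        for start in range(0, len(cdr3_list), chunk_length):
            yield cdr3_list[start:start + chunk_length]

    # 1,000,000 CDR3s with the new default of 100000 -> 10 parallel chunks;
    # the old default of 2000000 would have produced a single chunk.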
1 change: 1 addition & 0 deletions .cirro/process-input.json
@@ -2,6 +2,7 @@
"input_format": "airr",
"convert_lvl": false,
"sample_lvl": "$.params.dataset.paramJson.sample_lvl",
"patient_lvl": "$.params.dataset.paramJson.patient_lvl",
"compare_lvl": "$.params.dataset.paramJson.compare_lvl",
"olga_chunk_length": "$.params.dataset.paramJson.olga_chunk_length",
"matrix_sparsity": "sparse",
144 changes: 144 additions & 0 deletions bin/patient_calc.py
@@ -0,0 +1,144 @@
#!/usr/bin/env python3
"""
Optimized TCR repertoire overlap (single-file input, vectorized)
"""

import argparse
import os

import numpy as np
import pandas as pd


# -------------------------
# Morisita–Horn (vectorized)
# -------------------------
def morisita_horn_matrix(M):
    """
    M: (n_samples x n_features) count matrix
    Returns: (n x n) similarity matrix
    """
    X = M.sum(axis=1, keepdims=True)  # (n,1)

    # Avoid division by zero
    X_safe = np.where(X == 0, 1, X)

    lambda_vals = (M ** 2).sum(axis=1, keepdims=True) / (X_safe ** 2)

    # Pairwise dot products
    prod = M @ M.T  # (n x n)

    denom = (lambda_vals + lambda_vals.T) * (X @ X.T)

    # Avoid division by zero
    denom = np.where(denom == 0, 1, denom)

    return (2 * prod) / denom
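
# For reference: morisita_horn_matrix implements the Morisita–Horn index
#   C_MH(x, y) = 2 * sum_i(x_i * y_i) / ((lambda_x + lambda_y) * X * Y)
# with X = sum_i x_i and lambda_x = sum_i(x_i ** 2) / X ** 2 (likewise for y);
# `prod` supplies the pairwise dot products and `denom` the scaled totals.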


# -------------------------
# Main
# -------------------------
if __name__ == "__main__":

parser = argparse.ArgumentParser(
description="Optimized overlap metrics for TCR repertoires"
)
parser.add_argument(
"-i", "--input",
required=True,
help="Concatenated TSV with sample, CDR3b, counts"
)
parser.add_argument(
"-p", "--patient",
required=True,
help="String indicating patient of samples"
)
args = parser.parse_args()

    # -------------------------
    # Load + clean
    # -------------------------
    df = pd.read_csv(
        args.input,
        sep="\t",
        usecols=["sample", "CDR3b", "counts"]
    )

    df = df.dropna(subset=["sample", "CDR3b"])

    df["counts"] = pd.to_numeric(
        df["counts"], errors="coerce"
    ).fillna(0)

    # -------------------------
    # Pre-aggregate (critical optimization)
    # -------------------------
    df = (
        df.groupby(["sample", "CDR3b"], as_index=False)["counts"]
        .sum()
    )

    samples = df["sample"].unique()
    n = len(samples)

    print(f"Loaded {n} samples")

    # -------------------------
    # Build count matrix
    # -------------------------
    pivot = df.pivot(
        index="sample",
        columns="CDR3b",
        values="counts"
    ).fillna(0)

    # Ensure consistent ordering
    pivot = pivot.loc[samples]

    M = pivot.to_numpy(dtype=float)

    # -------------------------
    # Presence/absence matrix
    # -------------------------
    B = (M > 0).astype(np.int32)

    # -------------------------
    # Jaccard (vectorized)
    # -------------------------
    intersection = B @ B.T  # shared junction counts

    row_sums = B.sum(axis=1, keepdims=True)
    union = row_sums + row_sums.T - intersection

    union = np.where(union == 0, 1, union)

    jaccard = intersection / union

    # -------------------------
    # Sørensen (Dice)
    # -------------------------
    denom = row_sums + row_sums.T
    denom = np.where(denom == 0, 1, denom)

    sorensen = (2 * intersection) / denom

    # -------------------------
    # Morisita–Horn (vectorized)
    # -------------------------
    morisita = morisita_horn_matrix(M)

    # -------------------------
    # Fix diagonals
    # -------------------------
    np.fill_diagonal(jaccard, 1.0)
    np.fill_diagonal(sorensen, 1.0)
    np.fill_diagonal(morisita, 1.0)

    # -------------------------
    # Write outputs
    # -------------------------
    patient = args.patient
    os.makedirs(patient, exist_ok=True)  # the per-patient output dir may not exist yet

    pd.DataFrame(jaccard, index=samples, columns=samples).to_csv(f"{patient}/jaccard_mat.csv")
    pd.DataFrame(sorensen, index=samples, columns=samples).to_csv(f"{patient}/sorensen_mat.csv")
    pd.DataFrame(morisita, index=samples, columns=samples).to_csv(f"{patient}/morisita_mat.csv")

    print("Finished writing all matrices")
54 changes: 54 additions & 0 deletions modules/local/annotate/main.nf
@@ -1,3 +1,57 @@
process ANNOTATE_PROCESS {
    tag "${sample_meta.sample}"
    label 'process_low'
    publishDir enabled: false

    input:
    tuple val(sample_meta), path(count_table)

    output:
    tuple val(sample_meta), path("${sample_meta.sample}_cdr3.tsv"), emit: "process"

    script:
    """
    python - <<EOF
    import pandas as pd

    USECOLS = [
        "junction_aa",
        "v_call",
        "j_call",
        "duplicate_count"
    ]

    COLMAP = {
        "junction_aa": "CDR3b",
        "v_call": "TRBV",
        "j_call": "TRBJ",
        "duplicate_count": "counts"
    }

    df = pd.read_csv(
        "${count_table}",
        sep='\t',
        usecols=USECOLS,
        dtype={
            "junction_aa": "string",
            "v_call": "string",
            "j_call": "string",
            "duplicate_count": "int"
        })

    df = (
        df[df.junction_aa.notna()]
        .rename(columns=COLMAP)
        [["CDR3b", "TRBV", "TRBJ", "counts"]]
    )
    df["sample"] = "${sample_meta.sample}"

    df.to_csv("${sample_meta.sample}_cdr3.tsv", sep="\t", index=False)
    EOF
    """
}

process ANNOTATE_CONCATENATE {
    label 'process_low'

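ANNOTATE_PROCESS reduces an AIRR rearrangement table to the four columns the downstream modules expect. A toy illustration of the column mapping (values invented, not from the PR):

    import pandas as pd

    # Hypothetical AIRR-style input rows.
    airr = pd.DataFrame({
        "junction_aa": ["CASSLGQAYEQYF", None],
        "v_call": ["TRBV7-9", "TRBV5-1"],
        "j_call": ["TRBJ2-7", "TRBJ1-2"],
        "duplicate_count": [12, 3],
    })
    out = (
        airr[airr.junction_aa.notna()]
        .rename(columns={"junction_aa": "CDR3b", "v_call": "TRBV",
                         "j_call": "TRBJ", "duplicate_count": "counts"})
        [["CDR3b", "TRBV", "TRBJ", "counts"]]
    )
    # out keeps one row: CASSLGQAYEQYF, TRBV7-9, TRBJ2-7, 12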
20 changes: 11 additions & 9 deletions modules/local/compare/giana.nf
@@ -1,17 +1,18 @@
process GIANA_CALC {
+    tag "${patient}"
    label 'process_medium'

    input:
-    path concat_cdr3
+    tuple val(patient), path(concat_cdr3)
    val threshold
    val threshold_score
    val threshold_vgene

    output:
-    path "VgeneScores.txt"
-    path "giana_RotationEncodingBL62.txt"
-    path "giana_EncodingMatrix.txt"
-    path "giana.log"
+    path "${patient}_VgeneScores.txt"
+    path "${patient}_giana.txt"
+    // path "giana_EncodingMatrix.txt"
+    // path "giana.log"

    script:
    """
@@ -23,8 +24,8 @@ process GIANA_CALC {
        --threshold_score ${threshold_score} \
        --threshold_vgene ${threshold_vgene} \
        --NumberOfThreads ${task.cpus} \
-        --Verbose \
-        > giana.log 2>&1
+        --Verbose
+    # > giana.log 2>&1

    # Insert header after GIANA comments
    insert=\$(head -n 1 "${concat_cdr3}")
@@ -40,8 +41,9 @@ process GIANA_CALC {
    /^##/ { print; next }
    !inserted { print insert; inserted=1 }
    { print }
-    ' giana_RotationEncodingBL62.txt > tmp && mv tmp giana_RotationEncodingBL62.txt
+    ' giana_RotationEncodingBL62.txt > ${patient}_giana.txt

-    mv giana_RotationEncodingBL62.txt_EncodingMatrix.txt giana_EncodingMatrix.txt
+    # mv giana_RotationEncodingBL62.txt_EncodingMatrix.txt giana_EncodingMatrix.txt
+    mv VgeneScores.txt ${patient}_VgeneScores.txt
    """
}
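
The awk block above re-inserts the original column header after GIANA's leading "##" comment lines. The same logic as a short Python sketch (illustrative only, not pipeline code):

    def insert_header(lines, header):
        """Yield leading ## comments, then the header, then everything else."""
        inserted = False
        for line in lines:
            if line.startswith("##"):
                yield line
                continue
            if not inserted:
                yield header
                inserted = True
            yield line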
27 changes: 14 additions & 13 deletions modules/local/compare/gliph2.nf
@@ -1,19 +1,20 @@
process GLIPH2_TURBOGLIPH {
+    tag "${patient}"
-    label 'process_high'
+    label 'process_high_compute'
+    label 'process_high_memory'

    input:
-    path concat_cdr3
+    tuple val(patient), path(concat_cdr3)

    output:
-    path "all_motifs.txt", emit: 'all_motifs'
-    path "clone_network.txt", emit: 'clone_network'
-    path "cluster_member_details.txt", emit: 'cluster_member_details'
-    path "convergence_groups.txt", emit: 'convergence_groups'
-    path "global_similarities.txt", emit: 'global_similarities'
-    path "local_similarities.txt", emit: 'local_similarities'
-    path "parameter.txt", emit: 'gliph2_parameters'
+    path "${patient}/all_motifs.txt", emit: 'all_motifs'
+    path "${patient}/clone_network.txt", emit: 'clone_network'
+    path "${patient}/cluster_member_details.txt", emit: 'cluster_member_details'
+    path "${patient}/convergence_groups.txt", emit: 'convergence_groups'
+    path "${patient}/global_similarities.txt", emit: 'global_similarities'
+    path "${patient}/local_similarities.txt", emit: 'local_similarities'
+    path "${patient}/parameter.txt", emit: 'gliph2_parameters'

    script:
    """
@@ -28,7 +29,7 @@ process GLIPH2_TURBOGLIPH {

    result <- turboGliph::gliph2(
        cdr3_sequences = df,
-        result_folder = "./",
+        result_folder = "./${patient}",
        lcminp = ${params.local_min_pvalue},
        sim_depth = ${params.simulation_depth},
        kmer_mindepth = ${params.kmer_min_depth},
@@ -37,16 +38,16 @@ process GLIPH2_TURBOGLIPH {
        n_cores = ${task.cpus}
    )

-    df3 <- read.csv('cluster_member_details.txt', sep = '\t', stringsAsFactors = FALSE, check.names = FALSE)
+    df3 <- read.csv('${patient}/cluster_member_details.txt', sep = '\t', stringsAsFactors = FALSE, check.names = FALSE)
    df3[,'sample'] <- df3[,'patient']
    df3 <- merge(df3, df[, c("CDR3b", "TRBV", "sample", 'counts')], by = c("CDR3b", "TRBV", "sample", 'counts'), all.x = TRUE)
    df3 <- df3[, c('CDR3b', 'TRBV', 'TRBJ', 'counts', 'sample', 'tag', 'seq_ID', 'ultCDR3b')]
-    write.table(df3, "cluster_member_details.txt", sep = "\t", row.names = FALSE, quote = FALSE)
+    write.table(df3, "${patient}/cluster_member_details.txt", sep = "\t", row.names = FALSE, quote = FALSE)
    EOF

    # Rename local_similarities file to standardize output name
-    input_file="local_similarities_*.txt"
-    cat \$input_file > local_similarities.txt
+    input_file="${patient}/local_similarities_*.txt"
+    cat \$input_file > ${patient}/local_similarities.txt
    """
}
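
With the input now keyed by patient, the upstream channel has to provide one concatenated CDR3 table per patient. A hypothetical sketch of that grouping step (file and column names invented; the real pipeline presumably does this in Nextflow):

    import pandas as pd

    # Hypothetical: split a combined CDR3 table into per-patient inputs.
    df = pd.read_csv("all_samples_cdr3.tsv", sep="\t")
    for patient, sub in df.groupby("patient"):
        sub.to_csv(f"{patient}_concat_cdr3.tsv", sep="\t", index=False)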
