Skip to content

Commit 87fb49c

Browse files
authored
Merge pull request #30 from KarchinLab/dltamayo-dev
Implement TCR convergence and plots, code cleanup
2 parents 36d8e4f + 0bc9cc0 commit 87fb49c

File tree

7 files changed

+233
-27
lines changed

7 files changed

+233
-27
lines changed

.cirro/preprocess.py

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,26 +17,19 @@
1717
ds.logger.info("Checking samplesheet parameter")
1818
ds.logger.info(ds.samplesheet)
1919
samplesheet = ds.samplesheet
20-
21-
ds.logger.info("Dropping incorrect file path & Merging ds.files w samplesheets")
22-
samplesheet = samplesheet.drop(columns=['file'])
23-
samplesheet2 = samplesheet.merge(ds.files, on='sample', how='left')
24-
25-
samplesheet2.to_csv('samplesheet.csv', index=None)
20+
samplesheet.to_csv('samplesheet.csv', index=None)
2621
ds.add_param("samplesheet", "samplesheet.csv")
2722

2823

2924
# 3. Set workflow_level value based on form input
3025
ds.logger.info("Setting workflow_level")
31-
if ds.params['sample_lvl'] == ds.params['compare_lvl'] == ds.params['cluster_lvl'] == True:
26+
if ds.params['sample_lvl'] == ds.params['compare_lvl'] == True:
3227
workflow_level = ['complete']
3328
else:
34-
workflow_lvls = ['sample', 'compare', 'cluster']
35-
chosen_lvls = [ds.params['sample_lvl'], ds.params['compare_lvl'], ds.params['cluster_lvl']]
29+
workflow_lvls = ['sample', 'compare']
30+
chosen_lvls = [ds.params['sample_lvl'], ds.params['compare_lvl']]
3631
workflow_level = [i for i, j in zip(workflow_lvls, chosen_lvls) if j]
3732

3833
ds.add_param('workflow_level', ','.join(workflow_level))
3934

4035
ds.logger.info(ds.params)
41-
42-
##

modules/local/compare_clonal_publicity.nf

Lines changed: 90 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,99 @@ process COMPARE_CLONAL_PUBLICITY {
66
path concat_cdr3
77

88
output:
9-
path "cdr3_sharing.tsv", emit: "shared_cdr3"
9+
path "cdr3_sharing_pgen.tsv", emit: "shared_cdr3"
1010
path "sample_mapping.tsv", emit: "sample_mapping"
11+
path "sharing_histogram.png"
12+
path "sharing_pgen_scatterplot.png"
1113

1214
script:
1315
"""
14-
# Concatenate input Adaptive files and process metadata
15-
compare_clonal_publicity.py $concat_cdr3
16+
python - <<EOF
17+
import pandas as pd
18+
import numpy as np
19+
import matplotlib.pyplot as plt
20+
21+
# Load data
22+
df = pd.read_csv("${concat_cdr3}", sep="\t")
23+
24+
# Step 1: Map samples to integers
25+
sample_mapping = {sample: i + 1 for i, sample in enumerate(df['sample'].unique())}
26+
df['sample_id'] = df['sample'].map(sample_mapping)
27+
28+
# Step 2: Group by CDR3b and aggregate sample_ids
29+
grouped = (
30+
df.groupby('CDR3b')['sample_id']
31+
.apply(lambda x: sorted(set(x))) # remove duplicates if any
32+
.reset_index()
33+
)
34+
35+
# Step 3: Add comma-separated list and total count
36+
grouped['samples_present'] = grouped['sample_id'].apply(lambda x: ",".join(map(str, x)))
37+
grouped['total_samples'] = grouped['sample_id'].apply(len)
38+
39+
# Step 4: Final output — drop raw list
40+
final_df = grouped[['CDR3b', 'total_samples', 'samples_present']]
41+
final_df = final_df.sort_values(by='total_samples', axis=0, ascending=False)
42+
43+
# Step 5: Export both outputs
44+
final_df.to_csv("cdr3_sharing.tsv", sep="\t", index=False)
45+
46+
# Also export the sample mapping
47+
sample_map_df = pd.DataFrame.from_dict(sample_mapping, orient='index', columns=['sample_id']).reset_index()
48+
sample_map_df.columns = ['patient', 'sample_id']
49+
sample_map_df.to_csv("sample_mapping.tsv", sep="\t", index=False)
50+
51+
52+
# Plot histogram
53+
sharing = final_df['total_samples'].values
54+
55+
# Create integer bin edges from 0 to max(data)
56+
bins = np.arange(min(sharing), max(sharing) + 2) # +2 to include the last value as a bin edge
57+
58+
plt.figure(figsize=(8, 5))
59+
plt.hist(sharing, bins=bins, edgecolor='black', align='left')
60+
plt.xticks(bins[:-1]) # whole number positions and labels
61+
plt.yscale('log')
62+
63+
plt.xlabel('Number of Shared Samples')
64+
plt.ylabel('TCR Sequence Frequency (log scale)')
65+
plt.title('TCR Sharing Histogram')
66+
67+
# Save to file
68+
plt.savefig("sharing_histogram.png", dpi=300, bbox_inches="tight")
69+
plt.close()
70+
EOF
71+
72+
olga-compute_pgen --humanTRB -i cdr3_sharing.tsv -o pgen_sharing.tsv
73+
74+
python - <<EOF
75+
import pandas as pd
76+
import numpy as np
77+
import matplotlib.pyplot as plt
78+
from matplotlib.ticker import MaxNLocator
79+
80+
# Load TSVs for shared cdr3s and corresponding pgen values
81+
left_df = pd.read_csv('pgen_sharing.tsv', sep='\t', header=None, usecols=[0, 1], names=['CDR3b', 'pgen'])
82+
right_df = pd.read_csv('cdr3_sharing.tsv', sep='\t')
83+
84+
# Drop rows where pgen == 0 and merge
85+
left_df = left_df[left_df['pgen'] != 0]
86+
merged_df = pd.merge(left_df, right_df, on='CDR3b', how='left')
87+
merged_df.to_csv('cdr3_sharing_pgen.tsv', sep='\t', index=False)
88+
89+
# Create scatter plot with log-transform pgen
90+
merged_df["log10_pgen"] = np.log10(merged_df["pgen"])
91+
plt.figure(figsize=(8, 6))
92+
plt.grid(True)
93+
plt.scatter(merged_df["log10_pgen"], merged_df["total_samples"], c='blue', alpha=0.7)
94+
plt.gca().yaxis.set_major_locator(MaxNLocator(integer=True))
95+
96+
plt.xlabel("log10(Probability)")
97+
plt.ylabel("Number of Shared Samples")
98+
plt.title("Scatterplot of Shared TCRs vs log10(Generation Probability)")
99+
plt.tight_layout()
100+
plt.savefig("sharing_pgen_scatterplot.png", dpi=300, bbox_inches="tight")
101+
plt.close()
102+
EOF
16103
"""
17104
}

modules/local/convergence.nf

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
process CONVERGENCE {
2+
tag "${sample_meta[0]}"
3+
label 'process_low'
4+
container "ghcr.io/break-through-cancer/bulktcr:latest"
5+
6+
input:
7+
tuple val(sample_meta), path(count_table)
8+
9+
output:
10+
path "${count_table.baseName}_tcr_convergence.tsv", emit: "convergence_output"
11+
path "${count_table.baseName}_tcr_convergence_histogram.png"
12+
13+
script:
14+
"""
15+
# Extract vector of cdr3 aa, dropping null values
16+
python - <<EOF
17+
import pandas as pd
18+
import numpy as np
19+
import matplotlib.pyplot as plt
20+
21+
# Load your TCR data (make sure the file has 'cdr3_aa' and 'cdr3_nt' columns)
22+
df = pd.read_csv("${count_table}", sep="\t", usecols=["aminoAcid", "nucleotide"])
23+
df = df.dropna(subset=["aminoAcid"])
24+
25+
# Group by amino acid sequence and count unique nucleotide sequences (convergence)
26+
convergence_df = (
27+
df.groupby("aminoAcid")["nucleotide"]
28+
.nunique()
29+
.reset_index(name="convergence")
30+
)
31+
32+
# Sort by convergence count, descending
33+
convergence_df = convergence_df.sort_values(by="convergence", ascending=False)
34+
35+
# Export
36+
convergence_df.to_csv("${count_table.baseName}_tcr_convergence.tsv", sep="\t", index=False)
37+
38+
# Plot histogram
39+
convergence = convergence_df['convergence'].values
40+
average_convergence = convergence_df["convergence"].mean()
41+
42+
# Create integer bin edges from 0 to max(data)
43+
bins = np.arange(min(convergence), max(convergence) + 2) # +2 to include the last value as a bin edge
44+
45+
plt.figure(figsize=(8, 5))
46+
plt.hist(convergence, bins=bins, edgecolor='black', align='left')
47+
plt.xticks(bins[:-1]) # whole number positions and labels
48+
plt.yscale('log')
49+
50+
plt.xlabel('TCR Convergence Number')
51+
plt.ylabel('TCR Convergence Frequency (log scale)')
52+
plt.title(f'${count_table.baseName} TCR Convergence Histogram, Average: {average_convergence:.2f}')
53+
54+
# Save to file
55+
plt.savefig("${count_table.baseName}_tcr_convergence_histogram.png", dpi=300, bbox_inches="tight")
56+
plt.close()
57+
58+
EOF
59+
"""
60+
}

modules/local/olga.nf

Lines changed: 37 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,30 +2,55 @@ process OLGA {
22
tag "${sample_meta[0]}"
33
label 'process_low'
44
container "ghcr.io/break-through-cancer/bulktcr:latest"
5-
5+
66
input:
77
tuple val(sample_meta), path(count_table)
8-
8+
99
output:
10-
path "${count_table.baseName}_tcr_generation_probabilities.tsv", emit: "olga_output"
11-
10+
path "${count_table.baseName}_tcr_pgen.tsv", emit: "olga_output"
11+
path "${count_table.baseName}_tcr_pgen_histogram.png"
12+
1213
script:
1314
"""
1415
# Extract vector of cdr3 aa, dropping null values
15-
1616
cat > dropAA.py <<EOF
17-
1817
import pandas as pd
19-
18+
2019
df = pd.read_csv("${count_table}", sep="\t")
2120
df = df.dropna(subset=["aminoAcid"])
2221
df = df["aminoAcid"]
2322
df.to_csv("output.tsv", sep="\t", index=False, header=False)
24-
2523
EOF
26-
24+
2725
python dropAA.py
28-
29-
olga-compute_pgen --humanTRB -i output.tsv -o "${count_table.baseName}_tcr_generation_probabilities.tsv"
26+
27+
olga-compute_pgen --humanTRB -i output.tsv -o "${count_table.baseName}_tcr_pgen.tsv"
28+
29+
python - <<EOF
30+
import pandas as pd
31+
import numpy as np
32+
import matplotlib.pyplot as plt
33+
34+
# Load TSV with no header
35+
df = pd.read_csv('${count_table.baseName}_tcr_pgen.tsv', sep='\t', header=None, usecols=[0, 1], names=['CDR3b', 'probability'])
36+
37+
# Drop rows where pgen is 0
38+
df = df[df['probability'] != 0]
39+
log_probs = np.log10(df['probability'])
40+
41+
# Plot histogram
42+
plt.figure(figsize=(8, 5))
43+
plt.hist(log_probs, bins=30, density=True, edgecolor='black')
44+
45+
# Label with LaTeX formatting
46+
plt.xlabel('log_10 Generation Probability')
47+
plt.ylabel('Probability Density')
48+
plt.title(f'${count_table.baseName} TCR Generation Probability Histogram')
49+
# plt.grid(True)
50+
51+
# Save to file
52+
plt.savefig("${count_table.baseName}_tcr_pgen_histogram.png", dpi=300, bbox_inches="tight")
53+
plt.close()
54+
EOF
3055
"""
31-
}
56+
}

modules/local/tcrdist3_matrix.nf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ process TCRDIST3_MATRIX {
99
path ref_db
1010

1111
output:
12-
path "${count_table.baseName}_distance_matrix.csv", emit: 'distance_matrix'
12+
tuple val(sample_meta), path("${count_table.baseName}_distance_matrix.csv"), emit: 'tcr_output'
1313
path "${count_table.baseName}_clone_df.csv", emit: 'clone_df'
1414

1515
script:

modules/local/tcrdist3_plot.nf

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
process TCRDIST3_PLOT {
2+
tag "${sample_meta[0]}"
3+
label 'process_high'
4+
label 'process_high_memory'
5+
container "ghcr.io/break-through-cancer/bulktcr:latest"
6+
7+
input:
8+
tuple val(sample_meta), path(distance_matrix)
9+
10+
output:
11+
path "${sample_meta[0]}_pairwise_distance_distribution.png", emit: 'beta_histogram'
12+
13+
script:
14+
"""
15+
python - <<EOF
16+
import numpy as np
17+
import matplotlib.pyplot as plt
18+
19+
distances = np.loadtxt("${distance_matrix}", delimiter=',')
20+
lower_triangle = distances[np.tril_indices(distances.shape[0], k=-1)]
21+
counts, bin_edges = np.histogram(lower_triangle, bins=100)
22+
23+
plt.figure(figsize=(8, 5))
24+
plt.bar((bin_edges[:-1] + bin_edges[1:]) / 2, counts, width=np.diff(bin_edges), edgecolor='black')
25+
plt.xlabel("Pairwise Distance")
26+
plt.ylabel("Frequency (log scale)")
27+
plt.yscale("log")
28+
plt.title("Distribution of Beta Chain Pairwise Distances - ${sample_meta[0]}")
29+
plt.savefig("${sample_meta[0]}_pairwise_distance_distribution.png", dpi=300, bbox_inches="tight")
30+
plt.close()
31+
EOF
32+
"""
33+
}

subworkflows/local/sample.nf

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,9 @@
88
include { SAMPLE_CALC } from '../../modules/local/sample_calc'
99
include { SAMPLE_PLOT } from '../../modules/local/sample_plot'
1010
include { TCRDIST3_MATRIX } from '../../modules/local/tcrdist3_matrix'
11+
include { TCRDIST3_PLOT } from '../../modules/local/tcrdist3_plot'
1112
include { OLGA } from '../../modules/local/olga'
13+
include { CONVERGENCE } from '../../modules/local/convergence'
1214

1315
/*
1416
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -53,6 +55,10 @@ workflow SAMPLE {
5355
file(params.db_path)
5456
)
5557

58+
TCRDIST3_PLOT(
59+
TCRDIST3_MATRIX.out.tcr_output
60+
)
61+
5662
/////// =================== PLOT SAMPLE =================== ///////
5763

5864
SAMPLE_PLOT (
@@ -64,6 +70,8 @@ workflow SAMPLE {
6470

6571
OLGA ( sample_map )
6672

73+
CONVERGENCE ( sample_map )
74+
6775
// emit:
6876
// sample_stats_csv
6977
// v_family_csv

0 commit comments

Comments
 (0)