@@ -6,12 +6,99 @@ process COMPARE_CLONAL_PUBLICITY {
66 path concat_cdr3
77
88 output:
9- path " cdr3_sharing .tsv" , emit: " shared_cdr3"
9+ path " cdr3_sharing_pgen .tsv" , emit: " shared_cdr3"
1010 path " sample_mapping.tsv" , emit: " sample_mapping"
11+ path " sharing_histogram.png"
12+ path " sharing_pgen_scatterplot.png"
1113
1214 script:
1315 """
14- # Concatenate input Adaptive files and process metadata
15- compare_clonal_publicity.py $concat_cdr3
16+ python - <<EOF
import numpy as np
import pandas as pd

# Builds the CDR3 sharing table, the sample-name mapping and the sharing
# histogram for one concatenated CDR3 table.
#
# This script is embedded in a Nextflow script string and handed to the
# interpreter through a shell heredoc, so it deliberately avoids triple
# double-quoted docstrings, backquotes and dollar signs other than the single
# Nextflow placeholder for the input path.
#
# Inputs : the staged concatenated table (tab-separated, needs the columns
#          'sample' and 'CDR3b').
# Outputs: cdr3_sharing.tsv, sample_mapping.tsv, sharing_histogram.png


def map_samples(samples):
    # Give every distinct sample name a 1-based integer id, numbered in order
    # of first appearance in the column.
    return {name: idx for idx, name in enumerate(samples.unique(), start=1)}


def build_sharing_table(df, sample_mapping):
    # Return one row per CDR3b with the number of distinct samples carrying it
    # (total_samples) and their ids as a comma-separated, ascending list
    # (samples_present). Rows are ordered by total_samples, highest first.
    tagged = df.assign(sample_id=df['sample'].map(sample_mapping))
    ids_per_cdr3 = (
        tagged.groupby('CDR3b')['sample_id']
        .apply(lambda ids: sorted(set(ids)))  # one entry per sample, even if repeated
        .reset_index()
    )
    ids_per_cdr3['samples_present'] = ids_per_cdr3['sample_id'].apply(
        lambda ids: ",".join(map(str, ids))
    )
    ids_per_cdr3['total_samples'] = ids_per_cdr3['sample_id'].apply(len)

    # Drop the raw id lists; only the summary columns are exported.
    table = ids_per_cdr3[['CDR3b', 'total_samples', 'samples_present']]
    return table.sort_values(by='total_samples', axis=0, ascending=False)


def write_tsv(frame, destination):
    # The delimiter must be exactly one tab character: to_csv rejects
    # multi-character separators.
    frame.to_csv(destination, sep="\t", index=False)


def plot_sharing_histogram(sharing, out_png):
    # Imported here so the table helpers stay usable without a plotting stack.
    import matplotlib.pyplot as plt

    # Integer bin edges from min(sharing) to max(sharing) + 1, so that every
    # observed count gets its own unit-wide bin (the last value included).
    bins = np.arange(min(sharing), max(sharing) + 2)

    plt.figure(figsize=(8, 5))
    plt.hist(sharing, bins=bins, edgecolor='black', align='left')
    plt.xticks(bins[:-1])  # whole-number tick positions and labels
    plt.yscale('log')

    plt.xlabel('Number of Shared Samples')
    plt.ylabel('TCR Sequence Frequency (log scale)')
    plt.title('TCR Sharing Histogram')

    plt.savefig(out_png, dpi=300, bbox_inches="tight")
    plt.close()


def main():
    # No padding inside the quotes: the literal must be exactly the staged
    # file name substituted by Nextflow.
    df = pd.read_csv("${concat_cdr3}", sep="\t")

    sample_mapping = map_samples(df['sample'])
    sharing_table = build_sharing_table(df, sample_mapping)
    write_tsv(sharing_table, "cdr3_sharing.tsv")

    # Export the name -> id mapping so the ids in samples_present can be
    # resolved downstream.
    mapping_table = pd.DataFrame(
        list(sample_mapping.items()), columns=['patient', 'sample_id']
    )
    write_tsv(mapping_table, "sample_mapping.tsv")

    plot_sharing_histogram(sharing_table['total_samples'].values, "sharing_histogram.png")


# Reading the program from standard input still runs it as the main module.
if __name__ == "__main__":
    main()
70+ EOF
71+
72+ olga-compute_pgen --humanTRB -i cdr3_sharing.tsv -o pgen_sharing.tsv
73+
74+ python - <<EOF
import numpy as np
import pandas as pd

# Joins generation probabilities (pgen) onto the CDR3 sharing table and plots
# sharing against log10(pgen).
#
# This script is embedded in a Nextflow script string and handed to the
# interpreter through a shell heredoc, so it deliberately avoids triple
# double-quoted docstrings, backquotes and dollar signs.
#
# Inputs : pgen_sharing.tsv (written by the preceding olga-compute_pgen step;
#          read without a header, first two columns used as CDR3b and pgen)
#          and cdr3_sharing.tsv (with header).
# Outputs: cdr3_sharing_pgen.tsv, sharing_pgen_scatterplot.png


def attach_pgen(pgen_df, sharing_df):
    # Keep only sequences with a non-zero pgen, then bring in their sharing
    # columns. The left join keeps the pgen table's row order.
    nonzero = pgen_df[pgen_df['pgen'] != 0]
    return pd.merge(nonzero, sharing_df, on='CDR3b', how='left')


def write_tsv(frame, destination):
    # The delimiter must be exactly one tab character: to_csv rejects
    # multi-character separators.
    frame.to_csv(destination, sep="\t", index=False)


def plot_sharing_vs_pgen(merged, out_png):
    # Imported here so the merge helper stays usable without a plotting stack.
    import matplotlib.pyplot as plt
    from matplotlib.ticker import MaxNLocator

    log10_pgen = np.log10(merged["pgen"])

    plt.figure(figsize=(8, 6))
    plt.grid(True)
    plt.scatter(log10_pgen, merged["total_samples"], c='blue', alpha=0.7)
    # Sample counts are integers, so restrict the y axis to whole-number ticks.
    plt.gca().yaxis.set_major_locator(MaxNLocator(integer=True))

    plt.xlabel("log10(Probability)")
    plt.ylabel("Number of Shared Samples")
    plt.title("Scatterplot of Shared TCRs vs log10(Generation Probability)")
    plt.tight_layout()
    plt.savefig(out_png, dpi=300, bbox_inches="tight")
    plt.close()


def main():
    pgen_df = pd.read_csv(
        'pgen_sharing.tsv',
        sep="\t",
        header=None,
        usecols=[0, 1],
        names=['CDR3b', 'pgen'],
    )
    sharing_df = pd.read_csv('cdr3_sharing.tsv', sep="\t")

    merged = attach_pgen(pgen_df, sharing_df)
    write_tsv(merged, 'cdr3_sharing_pgen.tsv')
    plot_sharing_vs_pgen(merged, "sharing_pgen_scatterplot.png")


# Reading the program from standard input still runs it as the main module.
if __name__ == "__main__":
    main()
102+ EOF
16103 """
17104}
0 commit comments