Skip to content

Commit ec9dd6d

Browse files
authored
Merge branch 'dev' into main
2 parents 5a5d023 + c7a8290 commit ec9dd6d

File tree

5 files changed

+136
-54
lines changed

5 files changed

+136
-54
lines changed

bin/tcrdist3_matrix.py

Lines changed: 63 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -8,20 +8,21 @@
88
import pandas as pd
99
from tcrdist.repertoire import TCRrep
1010

11-
def reverse_transform_trbv(trbv):
12-
"""Convert TCRBV notation back to TRBV format, remove zero padding before *, and handle /OR cases."""
11+
def transform_trbv(trbv):
12+
"""Convert gene names from Adaptive ImmunoSEQ to IMGT format."""
1313
if not isinstance(trbv, str):
1414
return trbv # Return as-is if not a string
1515

16-
trbv = trbv.replace("TCRBV", "TRBV") # Convert TCRBV → TRBV
16+
# Convert locus name
17+
trbv = trbv.replace("TCRBV", "TRBV")
1718

18-
# Remove zero padding from main number (TCRBV07 TRBV7)
19+
# Remove zero padding from gene name (TCRBV07 to TRBV7)
1920
trbv = re.sub(r'(?<=TRBV)0*(\d+)', r'\1', trbv)
2021

21-
# Remove zero padding from subgroup (TCRBV7-02 TRBV7-2)
22+
# Remove zero padding from subgroup (TCRBV7-02 to TRBV7-2)
2223
trbv = re.sub(r'-(0\d+)', lambda m: f'-{int(m.group(1))}', trbv)
2324

24-
# Convert "-orXX_XX" format back to "/OR#-#"
25+
# Convert "-orXX_XX" to "/OR#-#" for orphon genes
2526
trbv = re.sub(r'-or0?(\d+)_0?(\d+)', r'/OR\1-\2', trbv)
2627

2728
# Add *01 if allele group not specified
@@ -31,16 +32,22 @@ def reverse_transform_trbv(trbv):
3132
return trbv
3233

3334
def remove_locus(gene_name):
34-
"""If gene is in TCRBVXX-##*0# format, try removing the -##."""
35-
return re.sub(r'-(\d+)\*', '*', gene_name)
35+
"""Remove the -## gene position from TRBV names unless the gene contains /OR."""
36+
if '/OR' in gene_name:
37+
return gene_name
38+
else:
39+
return re.sub(r'-(\d+)\*', '*', gene_name)
3640

3741
def split_and_check_genes(gene_name):
38-
"""Handle cases where two genes are combined (TCRBVXX-YY/XX-ZZ*0#) and return both separately."""
39-
if '/' in gene_name and not re.search(r'/OR\d+-\d+', gene_name): # Ensure it's not an OR case
40-
base, star_part = gene_name.split("*") if "*" in gene_name else (gene_name, "01")
42+
"""Split combined TCRBV genes (e.g., TCRBV06-02/06-03*01 to TCRBV06-02*01 and TCRBV06-03*01."""
43+
if '/' in gene_name and not re.search(r'/OR\d+-\d+', gene_name): # Ensure it's not an orphon
44+
base, allele = gene_name.split("*") if "*" in gene_name else (gene_name, "01")
45+
prefix_match = re.match(r"(TCRBV\d+)", gene_name)
46+
prefix = prefix_match.group(1) if prefix_match else "TCRBV" # Fallback just in case
4147
genes = base.split("/") # Split the genes
42-
return [f"{g}*{star_part}" for g in genes] # Reattach the *0# part to both genes
43-
return [gene_name] # Return as list for consistency
48+
return [f"{prefix}-{g.split('-')[-1]}*{allele}" for g in genes]
49+
return [gene_name]
50+
4451

4552
def find_matching_gene(row, db):
4653
# Collect all possible genes from vMaxResolved and vGeneNameTies
@@ -57,12 +64,12 @@ def find_matching_gene(row, db):
5764
if "/" in gene and not re.search(r"/OR\d+-\d+", gene): # Avoid /OR cases
5865
sub_genes = split_and_check_genes(gene)
5966
for sub_gene in sub_genes:
60-
sub_gene = reverse_transform_trbv(sub_gene) # Ensure correct *0# format
67+
sub_gene = transform_trbv(sub_gene) # Ensure correct *0# format
6168
if sub_gene in db["id"].values:
6269
return sub_gene
6370

6471
# Direct match in db
65-
transform_gene = reverse_transform_trbv(gene)
72+
transform_gene = transform_trbv(gene)
6673
if transform_gene in db["id"].values:
6774
return transform_gene
6875

@@ -71,60 +78,62 @@ def find_matching_gene(row, db):
7178
if modified_gene in db["id"].values:
7279
return modified_gene
7380

74-
transform_row = reverse_transform_trbv(row["vMaxResolved"])
81+
transform_row = transform_trbv(row["vMaxResolved"])
7582
print(f'No match found for {transform_row}')
7683

7784
return transform_row # Return original vMaxResolved if no match is found
7885

79-
# Parse input arguments
80-
parser = argparse.ArgumentParser(description="Take positional args")
8186

82-
parser.add_argument("sample_tsv")
83-
parser.add_argument("ref_database")
84-
parser.add_argument("cores", type=int)
87+
if __name__ == "__main__":
88+
# Parse input arguments
89+
parser = argparse.ArgumentParser(description="Take positional args")
90+
91+
parser.add_argument("sample_tsv")
92+
parser.add_argument("ref_database")
93+
parser.add_argument("cores", type=int)
8594

86-
args = parser.parse_args()
95+
args = parser.parse_args()
8796

88-
print(f"sample_tsv: {args.sample_tsv}")
89-
print(f"ref_database: {args.ref_database}")
90-
print(f"cores: {args.cores}")
97+
print(f"sample_tsv: {args.sample_tsv}")
98+
print(f"ref_database: {args.ref_database}")
99+
print(f"cores: {args.cores}")
91100

92-
sample_tsv = args.sample_tsv
101+
sample_tsv = args.sample_tsv
93102

94-
# Get the basename
95-
basename = os.path.splitext(os.path.basename(sample_tsv))[0]
103+
# Get the basename
104+
basename = os.path.splitext(os.path.basename(sample_tsv))[0]
96105

97-
# --- 1. Convert Adaptive output to tcrdist db format ---
98-
db = pd.read_table(args.ref_database, delimiter = '\t')
106+
# --- 1. Convert Adaptive output to tcrdist db format ---
107+
db = pd.read_table(args.ref_database, delimiter = '\t')
99108

100-
db = db[db['organism']=='human']
109+
db = db[db['organism']=='human']
101110

102-
df = pd.read_table(sample_tsv, delimiter = '\t')
111+
df = pd.read_table(sample_tsv, delimiter = '\t')
103112

104-
df = df[['nucleotide', 'aminoAcid', 'vMaxResolved', 'vGeneNameTies', 'count (templates/reads)']]
105-
df["vMaxResolved"] = df.apply(lambda row: find_matching_gene(row, db), axis=1)
113+
df = df[['nucleotide', 'aminoAcid', 'vMaxResolved', 'vGeneNameTies', 'count (templates/reads)']]
114+
df["vMaxResolved"] = df.apply(lambda row: find_matching_gene(row, db), axis=1)
106115

107-
df = df.rename(columns={'nucleotide': 'cdr3_b_nucseq',
108-
'aminoAcid': 'cdr3_b_aa',
109-
# 'CDR3a': 'cdr3_a_aa',
110-
'vMaxResolved': 'v_b_gene',
111-
# 'TRBJ': 'j_b_gene',
112-
'count (templates/reads)': 'count'})
116+
df = df.rename(columns={'nucleotide': 'cdr3_b_nucseq',
117+
'aminoAcid': 'cdr3_b_aa',
118+
# 'CDR3a': 'cdr3_a_aa',
119+
'vMaxResolved': 'v_b_gene',
120+
# 'TRBJ': 'j_b_gene',
121+
'count (templates/reads)': 'count'})
113122

114-
df = df[df['cdr3_b_aa'].notna()]
115-
df = df[df['v_b_gene'].notna()]
116-
df = df.drop('vGeneNameTies', axis=1)
123+
df = df[df['cdr3_b_aa'].notna()]
124+
df = df[df['v_b_gene'].notna()]
125+
df = df.drop('vGeneNameTies', axis=1)
117126

118-
# --- 2. Calculate sparse distance matrix ---
119-
tr = TCRrep(cell_df = df,
120-
organism = 'human',
121-
chains = ['beta'],
122-
db_file = 'alphabeta_gammadelta_db.tsv',
123-
compute_distances = False)
124-
tr.cpus = args.cores
125-
tr.compute_distances()
127+
# --- 2. Calculate sparse distance matrix ---
128+
tr = TCRrep(cell_df = df,
129+
organism = 'human',
130+
chains = ['beta'],
131+
db_file = 'alphabeta_gammadelta_db.tsv',
132+
compute_distances = False)
133+
tr.cpus = args.cores
134+
tr.compute_distances()
126135

127-
np.savetxt(f"{basename}_distance_matrix.csv", tr.pw_beta, delimiter=",", fmt="%d")
136+
np.savetxt(f"{basename}_distance_matrix.csv", tr.pw_beta, delimiter=",", fmt="%d")
128137

129-
clone_df = tr.clone_df
130-
clone_df.to_csv(f"{basename}_clone_df.csv", index=False)
138+
clone_df = tr.clone_df
139+
clone_df.to_csv(f"{basename}_clone_df.csv", index=False)

bin/test.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
import unittest
2+
import tcrdist3_matrix as t3m
3+
4+
class TestNameConversion(unittest.TestCase):
5+
def setUp(self):
6+
# This method is called before each test
7+
self.test_cases = [
8+
("TCRBV07", "TRBV7*01", "TRBV7*01"),
9+
("TCRBV27", "TRBV27*01", "TRBV27*01"),
10+
("TCRBV07-02", "TRBV7-2*01", "TRBV7*01"),
11+
("TCRBV17-02", "TRBV17-2*01", "TRBV17*01"),
12+
("TCRBV10-03*02", "TRBV10-3*02", "TRBV10*02"),
13+
("TCRBV07-02*01", "TRBV7-2*01", "TRBV7*01"),
14+
("TCRBV10-or09_02*01", "TRBV10/OR9-2*01", "TRBV10/OR9-2*01"),
15+
]
16+
self.split_cases = [
17+
("TCRBV06-02/06-03*01", ["TCRBV06-02*01", "TCRBV06-03*01"]),
18+
("TCRBV12-03/12-04", ["TCRBV12-03*01","TCRBV12-04*01"])
19+
]
20+
21+
def test_transform_trbv(self):
22+
for trbv_input, expected_output, _ in self.test_cases:
23+
result = t3m.transform_trbv(trbv_input)
24+
self.assertEqual(result, expected_output, f"Failed for input: {trbv_input}")
25+
26+
def test_remove_locus(self):
27+
for _, trbv_output, remove_locus_output in self.test_cases:
28+
result = t3m.remove_locus(trbv_output)
29+
self.assertEqual(result, remove_locus_output, f"Failed for input: {trbv_output}")
30+
31+
def test_split_and_check_genes(self):
32+
for split_input, split_output in self.split_cases:
33+
result = t3m.split_and_check_genes(split_input)
34+
self.assertEqual(result, split_output, f"Failed for input: {split_input}")
35+
36+
37+
if __name__ == '__main__':
38+
unittest.main()

env.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ dependencies:
1818
- s3fs=2024.3.1
1919
- python-igraph=0.11.8
2020
- scikit-learn=1.6.1
21+
- olga=1.2.4
2122

2223
# R and R packages
2324
- r-base=4.4.2

modules/local/olga.nf

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
process OLGA {
2+
tag "${sample_meta[0]}"
3+
label 'process_low'
4+
container "ghcr.io/break-through-cancer/bulktcr:latest"
5+
6+
input:
7+
tuple val(sample_meta), path(count_table)
8+
9+
output:
10+
path "${count_table.baseName}_tcr_generation_probabilities.tsv", emit: "olga_output"
11+
12+
script:
13+
"""
14+
# Extract vector of cdr3 aa, dropping null values
15+
16+
cat > dropAA.py <<EOF
17+
18+
import pandas as pd
19+
20+
df = pd.read_csv("${count_table}", sep="\t")
21+
df = df.dropna(subset=["aminoAcid"])
22+
df = df["aminoAcid"]
23+
df.to_csv("output.tsv", sep="\t", index=False, header=False)
24+
25+
EOF
26+
27+
python dropAA.py
28+
29+
olga-compute_pgen --humanTRB -i output.tsv -o "${count_table.baseName}_tcr_generation_probabilities.tsv"
30+
"""
31+
}

subworkflows/local/sample.nf

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
include { SAMPLE_CALC } from '../../modules/local/sample_calc'
99
include { SAMPLE_PLOT } from '../../modules/local/sample_plot'
1010
include { TCRDIST3_MATRIX } from '../../modules/local/tcrdist3_matrix'
11+
include { OLGA } from '../../modules/local/olga'
1112

1213
/*
1314
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -61,6 +62,8 @@ workflow SAMPLE {
6162
v_family_csv
6263
)
6364

65+
OLGA ( sample_map )
66+
6467
// emit:
6568
// sample_stats_csv
6669
// v_family_csv

0 commit comments

Comments
 (0)