88import pandas as pd
99from tcrdist .repertoire import TCRrep
1010
11- def reverse_transform_trbv (trbv ):
12- """Convert TCRBV notation back to TRBV format, remove zero padding before *, and handle /OR cases ."""
11+ def transform_trbv (trbv ):
12+ """Convert gene names from Adaptive ImmunoSEQ to IMGT format ."""
1313 if not isinstance (trbv , str ):
1414 return trbv # Return as-is if not a string
1515
16- trbv = trbv .replace ("TCRBV" , "TRBV" ) # Convert TCRBV → TRBV
16+ # Convert locus name
17+ trbv = trbv .replace ("TCRBV" , "TRBV" )
1718
18- # Remove zero padding from main number (TCRBV07 → TRBV7)
19+ # Remove zero padding from gene name (TCRBV07 to TRBV7)
1920 trbv = re .sub (r'(?<=TRBV)0*(\d+)' , r'\1' , trbv )
2021
21- # Remove zero padding from subgroup (TCRBV7-02 → TRBV7-2)
22+ # Remove zero padding from subgroup (TCRBV7-02 to TRBV7-2)
2223 trbv = re .sub (r'-(0\d+)' , lambda m : f'-{ int (m .group (1 ))} ' , trbv )
2324
24- # Convert "-orXX_XX" format back to "/OR#-#"
25+ # Convert "-orXX_XX" to "/OR#-#" for orphon genes
2526 trbv = re .sub (r'-or0?(\d+)_0?(\d+)' , r'/OR\1-\2' , trbv )
2627
2728 # Add *01 if allele group not specified
@@ -31,16 +32,22 @@ def reverse_transform_trbv(trbv):
3132 return trbv
3233
3334def remove_locus (gene_name ):
34- """If gene is in TCRBVXX-##*0# format, try removing the -##."""
35- return re .sub (r'-(\d+)\*' , '*' , gene_name )
35+ """Remove the -## gene position from TRBV names unless the gene contains /OR."""
36+ if '/OR' in gene_name :
37+ return gene_name
38+ else :
39+ return re .sub (r'-(\d+)\*' , '*' , gene_name )
3640
3741def split_and_check_genes (gene_name ):
38- """Handle cases where two genes are combined (TCRBVXX-YY/XX-ZZ*0#) and return both separately."""
39- if '/' in gene_name and not re .search (r'/OR\d+-\d+' , gene_name ): # Ensure it's not an OR case
40- base , star_part = gene_name .split ("*" ) if "*" in gene_name else (gene_name , "01" )
42+ """Split combined TCRBV genes (e.g., TCRBV06-02/06-03*01 to TCRBV06-02*01 and TCRBV06-03*01."""
43+ if '/' in gene_name and not re .search (r'/OR\d+-\d+' , gene_name ): # Ensure it's not an orphon
44+ base , allele = gene_name .split ("*" ) if "*" in gene_name else (gene_name , "01" )
45+ prefix_match = re .match (r"(TCRBV\d+)" , gene_name )
46+ prefix = prefix_match .group (1 ) if prefix_match else "TCRBV" # Fallback just in case
4147 genes = base .split ("/" ) # Split the genes
42- return [f"{ g } *{ star_part } " for g in genes ] # Reattach the *0# part to both genes
43- return [gene_name ] # Return as list for consistency
48+ return [f"{ prefix } -{ g .split ('-' )[- 1 ]} *{ allele } " for g in genes ]
49+ return [gene_name ]
50+
4451
4552def find_matching_gene (row , db ):
4653 # Collect all possible genes from vMaxResolved and vGeneNameTies
@@ -57,12 +64,12 @@ def find_matching_gene(row, db):
5764 if "/" in gene and not re .search (r"/OR\d+-\d+" , gene ): # Avoid /OR cases
5865 sub_genes = split_and_check_genes (gene )
5966 for sub_gene in sub_genes :
60- sub_gene = reverse_transform_trbv (sub_gene ) # Ensure correct *0# format
67+ sub_gene = transform_trbv (sub_gene ) # Ensure correct *0# format
6168 if sub_gene in db ["id" ].values :
6269 return sub_gene
6370
6471 # Direct match in db
65- transform_gene = reverse_transform_trbv (gene )
72+ transform_gene = transform_trbv (gene )
6673 if transform_gene in db ["id" ].values :
6774 return transform_gene
6875
@@ -71,60 +78,62 @@ def find_matching_gene(row, db):
7178 if modified_gene in db ["id" ].values :
7279 return modified_gene
7380
74- transform_row = reverse_transform_trbv (row ["vMaxResolved" ])
81+ transform_row = transform_trbv (row ["vMaxResolved" ])
7582 print (f'No match found for { transform_row } ' )
7683
7784 return transform_row # Return original vMaxResolved if no match is found
7885
79- # Parse input arguments
80- parser = argparse .ArgumentParser (description = "Take positional args" )
8186
82- parser .add_argument ("sample_tsv" )
83- parser .add_argument ("ref_database" )
84- parser .add_argument ("cores" , type = int )
87+ if __name__ == "__main__" :
88+ # Parse input arguments
89+ parser = argparse .ArgumentParser (description = "Take positional args" )
90+
91+ parser .add_argument ("sample_tsv" )
92+ parser .add_argument ("ref_database" )
93+ parser .add_argument ("cores" , type = int )
8594
86- args = parser .parse_args ()
95+ args = parser .parse_args ()
8796
88- print (f"sample_tsv: { args .sample_tsv } " )
89- print (f"ref_database: { args .ref_database } " )
90- print (f"cores: { args .cores } " )
97+ print (f"sample_tsv: { args .sample_tsv } " )
98+ print (f"ref_database: { args .ref_database } " )
99+ print (f"cores: { args .cores } " )
91100
92- sample_tsv = args .sample_tsv
101+ sample_tsv = args .sample_tsv
93102
94- # Get the basename
95- basename = os .path .splitext (os .path .basename (sample_tsv ))[0 ]
103+ # Get the basename
104+ basename = os .path .splitext (os .path .basename (sample_tsv ))[0 ]
96105
97- # --- 1. Convert Adaptive output to tcrdist db format ---
98- db = pd .read_table (args .ref_database , delimiter = '\t ' )
106+ # --- 1. Convert Adaptive output to tcrdist db format ---
107+ db = pd .read_table (args .ref_database , delimiter = '\t ' )
99108
100- db = db [db ['organism' ]== 'human' ]
109+ db = db [db ['organism' ]== 'human' ]
101110
102- df = pd .read_table (sample_tsv , delimiter = '\t ' )
111+ df = pd .read_table (sample_tsv , delimiter = '\t ' )
103112
104- df = df [['nucleotide' , 'aminoAcid' , 'vMaxResolved' , 'vGeneNameTies' , 'count (templates/reads)' ]]
105- df ["vMaxResolved" ] = df .apply (lambda row : find_matching_gene (row , db ), axis = 1 )
113+ df = df [['nucleotide' , 'aminoAcid' , 'vMaxResolved' , 'vGeneNameTies' , 'count (templates/reads)' ]]
114+ df ["vMaxResolved" ] = df .apply (lambda row : find_matching_gene (row , db ), axis = 1 )
106115
107- df = df .rename (columns = {'nucleotide' : 'cdr3_b_nucseq' ,
108- 'aminoAcid' : 'cdr3_b_aa' ,
109- # 'CDR3a': 'cdr3_a_aa',
110- 'vMaxResolved' : 'v_b_gene' ,
111- # 'TRBJ': 'j_b_gene',
112- 'count (templates/reads)' : 'count' })
116+ df = df .rename (columns = {'nucleotide' : 'cdr3_b_nucseq' ,
117+ 'aminoAcid' : 'cdr3_b_aa' ,
118+ # 'CDR3a': 'cdr3_a_aa',
119+ 'vMaxResolved' : 'v_b_gene' ,
120+ # 'TRBJ': 'j_b_gene',
121+ 'count (templates/reads)' : 'count' })
113122
114- df = df [df ['cdr3_b_aa' ].notna ()]
115- df = df [df ['v_b_gene' ].notna ()]
116- df = df .drop ('vGeneNameTies' , axis = 1 )
123+ df = df [df ['cdr3_b_aa' ].notna ()]
124+ df = df [df ['v_b_gene' ].notna ()]
125+ df = df .drop ('vGeneNameTies' , axis = 1 )
117126
118- # --- 2. Calculate sparse distance matrix ---
119- tr = TCRrep (cell_df = df ,
120- organism = 'human' ,
121- chains = ['beta' ],
122- db_file = 'alphabeta_gammadelta_db.tsv' ,
123- compute_distances = False )
124- tr .cpus = args .cores
125- tr .compute_distances ()
127+ # --- 2. Calculate sparse distance matrix ---
128+ tr = TCRrep (cell_df = df ,
129+ organism = 'human' ,
130+ chains = ['beta' ],
131+ db_file = 'alphabeta_gammadelta_db.tsv' ,
132+ compute_distances = False )
133+ tr .cpus = args .cores
134+ tr .compute_distances ()
126135
127- np .savetxt (f"{ basename } _distance_matrix.csv" , tr .pw_beta , delimiter = "," , fmt = "%d" )
136+ np .savetxt (f"{ basename } _distance_matrix.csv" , tr .pw_beta , delimiter = "," , fmt = "%d" )
128137
129- clone_df = tr .clone_df
130- clone_df .to_csv (f"{ basename } _clone_df.csv" , index = False )
138+ clone_df = tr .clone_df
139+ clone_df .to_csv (f"{ basename } _clone_df.csv" , index = False )
0 commit comments