|
| 1 | +#!/usr/bin/env python3 |
| 2 | + |
| 3 | +import argparse |
| 4 | +import os |
| 5 | +import re |
| 6 | + |
| 7 | +import numpy as np |
| 8 | +import pandas as pd |
| 9 | +from tcrdist.repertoire import TCRrep |
| 10 | + |
| 11 | +def reverse_transform_trbv(trbv): |
| 12 | + """Convert TCRBV notation back to TRBV format, remove zero padding before *, and handle /OR cases.""" |
| 13 | + if not isinstance(trbv, str): |
| 14 | + return trbv # Return as-is if not a string |
| 15 | + |
| 16 | + trbv = trbv.replace("TCRBV", "TRBV") # Convert TCRBV → TRBV |
| 17 | + |
| 18 | + # Remove zero padding from main number (TCRBV07 → TRBV7) |
| 19 | + trbv = re.sub(r'(?<=TRBV)0*(\d+)', r'\1', trbv) |
| 20 | + |
| 21 | + # Remove zero padding from subgroup (TCRBV7-02 → TRBV7-2) |
| 22 | + trbv = re.sub(r'-(0\d+)', lambda m: f'-{int(m.group(1))}', trbv) |
| 23 | + |
| 24 | + # Convert "-orXX_XX" format back to "/OR#-#" |
| 25 | + trbv = re.sub(r'-or0?(\d+)_0?(\d+)', r'/OR\1-\2', trbv) |
| 26 | + |
| 27 | + # Add *01 if allele group not specified |
| 28 | + if not re.search(r'\*\d{2}$', trbv): |
| 29 | + trbv += "*01" |
| 30 | + |
| 31 | + return trbv |
| 32 | + |
| 33 | +def remove_locus(gene_name): |
| 34 | + """If gene is in TCRBVXX-##*0# format, try removing the -##.""" |
| 35 | + return re.sub(r'-(\d+)\*', '*', gene_name) |
| 36 | + |
| 37 | +def split_and_check_genes(gene_name): |
| 38 | + """Handle cases where two genes are combined (TCRBVXX-YY/XX-ZZ*0#) and return both separately.""" |
| 39 | + if '/' in gene_name and not re.search(r'/OR\d+-\d+', gene_name): # Ensure it's not an OR case |
| 40 | + base, star_part = gene_name.split("*") if "*" in gene_name else (gene_name, "01") |
| 41 | + genes = base.split("/") # Split the genes |
| 42 | + return [f"{g}*{star_part}" for g in genes] # Reattach the *0# part to both genes |
| 43 | + return [gene_name] # Return as list for consistency |
| 44 | + |
| 45 | +def find_matching_gene(row, db): |
| 46 | + # Collect all possible genes from vMaxResolved and vGeneNameTies |
| 47 | + possible_genes = set() # Use a set to avoid duplicates |
| 48 | + |
| 49 | + if pd.notna(row["vMaxResolved"]): |
| 50 | + possible_genes.add(row["vMaxResolved"]) # Always include vMaxResolved |
| 51 | + |
| 52 | + if pd.notna(row["vGeneNameTies"]): |
| 53 | + possible_genes.update(row["vGeneNameTies"].split(",")) # Add vGeneNameTies genes |
| 54 | + |
| 55 | + for gene in possible_genes: |
| 56 | + # If the gene contains multiple variants (e.g., TCRBV03-01/03-02*01), split and check both |
| 57 | + if "/" in gene and not re.search(r"/OR\d+-\d+", gene): # Avoid /OR cases |
| 58 | + sub_genes = split_and_check_genes(gene) |
| 59 | + for sub_gene in sub_genes: |
| 60 | + sub_gene = reverse_transform_trbv(sub_gene) # Ensure correct *0# format |
| 61 | + if sub_gene in db["id"].values: |
| 62 | + return sub_gene |
| 63 | + |
| 64 | + # Direct match in db |
| 65 | + transform_gene = reverse_transform_trbv(gene) |
| 66 | + if transform_gene in db["id"].values: |
| 67 | + return transform_gene |
| 68 | + |
| 69 | + # Try removing -## and checking again |
| 70 | + modified_gene = remove_locus(transform_gene) |
| 71 | + if modified_gene in db["id"].values: |
| 72 | + return modified_gene |
| 73 | + |
| 74 | + transform_row = reverse_transform_trbv(row["vMaxResolved"]) |
| 75 | + print(f'No match found for {transform_row}') |
| 76 | + |
| 77 | + return transform_row # Return original vMaxResolved if no match is found |
| 78 | + |
| 79 | +# Parse input arguments |
| 80 | +parser = argparse.ArgumentParser(description="Take positional args") |
| 81 | + |
| 82 | +parser.add_argument("sample_tsv") |
| 83 | +parser.add_argument("ref_database") |
| 84 | +parser.add_argument("cores", type=int) |
| 85 | + |
| 86 | +args = parser.parse_args() |
| 87 | + |
| 88 | +print(f"sample_tsv: {args.sample_tsv}") |
| 89 | +print(f"ref_database: {args.ref_database}") |
| 90 | +print(f"cores: {args.cores}") |
| 91 | + |
| 92 | +sample_tsv = args.sample_tsv |
| 93 | + |
| 94 | +# Get the basename |
| 95 | +basename = os.path.splitext(os.path.basename(sample_tsv))[0] |
| 96 | + |
| 97 | +# --- 1. Convert Adaptive output to tcrdist db format --- |
| 98 | +db = pd.read_table(args.ref_database, delimiter = '\t') |
| 99 | + |
| 100 | +db = db[db['organism']=='human'] |
| 101 | + |
| 102 | +df = pd.read_table(sample_tsv, delimiter = '\t') |
| 103 | + |
| 104 | +df = df[['nucleotide', 'aminoAcid', 'vMaxResolved', 'vGeneNameTies', 'count (templates/reads)']] |
| 105 | +df["vMaxResolved"] = df.apply(lambda row: find_matching_gene(row, db), axis=1) |
| 106 | + |
| 107 | +df = df.rename(columns={'nucleotide': 'cdr3_b_nucseq', |
| 108 | + 'aminoAcid': 'cdr3_b_aa', |
| 109 | + # 'CDR3a': 'cdr3_a_aa', |
| 110 | + 'vMaxResolved': 'v_b_gene', |
| 111 | + # 'TRBJ': 'j_b_gene', |
| 112 | + 'count (templates/reads)': 'count'}) |
| 113 | + |
| 114 | +df = df[df['cdr3_b_aa'].notna()] |
| 115 | +df = df[df['v_b_gene'].notna()] |
| 116 | +df = df.drop('vGeneNameTies', axis=1) |
| 117 | + |
| 118 | +# --- 2. Calculate sparse distance matrix --- |
| 119 | +tr = TCRrep(cell_df = df, |
| 120 | + organism = 'human', |
| 121 | + chains = ['beta'], |
| 122 | + db_file = 'alphabeta_gammadelta_db.tsv', |
| 123 | + compute_distances = False) |
| 124 | +tr.cpus = args.cores |
| 125 | +tr.compute_distances() |
| 126 | + |
| 127 | +np.savetxt(f"{basename}_distance_matrix.csv", tr.pw_beta, delimiter=",", fmt="%d") |
| 128 | + |
| 129 | +clone_df = tr.clone_df |
| 130 | +clone_df.to_csv(f"{basename}_clone_df.csv", index=False) |
0 commit comments