Skip to content

Commit b944696

Browse files
Merge pull request #22 from break-through-cancer/main
Clean up process names, initial implementation for tcrdist
2 parents 564d6c1 + 02e0b9c commit b944696

22 files changed

+1478
-126
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ notebooks/*.html
1313

1414
assets/TCRseq_metadata_templates/*
1515

16+
.DS_Store
17+
1618
## Python
1719
bin/__pycache__/*
1820
scripts/*

Dockerfile

Lines changed: 18 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2,36 +2,35 @@ FROM condaforge/miniforge3:24.9.2-0
22

33
# Copy the environment file into /tmp
44
COPY env.yml /tmp/env.yml
5-
# ENV DEBIAN_FRONTEND=noninteractive
6-
7-
# Update the mamba base environment with required packages
8-
WORKDIR /tmp
9-
RUN mamba env update -n base --file env.yml
105

116
# Install system dependencies
127
RUN apt-get update \
13-
&& apt-get install -y curl \
8+
&& apt-get install -y \
9+
build-essential \
10+
curl \
11+
gcc \
12+
g++ \
1413
&& apt-get clean \
1514
&& rm -rf /var/lib/apt/lists/*
1615

16+
# Update the conda base environment with required packages
17+
WORKDIR /tmp
18+
RUN conda env update -n base --file env.yml
19+
1720
# Install quarto
18-
RUN mkdir -p /opt/quarto/1.6.40 \
21+
RUN mkdir -p /opt/quarto/1.6.42 \
1922
&& curl -o quarto.tar.gz -L \
20-
"https://github.com/quarto-dev/quarto-cli/releases/download/v1.6.40/quarto-1.6.40-linux-amd64.tar.gz" \
23+
"https://github.com/quarto-dev/quarto-cli/releases/download/v1.6.42/quarto-1.6.42-linux-amd64.tar.gz" \
2124
&& tar -zxvf quarto.tar.gz \
22-
-C "/opt/quarto/1.6.40" \
25+
-C "/opt/quarto/1.6.42" \
2326
--strip-components=1 \
2427
&& rm quarto.tar.gz
2528

26-
# Install R
27-
RUN mamba install -y r-base=4.4.2 \
28-
&& mamba clean -afy
29-
30-
RUN conda install r-igraph
31-
32-
# Install R packages, including igraph binary
33-
RUN Rscript -e "install.packages('remotes', repos='https://cran.r-project.org')" \
34-
&& Rscript -e "remotes::install_github('HetzDra/turboGliph')"
29+
# Install R package not available via conda
30+
RUN Rscript -e "remotes::install_github('HetzDra/turboGliph')"
3531

3632
# Add quarto to the PATH
37-
ENV PATH="/opt/quarto/1.6.40/bin:${PATH}"
33+
ENV PATH="/opt/quarto/1.6.42/bin:${PATH}"
34+
35+
# Add LD_LIBRARY_PATH for pandas
36+
ENV LD_LIBRARY_PATH=/opt/conda/lib

assets/tcrdist3_files/alphabeta_gammadelta_db.tsv

Lines changed: 1168 additions & 0 deletions
Large diffs are not rendered by default.

bin/prep_gliph2_tcr.py

Lines changed: 29 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -31,35 +31,39 @@
3131
samplesheet = pd.read_csv(args.samplesheet, header=0)
3232
data_dir = args.data_dir + "/"
3333
tsv_files = glob.glob(os.path.join(data_dir, "*.tsv"))
34+
tsv_files = [os.path.abspath(file) for file in tsv_files]
3435

35-
# Load each file as element in dictionary
36-
tsv_dict = {}
37-
for file in tsv_files:
38-
# read each tsv into entry of a dictionary
39-
tsv_dict[file] = pd.read_csv(file, sep="\t", header=0)
36+
dfs = []
37+
for index, row in samplesheet.iterrows():
38+
file_path = row['file']
39+
print(f"Loading {file_path}")
4040

41-
# add a column to current df tsv_dict[file] with sample_id from samplesheet
42-
subject_id = samplesheet.loc[samplesheet.file == file]['subject_id']
43-
condition = samplesheet.loc[samplesheet.file == file]['sample']
44-
tsv_dict[file]['subject:condition'] = subject_id + ':' + condition
41+
# Read the TSV file into a dataframe
42+
df = pd.read_csv(file_path, sep="\t", header=0)
4543

44+
# Get metadata
45+
subject_id = row['subject_id']
46+
timepoint = row['timepoint']
47+
origin = row['origin']
48+
49+
# Add patient column
50+
df['patient'] = f"{subject_id}:{timepoint}_{origin}"
51+
52+
# Select relevant columns
53+
df = df[['aminoAcid', 'vGeneName', 'jGeneName', 'patient', 'count (templates/reads)']]
54+
dfs.append(df)
4655

47-
# Concatenate all dataframes in dictionary
48-
df = pd.concat(tsv_dict.values())
49-
df['CDR3a'] = 'NA'
50-
df = df[['aminoAcid', 'vGeneName', 'jGeneName', 'CDR3a', 'subject:condition', 'count (templates/reads)']]
51-
52-
# Rename columns
53-
df = df.rename(columns={'aminoAcid': 'CDR3b',
54-
'vGeneName': 'TRBV',
55-
'jGeneName': 'TRBJ',
56-
# 'HLA_column_name': 'HLA', if hla_file input exists, incorporate it here
57-
'subject:condition': 'patient',
58-
'count (templates/reads)': 'counts'})
5956

60-
# Filter out rows of the df with missing CDR3b values
61-
df = df[df['CDR3b'].notna()]
57+
# Concatenate all the dataframes into one
58+
df_combined = pd.concat(dfs)
6259

63-
# Write df to csv with the name ${project_name}_tcr.txt
64-
df.to_csv(args.project_name + "_tcr.txt", sep="\t", index=False, header=True)
60+
# Rename columns as required
61+
df_combined = df_combined.rename(columns={
62+
'aminoAcid': 'CDR3b',
63+
'vGeneName': 'TRBV',
64+
'jGeneName': 'TRBJ',
65+
'count (templates/reads)': 'counts'
66+
})
67+
df_combined = df_combined[df_combined['CDR3b'].notna()]
6568

69+
df_combined.to_csv(f"{args.project_name}_tcr.txt", sep="\t", index=False, header=True)

bin/tcrdist3_matrix.py

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
#!/usr/bin/env python3
2+
3+
import argparse
4+
import os
5+
import re
6+
7+
import numpy as np
8+
import pandas as pd
9+
from tcrdist.repertoire import TCRrep
10+
11+
def reverse_transform_trbv(trbv):
    """Convert a ``TCRBV``-style V-gene name to ``TRBV`` notation.

    The conversion renames the ``TCRBV`` prefix to ``TRBV``, strips zero
    padding from the family number and from the subgroup number, rewrites
    ``-orXX_YY`` suffixes as ``/ORX-Y`` and appends ``*01`` when the name
    does not already end in a two-digit allele.

    Args:
        trbv: Gene name to convert. Non-string values (for example NaN or
            None) are returned unchanged.

    Returns:
        The converted gene name, or the input itself when it is not a string.
    """
    if not isinstance(trbv, str):
        return trbv  # Missing values pass through untouched

    trbv = trbv.replace("TCRBV", "TRBV")

    # Family number: TRBV07 -> TRBV7
    trbv = re.sub(r'(?<=TRBV)0*(\d+)', r'\1', trbv)

    # Subgroup number: TRBV7-02 -> TRBV7-2
    trbv = re.sub(r'-(0\d+)', lambda m: f'-{int(m.group(1))}', trbv)

    # Orphon notation: -or09_02 -> /OR9-2
    trbv = re.sub(r'-or0?(\d+)_0?(\d+)', r'/OR\1-\2', trbv)

    # Default to the first allele when none is given
    if not re.search(r'\*\d{2}$', trbv):
        trbv += "*01"

    return trbv
32+
33+
def remove_locus(gene_name):
    """Drop the ``-##`` subgroup that directly precedes the ``*`` allele marker.

    For example ``TRBV7-2*01`` becomes ``TRBV7*01``. Names without a
    ``-<digits>*`` sequence are returned unchanged.

    Args:
        gene_name: Gene name string.

    Returns:
        The gene name with the subgroup before the allele removed.
    """
    return re.sub(r'-(\d+)\*', '*', gene_name)
36+
37+
def split_and_check_genes(gene_name):
    """Split a combined gene name such as ``TCRBV03-01/03-02*01`` into its genes.

    Names without ``/``, and names containing an ``/OR#-#`` suffix, are
    returned as a single-element list. Otherwise the part before the first
    ``*`` is split on ``/`` and the allele (``01`` when none is present) is
    re-attached to every gene. Genes after the first that start with a digit
    inherit the alphabetic prefix of the first gene, so that
    ``TCRBV03-01/03-02*01`` yields ``TCRBV03-01*01`` and ``TCRBV03-02*01``
    rather than a prefix-less ``03-02*01``.

    Args:
        gene_name: Gene name string, possibly combining several genes.

    Returns:
        List of gene name strings, one per gene.
    """
    if "/" not in gene_name or re.search(r'/OR\d+-\d+', gene_name):
        return [gene_name]  # Single gene or /OR case: nothing to split

    # Split on the first "*" only so an unexpected extra "*" cannot raise.
    base, star, allele = gene_name.partition("*")
    if not star:
        allele = "01"

    parts = base.split("/")
    prefix_match = re.match(r'[A-Za-z]+', parts[0])
    prefix = prefix_match.group(0) if prefix_match else ""

    genes = []
    for part in parts:
        # Later genes are abbreviated ("03-02"); restore the shared prefix.
        if prefix and part[:1].isdigit():
            part = prefix + part
        genes.append(f"{part}*{allele}")
    return genes
44+
45+
def find_matching_gene(row, db):
    """Return the first candidate V gene of ``row`` whose name is in ``db["id"]``.

    Candidates are ``row["vMaxResolved"]`` followed by the comma-separated
    entries of ``row["vGeneNameTies"]``, tried in that order so the result is
    deterministic. For each candidate the following names are looked up:

    1. each sub-gene of a combined name (``A/B*01``), converted to TRBV form;
    2. the candidate itself converted to TRBV form;
    3. the converted candidate with its ``-##`` subgroup removed.

    Args:
        row: Mapping-like record with ``vMaxResolved`` and ``vGeneNameTies``.
        db: DataFrame with an ``id`` column of known gene names.

    Returns:
        The first matching name, or the converted ``vMaxResolved`` when
        nothing matches (a message is printed in that case).
    """
    # Ordered, de-duplicated candidates: vMaxResolved first, then the ties.
    candidates = []
    if pd.notna(row["vMaxResolved"]):
        candidates.append(row["vMaxResolved"])
    if pd.notna(row["vGeneNameTies"]):
        candidates.extend(
            tie.strip() for tie in row["vGeneNameTies"].split(",")
        )
    candidates = list(dict.fromkeys(candidates))

    # Build the lookup set once instead of scanning the column per check.
    known_ids = set(db["id"])

    for gene in candidates:
        lookups = []

        # Combined names (e.g. TCRBV03-01/03-02*01): try each sub-gene first,
        # but leave /OR names intact.
        if "/" in gene and not re.search(r"/OR\d+-\d+", gene):
            lookups.extend(
                reverse_transform_trbv(sub_gene)
                for sub_gene in split_and_check_genes(gene)
            )

        transform_gene = reverse_transform_trbv(gene)
        lookups.append(transform_gene)  # Direct match
        lookups.append(remove_locus(transform_gene))  # Match without -##

        for name in lookups:
            if name in known_ids:
                return name

    transform_row = reverse_transform_trbv(row["vMaxResolved"])
    print(f'No match found for {transform_row}')

    return transform_row  # Fall back to the converted vMaxResolved
78+
79+
# Parse input arguments
parser = argparse.ArgumentParser(description="Take positional args")

parser.add_argument("sample_tsv")
parser.add_argument("ref_database")
parser.add_argument("cores", type=int)

args = parser.parse_args()

print(f"sample_tsv: {args.sample_tsv}")
print(f"ref_database: {args.ref_database}")
print(f"cores: {args.cores}")

sample_tsv = args.sample_tsv

# Output files are named after the input file without its extension
basename = os.path.splitext(os.path.basename(sample_tsv))[0]

# --- 1. Convert Adaptive output to tcrdist db format ---
db = pd.read_table(args.ref_database, delimiter='\t')

# Only human reference genes are used for matching
db = db[db['organism'] == 'human']

df = pd.read_table(sample_tsv, delimiter='\t')

# Copy the column subset so the assignment below writes to an independent
# frame rather than to a slice of the full table.
df = df[['nucleotide', 'aminoAcid', 'vMaxResolved', 'vGeneNameTies',
         'count (templates/reads)']].copy()
df["vMaxResolved"] = df.apply(lambda row: find_matching_gene(row, db), axis=1)

df = df.rename(columns={
    'nucleotide': 'cdr3_b_nucseq',
    'aminoAcid': 'cdr3_b_aa',
    'vMaxResolved': 'v_b_gene',
    'count (templates/reads)': 'count',
})

# Drop rows without a CDR3 amino-acid sequence or a V gene
df = df[df['cdr3_b_aa'].notna()]
df = df[df['v_b_gene'].notna()]
df = df.drop('vGeneNameTies', axis=1)

# --- 2. Calculate sparse distance matrix ---
# NOTE(review): db_file is a fixed file name rather than args.ref_database;
# confirm both refer to the same reference table.
tr = TCRrep(cell_df=df,
            organism='human',
            chains=['beta'],
            db_file='alphabeta_gammadelta_db.tsv',
            compute_distances=False)
tr.cpus = args.cores
tr.compute_distances()

np.savetxt(f"{basename}_distance_matrix.csv", tr.pw_beta, delimiter=",", fmt="%d")

clone_df = tr.clone_df
clone_df.to_csv(f"{basename}_clone_df.csv", index=False)

conf/base.config

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,24 +33,27 @@ process {
3333
}
3434
withLabel:process_low {
3535
cpus = { check_max( 2 * task.attempt, 'cpus' ) }
36-
memory = { check_max( 12.GB * task.attempt, 'memory' ) }
36+
memory = { check_max( 4.GB * task.attempt, 'memory' ) }
3737
time = { check_max( 4.h * task.attempt, 'time' ) }
3838
}
3939
withLabel:process_medium {
40-
cpus = { check_max( 16 * task.attempt, 'cpus' ) }
40+
cpus = { check_max( 8 * task.attempt, 'cpus' ) }
4141
memory = { check_max( 16.GB * task.attempt, 'memory' ) }
4242
time = { check_max( 8.h * task.attempt, 'time' ) }
4343
}
4444
withLabel:process_high {
45-
cpus = { check_max( 12 * task.attempt, 'cpus' ) }
46-
memory = { check_max( 72.GB * task.attempt, 'memory' ) }
45+
cpus = { check_max( 16 * task.attempt, 'cpus' ) }
46+
memory = { check_max( 64.GB * task.attempt, 'memory' ) }
4747
time = { check_max( 16.h * task.attempt, 'time' ) }
4848
}
4949
withLabel:process_long {
5050
time = { check_max( 20.h * task.attempt, 'time' ) }
5151
}
52+
withLabel:process_high_compute {
53+
cpus = { check_max( 64 * task.attempt, 'cpus' ) }
54+
}
5255
withLabel:process_high_memory {
53-
memory = { check_max( 200.GB * task.attempt, 'memory' ) }
56+
memory = { check_max( 256.GB * task.attempt, 'memory' ) }
5457
}
5558
withLabel:error_ignore {
5659
errorStrategy = 'ignore'

conf/modules.config

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,12 @@ process {
1818
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
1919
]
2020

21+
withName: SAMPLESHEET_CHECK {
22+
publishDir = [
23+
path: { "${params.output}/pipeline_info" },
24+
mode: params.publish_dir_mode,
25+
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
26+
]
27+
}
28+
2129
}

env.yml

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
name: base
22
channels:
33
- conda-forge
4-
dependencies:
4+
- bioconda
5+
dependencies:
6+
# Python
57
- python=3.11.5
6-
- pandas=2.0.3
8+
- pandas=2.2.3
79
- numpy=1.25.2
810
- scipy=1.11.3
911
- seaborn=0.13.0
@@ -14,9 +16,20 @@ dependencies:
1416
- notebook=7.0.6
1517
- fsspec=2024.3.1
1618
- s3fs=2024.3.1
19+
- python-igraph=0.11.8
20+
- scikit-learn=1.6.1
21+
22+
# R and R packages
23+
- r-base=4.4.2
24+
- r-igraph=2.0.3
25+
- r-pheatmap=1.0.12
26+
- r-remotes=2.5.0
27+
28+
# Pip packages
1729
- pip:
1830
- csvkit==1.3.0
1931
- papermill==2.4.0
2032
- plotly==5.18.0
2133
- abydos==0.5.0
2234
- fastcluster==1.2.6
35+
- git+https://github.com/kmayerb/tcrdist3.git@0.2.2
Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,6 @@
1-
process CALC_COMPARE {
2-
// tag "${sample_utf8}"
1+
process COMPARE_CALC {
32
label 'process_single'
4-
5-
// beforeScript 'export DOCKER_OPTS="-v $${params.data_dir}:$${params.data_dir}"'
6-
73
container "ghcr.io/break-through-cancer/bulktcr:latest"
8-
9-
publishDir "${params.output}/compare_output/", mode: "copy", overwrite: "true"
104

115
input:
126
path sample_utf8
@@ -23,5 +17,4 @@ process CALC_COMPARE {
2317
-s $sample_utf8 \
2418
-p $projectDir
2519
"""
26-
2720
}
Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,7 @@
1-
process PLOT_COMPARE {
2-
// tag "${jaccard_mat}"
1+
process COMPARE_PLOT {
32
label 'process_single'
4-
53
container "ghcr.io/break-through-cancer/bulktcr:latest"
64

7-
publishDir "${params.output}/reports/", mode: "copy", overwrite: "true"
8-
95
input:
106
path sample_utf8
117
path jaccard_mat
@@ -39,5 +35,4 @@ process PLOT_COMPARE {
3935
"""
4036
touch compare_stats.qmd
4137
"""
42-
43-
}
38+
}

0 commit comments

Comments
 (0)