KarchinLab
diff --git a/‎.gitignore‎
Lines changed: 2 additions & 0 deletions b/‎.gitignore‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎Dockerfile‎
Lines changed: 18 additions & 19 deletions b/‎Dockerfile‎
Lines changed: 18 additions & 19 deletions
diff --git a/‎assets/tcrdist3_files/alphabeta_gammadelta_db.tsv‎
Lines changed: 1168 additions & 0 deletions b/‎assets/tcrdist3_files/alphabeta_gammadelta_db.tsv‎
Lines changed: 1168 additions & 0 deletions
diff --git a/‎bin/prep_gliph2_tcr.py‎
Lines changed: 29 additions & 25 deletions b/‎bin/prep_gliph2_tcr.py‎
Lines changed: 29 additions & 25 deletions
diff --git a/‎bin/tcrdist3_matrix.py‎
Lines changed: 130 additions & 0 deletions b/‎bin/tcrdist3_matrix.py‎
Lines changed: 130 additions & 0 deletions
diff --git a/‎conf/base.config‎
Lines changed: 8 additions & 5 deletions b/‎conf/base.config‎
Lines changed: 8 additions & 5 deletions
diff --git a/‎conf/modules.config‎
Lines changed: 8 additions & 0 deletions b/‎conf/modules.config‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎env.yml‎
Lines changed: 15 additions & 2 deletions b/‎env.yml‎
Lines changed: 15 additions & 2 deletions
diff --git a/‎modules/local/calc_compare.nf‎ ‎modules/local/compare_calc.nf‎modules/local/calc_compare.nf renamed to modules/local/compare_calc.nf
Lines changed: 1 addition & 8 deletions b/‎modules/local/calc_compare.nf‎ ‎modules/local/compare_calc.nf‎modules/local/calc_compare.nf renamed to modules/local/compare_calc.nf
Lines changed: 1 addition & 8 deletions
diff --git a/‎modules/local/plot_compare.nf‎ ‎modules/local/compare_plot.nf‎modules/local/plot_compare.nf renamed to modules/local/compare_plot.nf
Lines changed: 2 additions & 7 deletions b/‎modules/local/plot_compare.nf‎ ‎modules/local/compare_plot.nf‎modules/local/plot_compare.nf renamed to modules/local/compare_plot.nf
Lines changed: 2 additions & 7 deletions
@@ -13,6 +13,8 @@ notebooks/*.html
 
 assets/TCRseq_metadata_templates/*
 
+.DS_Store
+
 ## Python
 bin/__pycache__/*
 scripts/*
 
@@ -2,36 +2,35 @@ FROM condaforge/miniforge3:24.9.2-0
 
 # Copy the environment file into /tmp
 COPY env.yml /tmp/env.yml
-# ENV DEBIAN_FRONTEND=noninteractive
-
-# Update the mamba base environment with required packages
-WORKDIR /tmp
-RUN mamba env update -n base --file env.yml
 
 # Install system dependencies
 RUN apt-get update \
-    && apt-get install -y curl \
+    && apt-get install -y \
+    build-essential \
+    curl \
+    gcc \
+    g++ \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
 
+# Update the conda base environment with required packages
+WORKDIR /tmp
+RUN conda env update -n base --file env.yml
+
 # Install quarto
-RUN mkdir -p /opt/quarto/1.6.40 \
+RUN mkdir -p /opt/quarto/1.6.42 \
     && curl -o quarto.tar.gz -L \
-        "https://github.com/quarto-dev/quarto-cli/releases/download/v1.6.40/quarto-1.6.40-linux-amd64.tar.gz" \
+        "https://github.com/quarto-dev/quarto-cli/releases/download/v1.6.42/quarto-1.6.42-linux-amd64.tar.gz" \
     && tar -zxvf quarto.tar.gz \
-        -C "/opt/quarto/1.6.40" \
+        -C "/opt/quarto/1.6.42" \
         --strip-components=1 \
     && rm quarto.tar.gz 
 
-# Install R
-RUN mamba install -y r-base=4.4.2 \
-    && mamba clean -afy
-
-RUN conda install r-igraph
-
-# Install R packages, including igraph binary
-RUN Rscript -e "install.packages('remotes', repos='https://cran.r-project.org')" \
-    && Rscript -e "remotes::install_github('HetzDra/turboGliph')"
+# Install R package not available via conda
+RUN Rscript -e "remotes::install_github('HetzDra/turboGliph')"
 
 # Add quarto to the PATH
-ENV PATH="/opt/quarto/1.6.40/bin:${PATH}"
+ENV PATH="/opt/quarto/1.6.42/bin:${PATH}"
+
+# Add LD_LIBRARY_PATH for pandas
+ENV LD_LIBRARY_PATH=/opt/conda/lib
@@ -31,35 +31,39 @@
 samplesheet = pd.read_csv(args.samplesheet, header=0)
 data_dir = args.data_dir + "/"
 tsv_files = glob.glob(os.path.join(data_dir, "*.tsv"))
+tsv_files = [os.path.abspath(file) for file in tsv_files]
 
-# Load each file as element in dictionary
-tsv_dict = {}
-for file in tsv_files:
-    # read each tsv into entry of a dictionary
-    tsv_dict[file] = pd.read_csv(file, sep="\t", header=0)
+dfs = []
+for index, row in samplesheet.iterrows():
+    file_path = row['file']
+    print(f"Loading {file_path}")
 
-    # add a column to current df tsv_dict[file] with sample_id from samplesheet
-    subject_id = samplesheet.loc[samplesheet.file == file]['subject_id']
-    condition = samplesheet.loc[samplesheet.file == file]['sample']
-    tsv_dict[file]['subject:condition'] = subject_id + ':' + condition
+    # Read the TSV file into a dataframe
+    df = pd.read_csv(file_path, sep="\t", header=0)
 
+    # Get metadata
+    subject_id = row['subject_id']
+    timepoint = row['timepoint']
+    origin = row['origin']
+    
+    # Add patient column
+    df['patient'] = f"{subject_id}:{timepoint}_{origin}"
+    
+    # Select relevant columns
+    df = df[['aminoAcid', 'vGeneName', 'jGeneName', 'patient', 'count (templates/reads)']]
+    dfs.append(df)
 
-# Concatenate all dataframes in dictionary
-df = pd.concat(tsv_dict.values())
-df['CDR3a'] = 'NA'
-df = df[['aminoAcid', 'vGeneName', 'jGeneName', 'CDR3a', 'subject:condition', 'count (templates/reads)']]
-
-# Rename columns
-df = df.rename(columns={'aminoAcid': 'CDR3b', 
-                        'vGeneName': 'TRBV',
-                        'jGeneName': 'TRBJ',
-                        # 'HLA_column_name': 'HLA', if hla_file input exists, incorporate it here
-                        'subject:condition': 'patient',
-                        'count (templates/reads)': 'counts'})
 
-# Filter out rows of the df with missing CDR3b values
-df = df[df['CDR3b'].notna()]
+# Concatenate all the dataframes into one
+df_combined = pd.concat(dfs)
 
-# Write df to csv with the name ${project_name}_tcr.txt
-df.to_csv(args.project_name + "_tcr.txt", sep="\t", index=False, header=True)
+# Rename columns as required
+df_combined = df_combined.rename(columns={
+    'aminoAcid': 'CDR3b',
+    'vGeneName': 'TRBV',
+    'jGeneName': 'TRBJ',
+    'count (templates/reads)': 'counts'
+})
+df_combined = df_combined[df_combined['CDR3b'].notna()]
 
+df_combined.to_csv(f"{args.project_name}_tcr.txt", sep="\t", index=False, header=True)
@@ -0,0 +1,130 @@
+#!/usr/bin/env python3
+
+import argparse
+import os
+import re
+
+import numpy as np
+import pandas as pd
+from tcrdist.repertoire import TCRrep
+
+def reverse_transform_trbv(trbv):
+    """Translate a ``TCRBV``-style V gene name into ``TRBV`` notation.
+
+    The rewrite steps are applied in order:
+
+    1. ``TCRBV`` is replaced with ``TRBV``.
+    2. Leading zeros are dropped from the number following ``TRBV``
+       (``TRBV07`` -> ``TRBV7``).
+    3. Leading zeros are dropped from a ``-0N`` subgroup (``-02`` -> ``-2``).
+    4. A ``-orXX_YY`` suffix is rewritten as ``/ORX-Y``.
+    5. ``*01`` is appended when the name does not already end in a
+       two-digit ``*NN`` allele.
+
+    Non-string inputs (for example missing values) are returned unchanged.
+    """
+    if isinstance(trbv, str):
+        converted = trbv.replace("TCRBV", "TRBV")
+
+        # (pattern, replacement) pairs; order matters because each step
+        # operates on the output of the previous one.
+        rewrite_steps = (
+            (r'(?<=TRBV)0*(\d+)', r'\1'),
+            (r'-(0\d+)', lambda m: f'-{int(m.group(1))}'),
+            (r'-or0?(\d+)_0?(\d+)', r'/OR\1-\2'),
+        )
+        for pattern, replacement in rewrite_steps:
+            converted = re.sub(pattern, replacement, converted)
+
+        # Default to the *01 allele when none is given.
+        has_allele = re.search(r'\*\d{2}$', converted) is not None
+        return converted if has_allele else converted + "*01"
+
+    return trbv
+
+def remove_locus(gene_name):
+    """Strip a ``-<digits>`` segment that sits directly before the ``*`` allele marker.
+
+    For example ``TRBV7-2*01`` becomes ``TRBV7*01``. Names without such a
+    segment are returned unchanged.
+    """
+    locus_before_allele = re.compile(r'-(\d+)\*')
+    return locus_before_allele.sub('*', gene_name)
+
+def split_and_check_genes(gene_name):
+    """Split a combined gene call into one fully named gene per entry.
+
+    A combined call such as ``TCRBV03-01/03-02*01`` names two genes that share
+    one allele suffix, and only the first carries the locus prefix. Each
+    returned gene gets the shared ``*NN`` suffix (``*01`` when the input has
+    none), and the alphabetic prefix of the first gene is restored on later
+    genes that lack it, so the example yields
+    ``["TCRBV03-01*01", "TCRBV03-02*01"]``.
+
+    Names without ``/``, and ``/OR#-#`` orphon names, are returned unchanged
+    as a single-element list.
+    """
+    if '/' not in gene_name or re.search(r'/OR\d+-\d+', gene_name):
+        return [gene_name]  # Single gene or /OR case: nothing to split
+
+    # partition() tolerates more than one "*", where unpacking split("*") would raise.
+    base, separator, star_part = gene_name.partition("*")
+    if not separator:
+        star_part = "01"
+
+    first, *others = base.split("/")
+    prefix_match = re.match(r'[A-Za-z]+', first)
+    prefix = prefix_match.group(0) if prefix_match else ""
+
+    genes = [first]
+    for part in others:
+        # Later genes are usually written without the locus prefix ("03-02");
+        # without it they can never match a database id.
+        if prefix and not re.match(r'[A-Za-z]', part):
+            part = prefix + part
+        genes.append(part)
+
+    return [f"{g}*{star_part}" for g in genes]  # Reattach the *0# part to every gene
+
+def find_matching_gene(row, db):
+    """Return the reference-database id matching a row's V gene call.
+
+    Candidates are tried in a fixed order: ``row["vMaxResolved"]`` first, then
+    each comma-separated entry of ``row["vGeneNameTies"]`` in the order listed.
+    For each candidate, in order:
+
+    1. if it is a combined call (contains ``/`` but is not an ``/OR#-#``
+       name), each split-out gene is converted and looked up;
+    2. the converted candidate itself is looked up;
+    3. the converted candidate with its ``-##`` locus removed is looked up.
+
+    The first hit is returned. If nothing matches, a message is printed and
+    the converted ``vMaxResolved`` value is returned (unchanged when it is
+    missing or not a string).
+
+    Parameters:
+        row: mapping/Series providing ``vMaxResolved`` and ``vGeneNameTies``.
+        db: DataFrame with an ``id`` column of reference gene names.
+    """
+    # Build the lookup once instead of scanning the id column per test.
+    known_ids = set(db["id"])
+
+    # An ordered candidate list keeps the result deterministic; a set would
+    # make the first-tried gene depend on string hash randomisation.
+    candidates = []
+    if pd.notna(row["vMaxResolved"]):
+        candidates.append(row["vMaxResolved"])  # Always tried first
+    if pd.notna(row["vGeneNameTies"]):
+        candidates.extend(tie.strip() for tie in row["vGeneNameTies"].split(","))
+    # dict.fromkeys drops duplicates while preserving first-seen order.
+    candidates = [gene for gene in dict.fromkeys(candidates) if gene]
+
+    for gene in candidates:
+        # If the gene contains multiple variants (e.g., TCRBV03-01/03-02*01), split and check each
+        if "/" in gene and not re.search(r"/OR\d+-\d+", gene):  # Avoid /OR cases
+            for sub_gene in split_and_check_genes(gene):
+                sub_gene = reverse_transform_trbv(sub_gene)  # Ensure correct *0# format
+                if sub_gene in known_ids:
+                    return sub_gene
+
+        # Direct match in db
+        transform_gene = reverse_transform_trbv(gene)
+        if transform_gene in known_ids:
+            return transform_gene
+
+        # Try removing -## and checking again
+        modified_gene = remove_locus(transform_gene)
+        if modified_gene in known_ids:
+            return modified_gene
+
+    transform_row = reverse_transform_trbv(row["vMaxResolved"])
+    print(f'No match found for {transform_row}')
+
+    return transform_row  # Converted vMaxResolved when no candidate matches
+
+# Command-line entry point: takes three positional arguments (sample TSV,
+# reference gene database TSV, number of CPUs).
+parser = argparse.ArgumentParser(description="Take positional args")
+
+parser.add_argument("sample_tsv")
+parser.add_argument("ref_database")
+parser.add_argument("cores", type=int)
+
+args = parser.parse_args()
+
+# Echo the parsed arguments to stdout.
+print(f"sample_tsv: {args.sample_tsv}")
+print(f"ref_database: {args.ref_database}")
+print(f"cores: {args.cores}")
+
+sample_tsv = args.sample_tsv
+
+# File name of the sample without directory or extension; used as the prefix
+# of both output files.
+basename = os.path.splitext(os.path.basename(sample_tsv))[0]
+
+# --- 1. Convert Adaptive output to tcrdist db format ---
+db = pd.read_table(args.ref_database, delimiter = '\t')
+
+# Keep only the human entries of the reference table.
+db = db[db['organism']=='human']
+
+df = pd.read_table(sample_tsv, delimiter = '\t')
+
+# Keep only the columns needed below, then replace each V gene call with the
+# matching reference id (or its converted name when no match is found).
+df = df[['nucleotide', 'aminoAcid', 'vMaxResolved', 'vGeneNameTies', 'count (templates/reads)']]
+df["vMaxResolved"] = df.apply(lambda row: find_matching_gene(row, db), axis=1)
+
+# Rename to the column names passed to TCRrep below.
+df = df.rename(columns={'nucleotide': 'cdr3_b_nucseq',
+                    'aminoAcid': 'cdr3_b_aa',
+                    # 'CDR3a': 'cdr3_a_aa', 
+                    'vMaxResolved': 'v_b_gene',
+                    # 'TRBJ': 'j_b_gene',
+                    'count (templates/reads)': 'count'})
+
+# Drop rows lacking a CDR3 amino-acid sequence or a V gene; vGeneNameTies was
+# only needed for gene matching.
+df = df[df['cdr3_b_aa'].notna()]
+df = df[df['v_b_gene'].notna()]
+df = df.drop('vGeneNameTies', axis=1)
+
+# --- 2. Calculate pairwise distance matrix ---
+# Distances are deferred (compute_distances = False) so the CPU count can be
+# set before the computation starts.
+# NOTE(review): db_file is a hard-coded file name rather than args.ref_database;
+# confirm both refer to the same reference table.
+tr = TCRrep(cell_df = df,
+            organism = 'human',
+            chains = ['beta'],
+            db_file = 'alphabeta_gammadelta_db.tsv',
+            compute_distances = False)
+tr.cpus = args.cores
+tr.compute_distances()
+
+# Write tr.pw_beta as integer-formatted CSV.
+np.savetxt(f"{basename}_distance_matrix.csv", tr.pw_beta, delimiter=",", fmt="%d")
+
+# Write TCRrep's clone table alongside the matrix.
+# presumably its rows correspond to the matrix rows/columns; verify against tcrdist3.
+clone_df = tr.clone_df
+clone_df.to_csv(f"{basename}_clone_df.csv", index=False)
@@ -33,24 +33,27 @@ process {
     }
     withLabel:process_low {
         cpus   = { check_max( 2     * task.attempt, 'cpus'    ) }
-        memory = { check_max( 12.GB * task.attempt, 'memory'  ) }
+        memory = { check_max( 4.GB  * task.attempt, 'memory'  ) }
         time   = { check_max( 4.h   * task.attempt, 'time'    ) }
     }
     withLabel:process_medium {
-        cpus   = { check_max( 16     * task.attempt, 'cpus'    ) }
+        cpus   = { check_max( 8     * task.attempt, 'cpus'    ) }
         memory = { check_max( 16.GB * task.attempt, 'memory'  ) }
         time   = { check_max( 8.h   * task.attempt, 'time'    ) }
     }
     withLabel:process_high {
-        cpus   = { check_max( 12    * task.attempt, 'cpus'    ) }
-        memory = { check_max( 72.GB * task.attempt, 'memory'  ) }
+        cpus   = { check_max( 16    * task.attempt, 'cpus'    ) }
+        memory = { check_max( 64.GB * task.attempt, 'memory'  ) }
         time   = { check_max( 16.h  * task.attempt, 'time'    ) }
     }
     withLabel:process_long {
         time   = { check_max( 20.h  * task.attempt, 'time'    ) }
     }
+    withLabel:process_high_compute {
+        cpus = { check_max( 64 * task.attempt, 'cpus' ) }
+    }
     withLabel:process_high_memory {
-        memory = { check_max( 200.GB * task.attempt, 'memory' ) }
+        memory = { check_max( 256.GB * task.attempt, 'memory' ) }
     }
     withLabel:error_ignore {
         errorStrategy = 'ignore'
 
@@ -18,4 +18,12 @@ process {
         saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
     ]
 
+    withName: SAMPLESHEET_CHECK {
+        publishDir = [
+            path: { "${params.output}/pipeline_info" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+    }
+
 }
@@ -1,9 +1,11 @@
 name: base
 channels:
   - conda-forge
-dependencies:
+  - bioconda
+dependencies:  
+  # Python
   - python=3.11.5
-  - pandas=2.0.3
+  - pandas=2.2.3
   - numpy=1.25.2
   - scipy=1.11.3
   - seaborn=0.13.0
@@ -14,9 +16,20 @@ dependencies:
   - notebook=7.0.6
   - fsspec=2024.3.1
   - s3fs=2024.3.1
+  - python-igraph=0.11.8
+  - scikit-learn=1.6.1
+
+  # R and R packages
+  - r-base=4.4.2
+  - r-igraph=2.0.3
+  - r-pheatmap=1.0.12
+  - r-remotes=2.5.0
+
+  # Pip packages
   - pip:
     - csvkit==1.3.0
     - papermill==2.4.0
     - plotly==5.18.0
     - abydos==0.5.0
     - fastcluster==1.2.6
+    - git+https://github.com/kmayerb/tcrdist3.git@0.2.2
@@ -1,12 +1,6 @@
-process CALC_COMPARE {
-    // tag "${sample_utf8}"
+process COMPARE_CALC {
     label 'process_single'
-
-    // beforeScript 'export DOCKER_OPTS="-v $${params.data_dir}:$${params.data_dir}"'
-
     container "ghcr.io/break-through-cancer/bulktcr:latest"
-
-    publishDir "${params.output}/compare_output/", mode: "copy", overwrite: "true"
 
     input:
     path sample_utf8
@@ -23,5 +17,4 @@ process CALC_COMPARE {
         -s $sample_utf8 \
         -p $projectDir 
     """
-
 }
@@ -1,11 +1,7 @@
-process PLOT_COMPARE {
-    // tag "${jaccard_mat}"
+process COMPARE_PLOT {
     label 'process_single'
-
     container "ghcr.io/break-through-cancer/bulktcr:latest"
 
-    publishDir "${params.output}/reports/", mode: "copy", overwrite: "true"
-
     input:
     path sample_utf8
     path jaccard_mat
@@ -39,5 +35,4 @@ process PLOT_COMPARE {
     """
     touch compare_stats.qmd
     """
-    
-    }
+}
Original file line number	Diff line number	Diff line change
`@@ -33,24 +33,27 @@ process {`
`33`	`33`	`}`
`34`	`34`	`withLabel:process_low {`
`35`	`35`	`cpus = { check_max( 2 * task.attempt, 'cpus' ) }`
`36`		`- memory = { check_max( 12.GB * task.attempt, 'memory' ) }`
	`36`	`+ memory = { check_max( 4.GB * task.attempt, 'memory' ) }`
`37`	`37`	`time = { check_max( 4.h * task.attempt, 'time' ) }`
`38`	`38`	`}`
`39`	`39`	`withLabel:process_medium {`
`40`		`- cpus = { check_max( 16 * task.attempt, 'cpus' ) }`
	`40`	`+ cpus = { check_max( 8 * task.attempt, 'cpus' ) }`
`41`	`41`	`memory = { check_max( 16.GB * task.attempt, 'memory' ) }`
`42`	`42`	`time = { check_max( 8.h * task.attempt, 'time' ) }`
`43`	`43`	`}`
`44`	`44`	`withLabel:process_high {`
`45`		`- cpus = { check_max( 12 * task.attempt, 'cpus' ) }`
`46`		`- memory = { check_max( 72.GB * task.attempt, 'memory' ) }`
	`45`	`+ cpus = { check_max( 16 * task.attempt, 'cpus' ) }`
	`46`	`+ memory = { check_max( 64.GB * task.attempt, 'memory' ) }`
`47`	`47`	`time = { check_max( 16.h * task.attempt, 'time' ) }`
`48`	`48`	`}`
`49`	`49`	`withLabel:process_long {`
`50`	`50`	`time = { check_max( 20.h * task.attempt, 'time' ) }`
`51`	`51`	`}`
	`52`	`+ withLabel:process_high_compute {`
	`53`	`+ cpus = { check_max( 64 * task.attempt, 'cpus' ) }`
	`54`	`+ }`
`52`	`55`	`withLabel:process_high_memory {`
`53`		`- memory = { check_max( 200.GB * task.attempt, 'memory' ) }`
	`56`	`+ memory = { check_max( 256.GB * task.attempt, 'memory' ) }`
`54`	`57`	`}`
`55`	`58`	`withLabel:error_ignore {`
`56`	`59`	`errorStrategy = 'ignore'`
Original file line number	Diff line number	Diff line change
`@@ -18,4 +18,12 @@ process {`
`18`	`18`	`saveAs: { filename -> filename.equals('versions.yml') ? null : filename }`
`19`	`19`	`]`
`20`	`20`
	`21`	`+ withName: SAMPLESHEET_CHECK {`
	`22`	`+ publishDir = [`
	`23`	`+ path: { "${params.output}/pipeline_info" },`
	`24`	`+ mode: params.publish_dir_mode,`
	`25`	`+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }`
	`26`	`+ ]`
	`27`	`+ }`
	`28`	`+`
`21`	`29`	`}`