Skip to content

Commit 0c8eae5

Browse files
Merge pull request #27 from break-through-cancer/main
Initial implementation #10 and #11, update tests, refactoring
2 parents b944696 + 3223d9d commit 0c8eae5

26 files changed

+450
-365
lines changed

.cirro/process-form.json

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,6 @@
1515
"type": "boolean",
1616
"value": true
1717
},
18-
"cluster_lvl": {
19-
"default": true,
20-
"description": "Clone clustering analysis with GLIPH2",
21-
"title": "Cluster",
22-
"type": "boolean",
23-
"value": true
24-
},
2518
"kmer_min_depth": {
2619
"default": "3",
2720
"description": "minimum depth of k-mer during clustering (GLIPH2)",

.cirro/process-input.json

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
{
22
"sample_lvl": "$.params.dataset.paramJson.sample_lvl",
33
"compare_lvl": "$.params.dataset.paramJson.compare_lvl",
4-
"cluster_lvl": "$.params.dataset.paramJson.cluster_lvl",
54
"data_dir": "$.params.inputs[0].s3|/data/",
65
"kmer_min_depth": "$.params.dataset.paramJson.kmer_min_depth",
76
"local_min_OVE": "$.params.dataset.paramJson.local_min_OVE",
Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
import csv
1414
from scipy.stats import entropy
1515

16-
print('-- ENTERED calc_compare.py--')
16+
print('-- ENTERED compare_calc.py--')
1717
print('-- THE TIME IS: --' + str(pd.Timestamp.now()))
1818

1919
# initialize parser
@@ -28,16 +28,16 @@
2828
# metavar='meta_data',
2929
# type=str,
3030
# help='metadata CSV file initially passed to nextflow run command')
31-
parser.add_argument('-p', '--project_dir',
32-
metavar='project_dir',
31+
parser.add_argument('-d', '--data_dir',
32+
metavar='data_dir',
3333
type=str,
34-
help='path to project directory')
34+
help='path to data directory')
3535

3636
args = parser.parse_args()
3737

3838
## Import project directory path
39-
project_dir = args.project_dir
40-
sys.path.append(project_dir + '/bin/')
39+
data_dir = args.data_dir
40+
4141
from utils import jaccard_index, sorensen_index, morisita_horn_index #, jensen_shannon_distance
4242

4343
## Read in sample table CSV file
@@ -59,6 +59,8 @@
5959
dfs = {}
6060
for file in files:
6161
# load data
62+
file = os.path.basename(file)
63+
file = os.path.join(data_dir, file)
6264
df = pd.read_csv(file, sep='\t', header=0)
6365

6466
# Rename columns

bin/compare_clonal_publicity.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
#!/usr/bin/env python3

"""
compare_clonal_publicity.py

Input: TSV of CDR3 sequences with columns [CDR3b, sample].
Output: 'cdr3_sharing.tsv' (TCR sharing across samples) and
        'sample_mapping.tsv' (sample -> integer sample_id mapping),
both written to the current working directory.
"""
import argparse

import pandas as pd


def main():
    """Compute per-CDR3b sample sharing ("clonal publicity") tables."""
    # Initialize the parser
    parser = argparse.ArgumentParser(description="Take positional args")

    # Add positional arguments
    parser.add_argument(
        "cdr_df",
        type=str,
        help="Input file name, expected to be in TSV format with columns [CDR3b, sample]",
    )
    args = parser.parse_args()

    # Load data
    df = pd.read_csv(args.cdr_df, sep="\t")

    # Step 1: Map samples to integers (1-based, in order of first appearance)
    sample_mapping = {sample: i + 1 for i, sample in enumerate(df['sample'].unique())}
    df['sample_id'] = df['sample'].map(sample_mapping)

    # Step 2: Group by CDR3b and aggregate sample_ids
    grouped = (
        df.groupby('CDR3b')['sample_id']
        .apply(lambda x: sorted(set(x)))  # remove duplicates if any
        .reset_index()
    )

    # Step 3: Add comma-separated list and total count
    grouped['samples_present'] = grouped['sample_id'].apply(lambda x: ",".join(map(str, x)))
    grouped['total_samples'] = grouped['sample_id'].apply(len)

    # Step 4: Final output — drop the raw id list; most-shared clones first
    final_df = grouped[['CDR3b', 'total_samples', 'samples_present']]
    final_df = final_df.sort_values(by='total_samples', ascending=False)

    # Step 5: Export both outputs
    final_df.to_csv("cdr3_sharing.tsv", sep="\t", index=False)

    # Also export the sample mapping.
    # NOTE(review): the first column holds sample names but is labelled
    # 'patient' — confirm downstream consumers expect that header.
    sample_map_df = pd.DataFrame.from_dict(sample_mapping, orient='index', columns=['sample_id']).reset_index()
    sample_map_df.columns = ['patient', 'sample_id']
    sample_map_df.to_csv("sample_mapping.tsv", sep="\t", index=False)


if __name__ == "__main__":
    main()

bin/compare_concatenate.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
#!/usr/bin/env python3

"""
compare_concatenate.py

Input: a data directory of Adaptive TSV files plus a samplesheet CSV with
       columns [file, subject_id, timepoint, origin, sample].
Output: 'concatenated_cdr3.txt' — all samples' CDR3 rows concatenated,
        written to the current working directory.
"""

# Import modules
import argparse
import os

import pandas as pd


def main():
    """Concatenate per-sample Adaptive TSVs into a single CDR3 table."""
    # Initialize the parser
    parser = argparse.ArgumentParser(description="Take positional args")

    # Add positional arguments
    parser.add_argument("data_dir")
    parser.add_argument("samplesheet")

    # Parse the arguments
    args = parser.parse_args()

    # Print the arguments
    print("data_dir: ", args.data_dir)
    print("samplesheet: ", args.samplesheet)

    samplesheet = pd.read_csv(args.samplesheet, header=0)

    dfs = []
    for index, row in samplesheet.iterrows():
        # Resolve the file relative to data_dir: only the basename from the
        # samplesheet is trusted, any recorded path prefix is discarded.
        file_path = os.path.basename(row['file'])
        file_path = os.path.join(args.data_dir, file_path)
        print(f"Loading {file_path}")

        # Read the TSV file into a dataframe
        df = pd.read_csv(file_path, sep="\t", header=0)

        # Get metadata
        subject_id = row['subject_id']
        timepoint = row['timepoint']
        origin = row['origin']

        # Add patient and sample columns
        df['patient'] = f"{subject_id}:{timepoint}_{origin}"
        df['sample'] = row['sample']

        # Select relevant columns
        df = df[['aminoAcid', 'vGeneName', 'jGeneName', 'patient', 'count (templates/reads)', 'sample']]
        dfs.append(df)

    # Concatenate all the dataframes into one
    df_combined = pd.concat(dfs)

    # Rename columns as required
    df_combined = df_combined.rename(columns={
        'aminoAcid': 'CDR3b',
        'vGeneName': 'TRBV',
        'jGeneName': 'TRBJ',
        'count (templates/reads)': 'counts'
    })
    # Drop rows without a CDR3 amino-acid sequence
    df_combined = df_combined[df_combined['CDR3b'].notna()]

    df_combined.to_csv("concatenated_cdr3.txt", sep="\t", index=False, header=True)


if __name__ == "__main__":
    main()

bin/prep_gliph2_tcr.py

Lines changed: 0 additions & 69 deletions
This file was deleted.
Lines changed: 28 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -14,36 +14,6 @@
1414
import numpy as np
1515
import csv
1616

17-
# initialize parser
18-
parser = argparse.ArgumentParser(description='Calculate clonality of a TCR repertoire')
19-
20-
# add arguments
21-
parser.add_argument('-s', '--sample_meta',
22-
metavar='sample_meta',
23-
type=str,
24-
help='sample metadata passed in through samples CSV file')
25-
parser.add_argument('-c', '--count_table',
26-
metavar='count_table',
27-
type=argparse.FileType('r'),
28-
help='counts file in TSV format')
29-
# parser.add_argument('-d', '--data_dir',
30-
# metavar='data_dir',
31-
# type=str,
32-
# help='path to data directory')
33-
34-
args = parser.parse_args()
35-
36-
## convert metadata to list
37-
s = args.sample_meta
38-
sample_meta = args.sample_meta[1:-1].split(', ')
39-
# print('sample_meta looks like this: ' + str(sample_meta))
40-
41-
# Read in the counts file
42-
counts = pd.read_csv(args.count_table, sep='\t', header=0)
43-
counts = counts.rename(columns={'count (templates/reads)': 'read_count', 'frequencyCount (%)': 'frequency'})
44-
# print('counts columns: \n')
45-
# print(counts.columns)
46-
4717
def calc_sample_stats(sample_meta, counts):
4818
"""Calculate sample level statistics of TCR repertoire."""
4919

@@ -145,4 +115,31 @@ def calc_sample_stats(sample_meta, counts):
145115
# with open('gene_usage_' + str(metadata[1] + '_' + str(metadata[2] + '_' + str(metadata[3]))) + '.pkl', 'wb') as f:
146116
# pickle.dump(gene_usage, f)
147117

148-
calc_sample_stats(sample_meta, counts)
118+
def main():
    """CLI entry point: parse arguments, load the counts table, and run
    the sample-level statistics calculation.
    """
    # initialize parser
    parser = argparse.ArgumentParser(description='Calculate clonality of a TCR repertoire')

    # add arguments
    parser.add_argument('-s', '--sample_meta',
                        metavar='sample_meta',
                        type=str,
                        help='sample metadata passed in through samples CSV file')
    parser.add_argument('-c', '--count_table',
                        metavar='count_table',
                        type=argparse.FileType('r'),
                        help='counts file in TSV format')

    args = parser.parse_args()

    ## convert metadata to list: the value arrives as a stringified list
    ## like "[a, b, c]" — strip the enclosing brackets and split on ", ".
    # NOTE(review): this breaks if an element itself contains ", " — confirm
    # the upstream format before relying on it.
    sample_meta = args.sample_meta[1:-1].split(', ')

    # Read in the counts file and normalize Adaptive column names
    counts = pd.read_csv(args.count_table, sep='\t', header=0)
    counts = counts.rename(columns={'count (templates/reads)': 'read_count', 'frequencyCount (%)': 'frequency'})

    calc_sample_stats(sample_meta, counts)


if __name__ == "__main__":
    main()

bin/samplesheet.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#!/usr/bin/env python
2+
3+
import argparse
4+
import pandas as pd
5+
6+
# do any processing of the samplesheet here
def samplesheet(samplesheet, data_dir):
    """Re-encode the samplesheet as UTF-8 and write summary statistics.

    Parameters
    ----------
    samplesheet : str
        Path to the sample metadata CSV file.
    data_dir : str
        Path to the data directory. Currently unused; kept so the CLI
        wrapper's call signature stays stable.

    Side effects: writes 'samplesheet_utf8.csv' and 'samplesheet_stats.csv'
    to the current working directory and prints the first rows.
    """
    ss = pd.read_csv(samplesheet, sep=',')
    # Re-emit with a UTF-8 BOM so downstream tools detect the encoding.
    ss.to_csv('samplesheet_utf8.csv', index=False, encoding='utf-8-sig')

    # describe() indexes its rows by statistic name (count/mean/std/...);
    # keep the index, otherwise the stats rows are written unlabelled.
    stats = ss.describe()
    stats.to_csv('samplesheet_stats.csv', index=True, encoding='utf-8-sig')

    print(ss.head())
16+
def main():
    """CLI entry point: parse the command line and process the samplesheet."""
    arg_parser = argparse.ArgumentParser()

    arg_parser.add_argument('-s', '--samplesheet',
                            metavar='samplesheet',
                            type=str,
                            help='sample metadata passed in through samples CSV file')
    arg_parser.add_argument('-d', '--data_dir',
                            metavar='data_dir',
                            type=str,
                            help='path to data directory')

    cli_args = arg_parser.parse_args()

    # Delegate all actual work to the processing function above.
    samplesheet(cli_args.samplesheet, cli_args.data_dir)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)