Skip to content

Commit cb57c8a

Browse files
committed
Add main to scripts
1 parent 9992bc9 commit cb57c8a

File tree

4 files changed

+138
-126
lines changed

4 files changed

+138
-126
lines changed

bin/compare_clonal_publicity.py

Lines changed: 44 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -6,42 +6,49 @@
66
Output: .tsv of TCR sharing across samples, .tsv of sample-sample_id mapping
77
"""
88
import argparse

import pandas as pd


def compute_sharing(df):
    """Compute TCR (CDR3b) publicity/sharing across samples.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain columns 'CDR3b' and 'sample'. The input frame is not
        mutated.

    Returns
    -------
    tuple of pandas.DataFrame
        final_df: columns [CDR3b, total_samples, samples_present], sorted by
        total_samples descending; samples_present is a comma-separated list
        of integer sample ids.
        sample_map_df: columns [patient, sample_id] mapping each original
        sample label to its 1-based integer id.
    """
    # Step 1: Map each distinct sample label to a 1-based integer id,
    # in first-seen order.
    sample_mapping = {sample: i + 1 for i, sample in enumerate(df['sample'].unique())}
    df = df.copy()  # don't mutate the caller's frame
    df['sample_id'] = df['sample'].map(sample_mapping)

    # Step 2: Group by CDR3b and aggregate the sorted, de-duplicated sample ids.
    grouped = (
        df.groupby('CDR3b')['sample_id']
        .apply(lambda x: sorted(set(x)))  # remove duplicates if any
        .reset_index()
    )

    # Step 3: Add comma-separated list and total count.
    grouped['samples_present'] = grouped['sample_id'].apply(lambda x: ",".join(map(str, x)))
    grouped['total_samples'] = grouped['sample_id'].apply(len)

    # Step 4: Final output — drop the raw list column, most-shared first.
    final_df = grouped[['CDR3b', 'total_samples', 'samples_present']]
    final_df = final_df.sort_values(by='total_samples', ascending=False)

    # Build the sample -> sample_id mapping table for export.
    sample_map_df = pd.DataFrame.from_dict(
        sample_mapping, orient='index', columns=['sample_id']
    ).reset_index()
    sample_map_df.columns = ['patient', 'sample_id']
    return final_df, sample_map_df


def main():
    """CLI entry point: read a TSV of [CDR3b, sample], write sharing tables.

    Writes 'cdr3_sharing.tsv' (TCR sharing across samples) and
    'sample_mapping.tsv' (sample -> sample_id mapping) to the current
    directory.
    """
    # Initialize the parser
    parser = argparse.ArgumentParser(description="Take positional args")

    # Add positional arguments
    parser.add_argument(
        "cdr_df",
        type=str,
        help="Input file name, expected to be in TSV format with columns [CDR3b, sample]",
    )
    args = parser.parse_args()

    # Load data
    df = pd.read_csv(args.cdr_df, sep="\t")

    final_df, sample_map_df = compute_sharing(df)

    # Export both outputs
    final_df.to_csv("cdr3_sharing.tsv", sep="\t", index=False)
    sample_map_df.to_csv("sample_mapping.tsv", sep="\t", index=False)


if __name__ == "__main__":
    main()

bin/compare_concatenate.py

Lines changed: 46 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -8,59 +8,62 @@
88

99
# Import modules
import argparse
import os

import pandas as pd


def combine_frames(dfs):
    """Concatenate per-sample frames, rename columns, and drop empty CDR3b rows.

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        Frames with at least columns
        [aminoAcid, vGeneName, jGeneName, count (templates/reads)].

    Returns
    -------
    pandas.DataFrame
        Combined frame with the raw columns renamed to
        [CDR3b, TRBV, TRBJ, counts] and rows lacking a CDR3b removed.
    """
    # Concatenate all the dataframes into one
    df_combined = pd.concat(dfs)

    # Rename columns as required
    df_combined = df_combined.rename(columns={
        'aminoAcid': 'CDR3b',
        'vGeneName': 'TRBV',
        'jGeneName': 'TRBJ',
        'count (templates/reads)': 'counts'
    })
    # Drop rows with no CDR3b sequence.
    return df_combined[df_combined['CDR3b'].notna()]


def main():
    """CLI entry point: load per-sample TSVs listed in a samplesheet and
    write a single concatenated table to 'concatenated_cdr3.txt'."""
    # Initialize the parser
    parser = argparse.ArgumentParser(description="Take positional args")

    # Add positional arguments
    parser.add_argument("data_dir")
    parser.add_argument("samplesheet")

    # Parse the arguments
    args = parser.parse_args()

    # Print the arguments
    print("data_dir: ", args.data_dir)
    print("samplesheet: ", args.samplesheet)

    samplesheet = pd.read_csv(args.samplesheet, header=0)

    dfs = []
    for index, row in samplesheet.iterrows():
        # Resolve the sample file relative to data_dir, keeping only the basename.
        file_path = os.path.basename(row['file'])
        file_path = os.path.join(args.data_dir, file_path)
        print(f"Loading {file_path}")

        # Read the TSV file into a dataframe
        df = pd.read_csv(file_path, sep="\t", header=0)

        # Get metadata
        subject_id = row['subject_id']
        timepoint = row['timepoint']
        origin = row['origin']

        # Add patient column
        df['patient'] = f"{subject_id}:{timepoint}_{origin}"
        df['sample'] = row['sample']

        # Select relevant columns
        df = df[['aminoAcid', 'vGeneName', 'jGeneName', 'patient', 'count (templates/reads)', 'sample']]
        dfs.append(df)

    df_combined = combine_frames(dfs)

    # plain string literal — the original f-string had no placeholders
    df_combined.to_csv("concatenated_cdr3.txt", sep="\t", index=False, header=True)


if __name__ == "__main__":
    main()

bin/sample_calc.py

Lines changed: 28 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -13,33 +13,6 @@
1313
from scipy.stats import entropy
1414
import numpy as np
1515
import csv
16-
import os
17-
18-
# initialize parser
19-
parser = argparse.ArgumentParser(description='Calculate clonality of a TCR repertoire')
20-
21-
# add arguments
22-
parser.add_argument('-s', '--sample_meta',
23-
metavar='sample_meta',
24-
type=str,
25-
help='sample metadata passed in through samples CSV file')
26-
parser.add_argument('-c', '--count_table',
27-
metavar='count_table',
28-
type=argparse.FileType('r'),
29-
help='counts file in TSV format')
30-
31-
args = parser.parse_args()
32-
33-
## convert metadata to list
34-
s = args.sample_meta
35-
sample_meta = args.sample_meta[1:-1].split(', ')
36-
# print('sample_meta looks like this: ' + str(sample_meta))
37-
38-
# Read in the counts file
39-
counts = pd.read_csv(args.count_table, sep='\t', header=0)
40-
counts = counts.rename(columns={'count (templates/reads)': 'read_count', 'frequencyCount (%)': 'frequency'})
41-
# print('counts columns: \n')
42-
# print(counts.columns)
4316

4417
def calc_sample_stats(sample_meta, counts):
4518
"""Calculate sample level statistics of TCR repertoire."""
@@ -142,4 +115,31 @@ def calc_sample_stats(sample_meta, counts):
142115
# with open('gene_usage_' + str(metadata[1] + '_' + str(metadata[2] + '_' + str(metadata[3]))) + '.pkl', 'wb') as f:
143116
# pickle.dump(gene_usage, f)
144117

145-
calc_sample_stats(sample_meta, counts)
118+
def main():
    """CLI entry point: parse args, load the counts table, and run the
    sample-level repertoire statistics via calc_sample_stats()."""
    # initialize parser
    parser = argparse.ArgumentParser(description='Calculate clonality of a TCR repertoire')

    # add arguments
    parser.add_argument('-s', '--sample_meta',
                        metavar='sample_meta',
                        type=str,
                        help='sample metadata passed in through samples CSV file')
    parser.add_argument('-c', '--count_table',
                        metavar='count_table',
                        type=argparse.FileType('r'),
                        help='counts file in TSV format')

    args = parser.parse_args()

    ## convert metadata to list, e.g. "[a, b, c]" -> ['a', 'b', 'c']
    # NOTE(review): assumes the value is wrapped in single bracket characters
    # and uses ", " separators — confirm against the pipeline that builds it.
    # (The unused alias `s = args.sample_meta` from the original was removed.)
    sample_meta = args.sample_meta[1:-1].split(', ')

    # Read in the counts file and normalize the column names used downstream.
    counts = pd.read_csv(args.count_table, sep='\t', header=0)
    counts = counts.rename(columns={'count (templates/reads)': 'read_count', 'frequencyCount (%)': 'frequency'})

    calc_sample_stats(sample_meta, counts)


if __name__ == "__main__":
    main()

bin/samplesheet.py

Lines changed: 20 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,9 @@
11
#!/usr/bin/env python
22

33
import argparse
4-
import os
54
import pandas as pd
65

7-
# initialize parser
8-
parser = argparse.ArgumentParser()
9-
parser.add_argument('-s', '--samplesheet',
10-
metavar='samplesheet',
11-
type=str,
12-
help='sample metadata passed in through samples CSV file')
13-
14-
parser.add_argument('-d', '--data_dir',
15-
metavar='data_dir',
16-
type=str,
17-
help='path to data directory')
18-
19-
args = parser.parse_args()
20-
21-
#do any processing of the samplesheet here
6+
# do any processing of the samplesheet here
227
def samplesheet(samplesheet, data_dir):
238
ss = pd.read_csv(samplesheet, sep=',')
249
ss.to_csv('samplesheet_utf8.csv', index=False, encoding='utf-8-sig')
@@ -28,5 +13,22 @@ def samplesheet(samplesheet, data_dir):
2813

2914
print(ss.head())
3015

31-
samplesheet(args.samplesheet, args.data_dir)
32-
16+
def main():
    """Collect CLI options and hand them to the samplesheet processor."""
    # Build the option parser for the two expected inputs.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '-s', '--samplesheet',
        metavar='samplesheet',
        type=str,
        help='sample metadata passed in through samples CSV file',
    )
    arg_parser.add_argument(
        '-d', '--data_dir',
        metavar='data_dir',
        type=str,
        help='path to data directory',
    )

    parsed = arg_parser.parse_args()

    # Delegate the actual work to the module-level samplesheet() helper.
    samplesheet(parsed.samplesheet, parsed.data_dir)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)