Skip to content

Commit b7ce179

Browse files
Applied black format (#49)
1 parent 70e4833 commit b7ce179

32 files changed

+5826
-3725
lines changed

datasources/cancergenecensus/diff_versions.py

Lines changed: 46 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -3,37 +3,58 @@
33

44

55
def get_set_difference(case_series, comparison_series, column_name):
6-
new_members = case_series.difference(comparison_series)
7-
return pandas.Series(new_members, name=column_name)
6+
new_members = case_series.difference(comparison_series)
7+
return pandas.Series(new_members, name=column_name)
88

99

1010
def read_file(file, column_name):
11-
dataframe = pandas.read_csv(file, sep='\t', usecols=[column_name])
12-
return dataframe.set_index(column_name).index
11+
dataframe = pandas.read_csv(file, sep="\t", usecols=[column_name])
12+
return dataframe.set_index(column_name).index
1313

1414

1515
def write_file(dataframe, output_name):
16-
dataframe.to_csv(output_name, sep='\t', index=False)
16+
dataframe.to_csv(output_name, sep="\t", index=False)
1717

1818

1919
if __name__ == "__main__":
20-
description = "Identify genes added and removed between versions of Cancer Gene Census"
21-
arg_parser = argparse.ArgumentParser(prog='diff versions', description=description)
22-
arg_parser.add_argument('--old_version', '-o', help='input file, old version of datasource', required=True)
23-
arg_parser.add_argument('--new_version', '-n', help='input file, new version of datasource', required=True)
24-
arg_parser.add_argument('--gene_column_name', '-g', help='column which contains gene names', default="Gene Symbol")
25-
args = arg_parser.parse_args()
26-
27-
old = read_file(args.old_version, args.gene_column_name)
28-
new = read_file(args.new_version, args.gene_column_name)
29-
30-
removals = get_set_difference(old, new, args.gene_column_name)
31-
additions = get_set_difference(new, old, args.gene_column_name)
32-
33-
print(f"{len(removals)} genes have been removed between {args.old_version} and {args.new_version}")
34-
print(f"{', '.join(removals.tolist())}")
35-
print('')
36-
37-
print(f"{len(additions)} new genes appear in {args.new_version} that were not present in {args.old_version}")
38-
print(f"{', '.join(additions.tolist())}")
39-
print('')
20+
description = (
21+
"Identify genes added and removed between versions of Cancer Gene Census"
22+
)
23+
arg_parser = argparse.ArgumentParser(prog="diff versions", description=description)
24+
arg_parser.add_argument(
25+
"--old_version",
26+
"-o",
27+
help="input file, old version of datasource",
28+
required=True,
29+
)
30+
arg_parser.add_argument(
31+
"--new_version",
32+
"-n",
33+
help="input file, new version of datasource",
34+
required=True,
35+
)
36+
arg_parser.add_argument(
37+
"--gene_column_name",
38+
"-g",
39+
help="column which contains gene names",
40+
default="Gene Symbol",
41+
)
42+
args = arg_parser.parse_args()
43+
44+
old = read_file(args.old_version, args.gene_column_name)
45+
new = read_file(args.new_version, args.gene_column_name)
46+
47+
removals = get_set_difference(old, new, args.gene_column_name)
48+
additions = get_set_difference(new, old, args.gene_column_name)
49+
50+
print(
51+
f"{len(removals)} genes have been removed between {args.old_version} and {args.new_version}"
52+
)
53+
print(f"{', '.join(removals.tolist())}")
54+
print("")
55+
56+
print(
57+
f"{len(additions)} new genes appear in {args.new_version} that were not present in {args.old_version}"
58+
)
59+
print(f"{', '.join(additions.tolist())}")
60+
print("")

datasources/cancergenecensus/extract_genes.py

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,19 +3,26 @@
33

44

55
def read_file(file, column_name):
6-
return pandas.read_csv(file, sep='\t', usecols=[column_name])
6+
return pandas.read_csv(file, sep="\t", usecols=[column_name])
77

88

99
def write_file(dataframe, output_name):
10-
dataframe.to_csv(output_name, sep='\t', index=False)
10+
dataframe.to_csv(output_name, sep="\t", index=False)
1111

1212

1313
if __name__ == "__main__":
14-
arg_parser = argparse.ArgumentParser(prog='extract genes', description='Extract genes from tab separated values.')
15-
arg_parser.add_argument('--input', '-i', help='input file', required=True)
16-
arg_parser.add_argument('--output', '-o', help='output file', required=True)
17-
arg_parser.add_argument('--gene_column_name', '-g', help='column which contains gene names', default="Gene Symbol")
18-
args = arg_parser.parse_args()
14+
arg_parser = argparse.ArgumentParser(
15+
prog="extract genes", description="Extract genes from tab separated values."
16+
)
17+
arg_parser.add_argument("--input", "-i", help="input file", required=True)
18+
arg_parser.add_argument("--output", "-o", help="output file", required=True)
19+
arg_parser.add_argument(
20+
"--gene_column_name",
21+
"-g",
22+
help="column which contains gene names",
23+
default="Gene Symbol",
24+
)
25+
args = arg_parser.parse_args()
1926

20-
df = read_file(args.input, args.gene_column_name)
21-
df.to_csv(args.output, sep='\t', index=False)
27+
df = read_file(args.input, args.gene_column_name)
28+
df.to_csv(args.output, sep="\t", index=False)
Lines changed: 21 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -1,21 +1,27 @@
11
import pandas as pd
2-
df2_cols = ['Gene', 'Residue', 'p-value', 'Class']
3-
df5_cols = ['Gene', 'Reference amino acid', 'Variant amino acid', 'Amino_Acid_Position']
4-
df2 = pd.read_csv('3d_hotspots_T2.txt', sep='\t', usecols = df2_cols)
5-
df5 = pd.read_csv('3d_hotspots_T5.txt', sep = '\t', usecols = df5_cols)
62

7-
df5['Residue'] = df5.loc[:,'Reference amino acid'] + df5.loc[:,'Amino_Acid_Position'].astype(str)
8-
df5['alteration'] = 'p.' + df5.loc[:,'Reference amino acid'] + df5.loc[:,'Amino_Acid_Position'].astype(str) + df5.loc[:,'Variant amino acid']
9-
df5 = df5.drop(['Reference amino acid', 'Variant amino acid', 'Amino_Acid_Position'], axis = 1)
3+
df2_cols = ["Gene", "Residue", "p-value", "Class"]
4+
df5_cols = ["Gene", "Reference amino acid", "Variant amino acid", "Amino_Acid_Position"]
5+
df2 = pd.read_csv("3d_hotspots_T2.txt", sep="\t", usecols=df2_cols)
6+
df5 = pd.read_csv("3d_hotspots_T5.txt", sep="\t", usecols=df5_cols)
107

11-
df = pd.merge(df2, df5, on=['Gene', 'Residue'], how='left')
8+
df5["Residue"] = df5.loc[:, "Reference amino acid"] + df5.loc[
9+
:, "Amino_Acid_Position"
10+
].astype(str)
11+
df5["alteration"] = (
12+
"p."
13+
+ df5.loc[:, "Reference amino acid"]
14+
+ df5.loc[:, "Amino_Acid_Position"].astype(str)
15+
+ df5.loc[:, "Variant amino acid"]
16+
)
17+
df5 = df5.drop(
18+
["Reference amino acid", "Variant amino acid", "Amino_Acid_Position"], axis=1
19+
)
1220

13-
class_map = {
14-
'Cluster-exclusive': 1,
15-
'Hotspot-linked': 2,
16-
'Hotspot': 3
17-
}
21+
df = pd.merge(df2, df5, on=["Gene", "Residue"], how="left")
1822

19-
df['cancerhotspots3D_bin'] = df['Class'].map(class_map)
23+
class_map = {"Cluster-exclusive": 1, "Hotspot-linked": 2, "Hotspot": 3}
2024

21-
df.to_csv('hotspots3d.txt', sep = '\t', index = False)
25+
df["cancerhotspots3D_bin"] = df["Class"].map(class_map)
26+
27+
df.to_csv("hotspots3d.txt", sep="\t", index=False)

datasources/clinvar/prepare_clinvar.py

Lines changed: 23 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,14 @@
22
import argparse
33

44
COLUMNS = [
5-
'GeneSymbol',
6-
'Chromosome',
7-
'Start',
8-
'Stop',
9-
'ReferenceAllele',
10-
'AlternateAllele',
11-
'ClinicalSignificance',
12-
'ClinSigSimple'
5+
"GeneSymbol",
6+
"Chromosome",
7+
"Start",
8+
"Stop",
9+
"ReferenceAllele",
10+
"AlternateAllele",
11+
"ClinicalSignificance",
12+
"ClinSigSimple",
1313
]
1414

1515

@@ -18,24 +18,32 @@ def create_output_filename(date):
1818

1919

2020
def read_file(file, relevant_columns):
21-
return pandas.read_csv(file, sep='\t', usecols=relevant_columns, low_memory=False)
21+
return pandas.read_csv(file, sep="\t", usecols=relevant_columns, low_memory=False)
2222

2323

2424
def write_file(dataframe, date):
2525
output_name = create_output_filename(date)
26-
dataframe.to_csv(output_name, sep='\t', index=False)
26+
dataframe.to_csv(output_name, sep="\t", index=False)
2727

2828

2929
if __name__ == "__main__":
30-
parser = argparse.ArgumentParser(prog='prepare clinvar', description='Prepare ClinVar for use with MOAlmanac')
31-
parser.add_argument('--input', '-i', help='input file, CosmicMutantExport.tsv', required=True)
32-
parser.add_argument('--date', '-d', help='date of access; e.g. 2023-03-09', required=True)
30+
parser = argparse.ArgumentParser(
31+
prog="prepare clinvar", description="Prepare ClinVar for use with MOAlmanac"
32+
)
33+
parser.add_argument(
34+
"--input", "-i", help="input file, CosmicMutantExport.tsv", required=True
35+
)
36+
parser.add_argument(
37+
"--date", "-d", help="date of access; e.g. 2023-03-09", required=True
38+
)
3339
args = parser.parse_args()
3440

3541
df = read_file(args.input, COLUMNS)
3642
df.drop_duplicates(inplace=True)
3743
write_file(df, args.date)
3844

39-
gene_count = df['GeneSymbol'].drop_duplicates().shape[0]
45+
gene_count = df["GeneSymbol"].drop_duplicates().shape[0]
4046
total_count = df.shape[0]
41-
print(f"As of {args.date}, ClinVar contains {gene_count} genes and {total_count} variants.")
47+
print(
48+
f"As of {args.date}, ClinVar contains {gene_count} genes and {total_count} variants."
49+
)

datasources/cosmic/prepare_cosmic.py

Lines changed: 36 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3,30 +3,48 @@
33

44

55
def create_output_filename(version):
6-
return f"CosmicMutantExport_{version}.lite.txt"
6+
return f"CosmicMutantExport_{version}.lite.txt"
77

88

99
def read_file(file, relevant_columns):
10-
return pandas.read_csv(file, sep='\t', usecols=relevant_columns)
10+
return pandas.read_csv(file, sep="\t", usecols=relevant_columns)
1111

1212

1313
def write_file(dataframe, version):
14-
output_name = f"CosmicMutantExport_{version}.lite.txt"
15-
dataframe.to_csv(output_name, sep='\t', index=False)
14+
output_name = f"CosmicMutantExport_{version}.lite.txt"
15+
dataframe.to_csv(output_name, sep="\t", index=False)
1616

1717

1818
if __name__ == "__main__":
19-
parser = argparse.ArgumentParser(prog='prepare COSMIC', description='Prepare COSMIC for use with MOAlmanac')
20-
parser.add_argument('--input', '-i', help='input file, CosmicMutantExport.tsv', required=True)
21-
parser.add_argument('--version', '-v', help='input file version; e.g., v97', required=True)
22-
parser.add_argument('--gene_column_name', '-g', help='column that contains gene names', default="Gene name")
23-
parser.add_argument('--protein_column_name', '-p', help='column that contains protein changes', default="Mutation AA")
24-
args = parser.parse_args()
25-
26-
df = read_file(args.input, [args.gene_column_name, args.protein_column_name])
27-
df.drop_duplicates(inplace=True)
28-
write_file(df, args.version)
29-
30-
gene_count = df[args.gene_column_name].drop_duplicates().shape[0]
31-
total_count = df.shape[0]
32-
print(f"COSMIC {args.version} contains {gene_count} genes and {total_count} protein changes")
19+
parser = argparse.ArgumentParser(
20+
prog="prepare COSMIC", description="Prepare COSMIC for use with MOAlmanac"
21+
)
22+
parser.add_argument(
23+
"--input", "-i", help="input file, CosmicMutantExport.tsv", required=True
24+
)
25+
parser.add_argument(
26+
"--version", "-v", help="input file version; e.g., v97", required=True
27+
)
28+
parser.add_argument(
29+
"--gene_column_name",
30+
"-g",
31+
help="column that contains gene names",
32+
default="Gene name",
33+
)
34+
parser.add_argument(
35+
"--protein_column_name",
36+
"-p",
37+
help="column that contains protein changes",
38+
default="Mutation AA",
39+
)
40+
args = parser.parse_args()
41+
42+
df = read_file(args.input, [args.gene_column_name, args.protein_column_name])
43+
df.drop_duplicates(inplace=True)
44+
write_file(df, args.version)
45+
46+
gene_count = df[args.gene_column_name].drop_duplicates().shape[0]
47+
total_count = df.shape[0]
48+
print(
49+
f"COSMIC {args.version} contains {gene_count} genes and {total_count} protein changes"
50+
)

datasources/exac/expand_exac.py

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,21 +2,32 @@
22
import argparse
33

44
parser = argparse.ArgumentParser()
5-
parser.add_argument('--exac', help='Tab delimited ExAC', required=True)
5+
parser.add_argument("--exac", help="Tab delimited ExAC", required=True)
66
args = parser.parse_args()
77

8-
exac = pd.read_csv(args.exac, sep='\t', low_memory=False)
8+
exac = pd.read_csv(args.exac, sep="\t", low_memory=False)
99

10-
cols = ['ALT', 'AF', 'AC', 'AC_AFR', 'AC_AMR', 'AC_EAS', 'AC_FIN', 'AC_NFE', 'AC_OTH', 'AC_SAS']
10+
cols = [
11+
"ALT",
12+
"AF",
13+
"AC",
14+
"AC_AFR",
15+
"AC_AMR",
16+
"AC_EAS",
17+
"AC_FIN",
18+
"AC_NFE",
19+
"AC_OTH",
20+
"AC_SAS",
21+
]
1122
fillcols = list(set(exac.columns.tolist()) - set(cols))
1223

13-
idx = exac['ALT'].astype(str).str.contains(',')
24+
idx = exac["ALT"].astype(str).str.contains(",")
1425
idx_multiallele = exac[idx].index
1526
idx_singleallele = exac[~idx].index
1627

1728
expanded_list = []
1829
for i in idx_multiallele:
19-
expand = exac.loc[i, cols].str.split(',', expand=True).T
30+
expand = exac.loc[i, cols].str.split(",", expand=True).T
2031
fill = pd.DataFrame(exac.loc[i, fillcols]).T.reset_index(drop=True)
2132
for j in expand.index[1:]:
2233
fill = fill.append(exac.loc[i, fillcols], ignore_index=True)
@@ -26,6 +37,6 @@
2637

2738
df = pd.concat([exac.loc[idx_singleallele, :], expanded_exac], ignore_index=True)
2839

29-
mincols = ['CHROM', 'POS', 'REF', 'ALT', 'QUAL', 'AF', 'AC', 'AN']
30-
df.to_csv('exac.expanded.r1.txt', sep='\t', index=False)
31-
df.loc[:, mincols].to_csv('exac.expanded.min.r1.txt', sep='\t', index=False)
40+
mincols = ["CHROM", "POS", "REF", "ALT", "QUAL", "AF", "AC", "AN"]
41+
df.to_csv("exac.expanded.r1.txt", sep="\t", index=False)
42+
df.loc[:, mincols].to_csv("exac.expanded.min.r1.txt", sep="\t", index=False)

0 commit comments

Comments
 (0)