Skip to content

Commit ff9c0f1

Browse files
committed
add variant counts script
1 parent 670c3a6 commit ff9c0f1

File tree

1 file changed

+51
-0
lines changed

1 file changed

+51
-0
lines changed

variant_counts.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# Simple script to calculate the number of variants per year
2+
3+
import sys
4+
import pandas as pd
5+
6+
7+
def get_total(s):
    """Convert a semicolon-separated list of integers into the sum of that list.

    Helper function for multi-target papers.

    Returns the integer if only one value is present.

    Returns 0 if the value of s is None or NA.

    Values that are already numeric (not strings) are converted with int()
    directly, so the function also works on columns that pandas parsed as
    numbers rather than text. Empty fragments (for example from a trailing
    semicolon or an empty string) are ignored.

    Raises ValueError if a non-empty fragment is not a valid integer.
    """
    if s is None or pd.isna(s):
        return 0
    # A non-string value cannot be searched for ";" (that raises TypeError),
    # so treat it as a single count.
    if not isinstance(s, str):
        return int(s)
    # str.split returns the whole string when no ";" is present, so this also
    # covers the single-value case.
    return sum(int(x) for x in s.split(";") if x.strip())
22+
23+
24+
if __name__ == "__main__":
    # Read the tab-separated table; the path comes from the first command-line
    # argument and falls back to "maverefs.tsv" when none is given.
    infile = sys.argv[1] if len(sys.argv) > 1 else "maverefs.tsv"
    df = pd.read_csv(infile, sep="\t")

    # Calculate and store the number of variants per paper, keeping the
    # maximum of the nt and aa variant counts if both are specified.
    # The column is built in a single pass instead of writing into df cell by
    # cell while iterating over it with iterrows().
    df["Variants (max)"] = [
        max(get_total(nt), get_total(aa))
        for nt, aa in zip(df["Variants (nt)"], df["Variants (aa)"])
    ]

    # Calculate the sum of variants for each year.
    result = df.groupby("Year")["Variants (max)"].sum()
    # Convert years to ints instead of floats.
    result.index = [int(x) for x in result.index]
    result.index.name = "year"
    result.name = "variants"
    result = pd.DataFrame(result)
    result["cumulative_variants"] = result["variants"].cumsum()

    # Write the result to stdout as CSV.
    result.to_csv(sys.stdout)

0 commit comments

Comments
 (0)