Skip to content

Commit fa2f0d2

Browse files
committed
added tool ingest
1 parent 73cb236 commit fa2f0d2

File tree

1 file changed

+119
-0
lines changed

1 file changed

+119
-0
lines changed

script/ingestion/tools.py

Lines changed: 119 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,119 @@
1+
import pandas as pd
2+
import yaml
3+
from glob import glob
4+
from uuid import uuid5, NAMESPACE_URL
5+
from uuid import uuid5, NAMESPACE_URL
6+
from s3_update import backup_file
7+
from ingest_common import connection
8+
import io
9+
import csv
10+
# Load the current publications table; the first TSV column becomes the index.
publication_df = pd.read_csv(
    'https://cfde-drc.s3.amazonaws.com/database/files/current_publications.tsv',
    sep="\t",
    index_col=0,
)

# Map each DOI to the index label of its publication row, so that tools can be
# linked to publications by DOI further down. Entries whose DOI is not a
# string (e.g. missing values) are skipped; if the same DOI appears on more
# than one row, the last row wins.
publication_mapper = {
    doi: pub_id
    for pub_id, doi in publication_df['doi'].items()
    if isinstance(doi, str)
}
16+
17+
# Build one record per tool page, keyed by a deterministic UUID derived from
# the tool's label, and tag the matching publication row with that tool id.
tools = {}
for filename in glob('../../src/pages/tools/*.md'):
    # Read the whole page, then release the file handle before processing.
    with open(filename, encoding="utf-8") as o:
        markdown = o.read()

    # Pages are laid out as "---", YAML front matter, "---", body. Split on
    # the first two delimiters only, so that a "---" inside the body (a
    # horizontal rule or a table separator row) cannot truncate the
    # description taken from the last piece.
    m = markdown.split("---", 2)
    row = yaml.safe_load(m[1])

    # Empty or non-mapping front matter has no label to key the tool on;
    # such pages are skipped just like pages without a "label" entry.
    if not isinstance(row, dict) or "label" not in row:
        continue

    uid = str(uuid5(NAMESPACE_URL, row['label']))
    val = {"description": m[-1].strip()}
    for k, v in row.items():
        if k == "doi":
            # The DOI is not stored on the tool record; it is used to write
            # this tool's id onto the publication row with the same DOI.
            if isinstance(v, str):
                doi = v.replace("https://doi.org/", "")
                if doi not in publication_mapper:
                    # Same exception type as the plain dict lookup would
                    # raise, but naming the offending page and DOI.
                    raise KeyError(
                        "%s: DOI %r not found in publications" % (filename, doi)
                    )
                publication_df.at[publication_mapper[doi], 'tool_id'] = uid
        elif k != "layout":
            # Every other front-matter key except "layout" is copied as-is.
            val[k] = v

    tools[uid] = val
38+
39+
# Turn the {tool id -> fields} mapping into a table with one row per tool.
# Fields that a tool does not define become empty strings.
tools_frame = pd.DataFrame.from_dict(tools, orient="index")
tools_df = tools_frame.fillna('')
tools_df.index.name = "id"

# Hand both tables to backup_file, tools first, then publications.
for frame, label in ((tools_df, "tools"), (publication_df, "publications")):
    backup_file(frame, label, quoting=False)
43+
44+
def _upsert_frame(cur, frame, table):
    """Upsert every row of *frame* into *table* through a staging table.

    The frame is serialised as tab-separated text, bulk-loaded with
    ``cur.copy_from`` into ``<table>_tmp`` (created empty with the same
    columns as *table*), merged into *table* with
    ``insert ... on conflict (id) do update``, and the staging table is
    dropped afterwards.

    Args:
        cur: Open database cursor providing ``execute`` and ``copy_from``.
        frame: DataFrame whose index and columns are named after the columns
            of *table*; the index is written as the first column.
        table: Name of the destination table. It is interpolated directly
            into the SQL text, so it must be a trusted identifier.
    """
    staging = '%s_tmp' % table
    cur.execute('''
      create table %s
      as table %s
      with no data;
    ''' % (staging, table))

    buf = io.StringIO()
    frame.to_csv(buf, header=True, quoting=csv.QUOTE_NONE, sep="\t")
    buf.seek(0)

    # Consume the header line: it supplies the column list, and the buffer is
    # left positioned at the first data row for copy_from.
    columns = next(buf).strip().split('\t')
    cur.copy_from(buf, staging,
      columns=columns,
      null='',
      sep='\t',
    )

    column_string = ", ".join(columns)
    set_string = ",\n".join(["%s = excluded.%s" % (c, c) for c in columns])
    cur.execute('''
      insert into %s (%s)
        select %s
        from %s
        on conflict (id)
          do update
          set %s
      ;
    ''' % (table, column_string, column_string, staging, set_string))
    cur.execute('drop table %s;' % staging)


cur = connection.cursor()
try:
    # Clear the tool_id of every publication before re-linking.
    cur.execute('''
      UPDATE publications
      SET tool_id=NULL;
    ''')

    # Delete all rows from the tools table (the table itself is kept).
    cur.execute('''
      DELETE FROM tools;
    ''')

    # Tools are loaded before publications, as in the original statement
    # order. NOTE(review): presumably publications.tool_id references
    # tools.id -- confirm against the schema before reordering.
    _upsert_frame(cur, tools_df, 'tools')
    _upsert_frame(cur, publication_df, 'publications')

    connection.commit()
except Exception:
    # Do not leave a half-applied transaction open on the shared connection.
    connection.rollback()
    raise
finally:
    cur.close()

print("Ingested Tools")

0 commit comments

Comments
 (0)