Skip to content

Commit caeaecf

Browse files
committed
added usecase script
1 parent 709d120 commit caeaecf

File tree

1 file changed

+121
-0
lines changed

1 file changed

+121
-0
lines changed

script/ingestion/usecase.py

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
import pandas as pd
2+
import yaml
3+
from glob import glob
4+
from uuid import uuid5, NAMESPACE_URL
5+
from uuid import uuid5, NAMESPACE_URL
6+
from s3_update import backup_file
7+
from ingest_common import connection
8+
import io
9+
import csv
10+
import json
11+
12+
# Fetch the current DCC listing; the first TSV column is used as the index.
dccs = pd.read_csv(
    'https://cfde-drc.s3.amazonaws.com/database/files/current_dccs.tsv',
    sep="\t",
    index_col=0,
    header=0,
)
# Lookup from each DCC's short label to its index value (its id).
dcc_mapper = {label: dcc_id for dcc_id, label in dccs['short_label'].items()}
17+
18+
def split_front_matter(markdown):
    """Split a markdown document into ``(front_matter, body)``.

    The document is expected to look like ``---<yaml>---<body>``. Only the
    first two ``---`` markers are treated as delimiters, so a body that itself
    contains ``---`` (horizontal rules, table separator rows) is kept whole.

    Raises:
        ValueError: if the document does not contain two ``---`` markers.
    """
    parts = markdown.split("---", 2)
    if len(parts) < 3:
        raise ValueError("expected front matter delimited by two '---' markers")
    return parts[1], parts[2].strip()


def unique_json_list(value):
    """Serialize a ``"; "``-separated string as a JSON list of unique items.

    Items keep their first-seen order so the output is identical from run to
    run. A missing or empty value yields ``''``.
    """
    if not value:
        return ''
    return json.dumps(list(dict.fromkeys(value.strip().split("; "))))


# data: use case rows keyed by a UUID derived from the title.
# dcc_usecase: link rows pairing a use case id with each source DCC id.
data = {}
dcc_usecase = []
# Sorted so rows are produced in the same order on every run.
for filename in sorted(glob('../../src/pages/usecase/*.md')):
    with open(filename, encoding="utf-8") as o:
        markdown = o.read()
    front_matter, description = split_front_matter(markdown)
    row = yaml.safe_load(front_matter)
    title = row['title']
    # uuid5 is deterministic, so the same title always maps to the same id.
    uid = str(uuid5(NAMESPACE_URL, title))
    data[uid] = {
        "title": title,
        "short_description": row.get("short_description"),
        "description": description,
        "tool_icon": row.get("tool_icon"),
        "tool_name": row.get("tool_name"),
        "inputs": unique_json_list(row.get("inputs")),
        "sources": unique_json_list(row.get("sources")),
        "link": row.get("link"),
        "image": row.get("image"),
        "tutorial": row.get("tutorial"),
        "featured": row.get("featured"),
        # Only the first listed creator DCC is recorded.
        "creator_dcc_id": dcc_mapper[row["creator_dcc"][0]],
    }
    # De-duplicate source DCCs while keeping their listed order.
    for dcc in dict.fromkeys(dcc_mapper[i] for i in row.get("source_dcc") or []):
        dcc_usecase.append({"usecase_id": uid, "dcc_id": dcc})
44+
45+
# One row per use case, indexed by its id; missing fields become ''.
usecase_df = pd.DataFrame.from_dict(data, orient="index").fillna('')
usecase_df.index.name = "id"
# Columns are named explicitly so the frame keeps its schema even when there
# are no link rows; an empty, column-less frame would otherwise serialize to
# an empty header line.
dcc_usecase_df = pd.DataFrame.from_records(
    dcc_usecase, columns=["usecase_id", "dcc_id"]
)

backup_file(usecase_df, "usecase", quoting=False)
# NOTE(review): the third positional argument is presumably `quoting`, as in
# the call above -- confirm against s3_update.backup_file.
backup_file(dcc_usecase_df, "dcc_usecase", False)
51+
52+
def copy_escape(value):
    """Render one cell for PostgreSQL's COPY text format.

    Backslash, tab, newline and carriage return are written as backslash
    escapes so multi-line text and JSON ``\\uXXXX`` sequences survive COPY
    intact instead of being read as delimiters or escape sequences.
    """
    text = str(value)
    # The backslash is doubled first so the escapes added after it are not
    # escaped a second time.
    for raw, escaped in (("\\", "\\\\"), ("\t", "\\t"), ("\n", "\\n"), ("\r", "\\r")):
        text = text.replace(raw, escaped)
    return text


# Replace the contents of `usecase`: clear the tables, stage the rows in a
# temporary copy of the table via COPY, then upsert from the staging table.
cur = connection.cursor()
try:
    # NOTE(review): dcc_usecase is emptied first, presumably because it
    # references usecase -- confirm against the schema.
    cur.execute('''
      DELETE FROM dcc_usecase;
    ''')

    cur.execute('''
      DELETE FROM usecase;
    ''')
    cur.execute('''
      create table usecase_tmp
      as table usecase
      with no data;
    ''')

    # The index (named "id") is the first field of every row.
    columns = [usecase_df.index.name, *usecase_df.columns]
    u_buf = io.StringIO()
    for record in usecase_df.itertuples(index=True, name=None):
        u_buf.write("\t".join(copy_escape(cell) for cell in record))
        u_buf.write("\n")
    u_buf.seek(0)
    # null='' makes empty cells load as NULL.
    cur.copy_from(u_buf, 'usecase_tmp',
      columns=columns,
      null='',
      sep='\t',
    )
    column_string = ", ".join(columns)
    set_string = ",\n".join(["%s = excluded.%s"%(i,i) for i in columns])
    # Column names come from the data frame built by this script, not from
    # user input, so interpolating them into the statement is safe here.
    cur.execute('''
      insert into usecase (%s)
        select %s
        from usecase_tmp
        on conflict (id)
          do update
          set %s
      ;
    '''%(column_string, column_string, set_string))
    cur.execute('drop table usecase_tmp;')
    connection.commit()
except Exception:
    # Do not leave the shared connection stuck in a failed transaction.
    connection.rollback()
    raise
finally:
    cur.close()
89+
90+
# Load the use case <-> DCC link rows: stage them in a temporary copy of
# `dcc_usecase` via COPY, then insert them, ignoring rows that already exist.
# With no link rows there is nothing to insert; an empty frame would also
# produce an unusable column list for COPY, so the load is skipped.
if not dcc_usecase_df.empty:
    cur = connection.cursor()
    try:
        cur.execute('''
          create table dcc_usecase_tmp
          as table dcc_usecase
          with no data;
        ''')

        columns = list(dcc_usecase_df.columns)
        d_buf = io.StringIO()
        # The header is omitted because the column list is passed to COPY
        # explicitly.
        dcc_usecase_df.to_csv(d_buf, header=False, index=False, sep="\t")
        d_buf.seek(0)
        cur.copy_from(d_buf, 'dcc_usecase_tmp',
          columns=columns,
          null='',
          sep='\t',
        )

        column_string = ", ".join(columns)

        cur.execute('''
          insert into dcc_usecase (%s)
            select %s
            from dcc_usecase_tmp
            on conflict
              do nothing
          ;
        '''%(column_string, column_string))
        cur.execute('drop table dcc_usecase_tmp;')
        connection.commit()
    except Exception:
        # Do not leave the shared connection stuck in a failed transaction.
        connection.rollback()
        raise
    finally:
        cur.close()
121+

0 commit comments

Comments
 (0)