Skip to content

Commit 83b8eba

Browse files
committed
updated ingestion scripts
1 parent 270a639 commit 83b8eba

File tree

2 files changed

+63
-1
lines changed

2 files changed

+63
-1
lines changed

script/ingestion/r03.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
import pandas as pd
2+
import yaml
3+
from glob import glob
4+
from uuid import uuid5, NAMESPACE_URL
5+
from uuid import uuid5, NAMESPACE_URL
6+
from s3_update import backup_file
7+
from ingest_common import connection
8+
import io
9+
import csv
10+
import json
11+
12+
# Build the r03 table from the markdown pages under src/pages/r03, write it to
# r03.tsv, back it up via backup_file, then upsert it into the r03 table.


def _split_front_matter(markdown):
    """Split a markdown page into its YAML front matter and its body.

    Returns a ``(front_matter, body)`` tuple with the body stripped of
    surrounding whitespace, or ``None`` when the text does not contain both
    an opening and a closing ``---`` delimiter.
    """
    # maxsplit=2 keeps any later "---" (e.g. a markdown horizontal rule)
    # inside the body instead of truncating the description at it.
    parts = markdown.split("---", 2)
    if len(parts) < 3:
        return None
    return parts[1], parts[2].strip()


data = {}
for filename in glob('../../src/pages/r03/*.md'):
    with open(filename) as o:
        markdown = o.read()
    sections = _split_front_matter(markdown)
    if sections is None:
        # Previously this surfaced as a bare IndexError with no file name.
        print("skipping %s: no front matter found" % filename)
        continue
    front_matter, description = sections
    row = yaml.safe_load(front_matter)
    # Empty front matter loads as None; only mappings with a grant number
    # are ingested.
    if not isinstance(row, dict) or "grant_num" not in row:
        continue
    # uuid5 needs a string name; YAML may parse an all-numeric grant number
    # as an int, so coerce it. String grant numbers yield the same id as before.
    uid = str(uuid5(NAMESPACE_URL, str(row['grant_num'])))
    data[uid] = {**row, "description": description}

r03_df = pd.DataFrame.from_dict(data, orient="index").fillna('')
r03_df.index.name = "id"
r03_df.to_csv('r03.tsv', sep="\t")
# Update S3
backup_file(r03_df, "r03")

## ingest

# Column order of the serialized rows: the index ("id") first, then the
# DataFrame columns, exactly as to_csv writes them.
columns = [str(r03_df.index.name)] + [str(c) for c in r03_df.columns]
column_string = ", ".join(columns)
set_string = ",\n".join(["%s = excluded.%s" % (i, i) for i in columns])

cur = connection.cursor()
try:
    # Stage the rows in an empty copy of r03 so they can be upserted in one
    # statement.
    cur.execute('''
    create table r03_tmp
    as table r03
    with no data;
    ''')

    s_buf = io.StringIO()
    # NOTE(review): with QUOTE_NONE and no escapechar the csv writer raises if
    # a value contains a tab, a quote character or a newline -- confirm that
    # descriptions never do, or escape them before this point.
    r03_df.to_csv(s_buf, header=False, sep="\t", quoting=csv.QUOTE_NONE)
    s_buf.seek(0)
    cur.copy_from(s_buf, 'r03_tmp',
        columns=columns,
        null='',
        sep='\t',
    )
    # NOTE(review): column names come from the YAML front matter keys and are
    # interpolated into the SQL unquoted -- assumes they are plain identifiers.
    cur.execute('''
    insert into r03 (%s)
        select %s
        from r03_tmp
        on conflict (id)
            do update
            set %s
        ;
    ''' % (column_string, column_string, set_string))
    cur.execute('drop table r03_tmp;')

    connection.commit()
except Exception:
    # Undo the partial work (including the staging table) before re-raising.
    # Assumes a DB-API connection (rollback/close available) -- TODO confirm.
    connection.rollback()
    raise
finally:
    cur.close()
print("ingested r03")

script/process_markdown.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -555,7 +555,7 @@
555555
},
556556
{
557557
"cell_type": "code",
558-
"execution_count": null,
558+
"execution_count": 31,
559559
"metadata": {},
560560
"outputs": [],
561561
"source": [

0 commit comments

Comments
 (0)