Skip to content

Commit ffd5642

Browse files
committed
added publications
1 parent ce6982c commit ffd5642

File tree

89 files changed

+215
-7
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

89 files changed

+215
-7
lines changed
164 Bytes
Binary file not shown.

script/ingestion/publications.py

Lines changed: 211 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,211 @@
1+
import pandas as pd
2+
import yaml
3+
from glob import glob
4+
from uuid import uuid5, NAMESPACE_URL
5+
from uuid import uuid5, NAMESPACE_URL
6+
from s3_update import backup_file
7+
from ingest_common import connection
8+
import io
9+
import csv
10+
# Lookup tables that translate the labels written in the publication front
# matter into ids. Each TSV is read with its first column as the index, and
# that index value is what the mappers return.
# The DCC export is fetched exactly once; a second identical download only
# cost an extra network round trip.
dccs = pd.read_csv('https://cfde-drc.s3.amazonaws.com/database/files/current_dccs.tsv', sep="\t", index_col=0, header=0)
# map dcc names to their respective ids
dcc_mapper = dict(zip(dccs["short_label"], dccs.index))

# map partnership titles to their respective ids
partnerships = pd.read_csv('https://cfde-drc.s3.amazonaws.com/database/files/current_partnerships.tsv', sep="\t", index_col=0)
partnership_mapper = dict(zip(partnerships["title"], partnerships.index))

# map R03 grant numbers to their respective ids
r03 = pd.read_csv('https://cfde-drc.s3.amazonaws.com/database/files/current_r03s.tsv', sep="\t", index_col=0)
r03_mapper = dict(zip(r03["grant_num"], r03.index))
27+
28+
# Data columns of the publications frame; its index (named "id") is kept
# separately from these columns.
publication_columns = [
    "title", "journal", "authors", "year", "page", "volume", "issue",
    "pmid", "pmcid", "doi", "landmark", "tool_id",
    "carousel", "carousel_title", "carousel_link", "carousel_description",
    "image", "featured", "keywords",
]
# Column pairs of the three link frames.
dcc_publication_columns = ["publication_id", "dcc_id"]
partnership_publication_columns = ["publication_id", "partnership_id"]
r03_publication_columns = ["publication_id", "r03_id"]

# Start from empty frames; rows are added later with .loc assignments.
publication_df = pd.DataFrame("-", index=[], columns=publication_columns)
publication_df.index.name = "id"
dcc_publication_df, partnership_publication_df, r03_publication_df = (
    pd.DataFrame("-", index=[], columns=link_columns)
    for link_columns in (
        dcc_publication_columns,
        partnership_publication_columns,
        r03_publication_columns,
    )
)

# Next row label to use in the dcc / partnership / r03 link frames.
ind = pind = rind = 0
41+
42+
def split_labels(raw):
    """Return the stripped, non-empty ';'-separated labels found in *raw*.

    Anything that is not a string (missing key, list, number, ...) yields an
    empty list. Empty segments, such as the one produced by a trailing ';',
    are dropped so they are never looked up as a label.
    """
    if not isinstance(raw, str):
        return []
    return [part.strip() for part in raw.split(";") if part.strip()]


def resolve_id(mapper, label, kind, source):
    """Return ``mapper[label]``.

    Raises:
        KeyError: if *label* is unknown; the message names the table *kind*
            and the *source* file so the offending markdown can be found.
    """
    try:
        return mapper[label]
    except KeyError:
        raise KeyError("unknown %s %r referenced in %s" % (kind, label, source)) from None


# Build the publication frame and the three link frames from the YAML front
# matter of every publication markdown page.
for filename in glob('../../src/pages/publications/*.md'):
    with open(filename) as o:
        markdown = o.read()
    # The front matter is the text between the first two '---' markers.
    m = markdown.split("---")
    yml = yaml.safe_load(m[1]) if len(m) > 1 else None
    if not isinstance(yml, dict) or "title" not in yml:
        # Pages without a titled front matter contribute no rows.
        print("skipping %s: no front matter with a title" % filename)
        continue

    title = yml['title']
    # The id is derived from the title, so the same title always maps to the
    # same id.
    uid = str(uuid5(NAMESPACE_URL, title))
    v = {c: yml[c] for c in publication_columns if c in yml}
    publication_df.loc[uid] = v

    for dcc in split_labels(yml.get("dccs")):
        dcc_publication_df.loc[ind] = [uid, resolve_id(dcc_mapper, dcc, "dcc", filename)]
        ind += 1

    for partnership in split_labels(yml.get("partnerships")):
        partnership_publication_df.loc[pind] = [uid, resolve_id(partnership_mapper, partnership, "partnership", filename)]
        pind += 1

    # Named grant_num so the loop does not overwrite the module-level r03 frame.
    for grant_num in split_labels(yml.get("r03")):
        r03_publication_df.loc[rind] = [uid, resolve_id(r03_mapper, grant_num, "r03", filename)]
        rind += 1
71+
72+
## Update S3
# The publications frame keeps its ids in the index, so the index is written
# (include_index left at its default).
# NOTE(review): backup_file tests include_index before quoting, so
# quoting=False looks like it has no effect here -- confirm in s3_update.py.
backup_file(publication_df, "publications", quoting=False)
# The link frames are written without their index. The flag must be the
# boolean False: the string "False" is truthy and would write the index.
backup_file(dcc_publication_df, "dcc_publications", False)
# Partnership links are ingested below as well, so they are backed up too.
backup_file(partnership_publication_df, "partnership_publications", False)
backup_file(r03_publication_df, "r03_publications", False)
76+
77+
## ingest

cur = connection.cursor()

# Empty the three link tables first and the publications table last.
# NOTE(review): presumably this order exists because the link rows reference
# publications -- confirm against the schema.
reset_statements = (
    '''
DELETE FROM dcc_publications;
''',
    '''
DELETE FROM partnership_publications;
''',
    '''
DELETE FROM r03_publications;
''',
    '''
DELETE FROM publications;
''',
)
for reset_statement in reset_statements:
    cur.execute(reset_statement)
95+
96+
# Stage the publications in an empty copy of the target table, then upsert
# from the staging table into publications.
cur.execute('''
create table publication_tmp
as table publications
with no data;
''')

# Serialize the frame (index included) as unquoted TSV in memory.
staged_rows = io.StringIO()
publication_df.to_csv(staged_rows, sep="\t", header=True, quoting=csv.QUOTE_NONE)
staged_rows.seek(0)
# The header line supplies the column list; reading it also leaves the
# buffer positioned at the first data row for copy_from.
staged_columns = staged_rows.readline().strip().split('\t')
cur.copy_from(
    staged_rows,
    'publication_tmp',
    sep='\t',
    null='',
    columns=staged_columns,
)

# On an id conflict every staged column overwrites the stored value.
insert_columns = ", ".join(staged_columns)
update_assignments = ",\n".join(
    "{0} = excluded.{0}".format(name) for name in staged_columns
)
upsert_sql = '''
insert into publications ({cols})
select {cols}
from publication_tmp
on conflict (id)
do update
set {assignments}
;
'''.format(cols=insert_columns, assignments=update_assignments)
cur.execute(upsert_sql)
cur.execute('drop table publication_tmp;')
124+
125+
126+
127+
# Load the publication <-> DCC links through a staging table.
cur = connection.cursor()
cur.execute('''
create table dcc_publication_tmp
as table dcc_publications
with no data;
''')

# Write only the data rows: the column list is passed to copy_from
# explicitly, so no header line is needed in the buffer.
dcc_rows = io.StringIO()
dcc_publication_df.to_csv(dcc_rows, sep="\t", header=False, index=None)
dcc_rows.seek(0)
cur.copy_from(
    dcc_rows,
    'dcc_publication_tmp',
    sep='\t',
    null='',
    columns=dcc_publication_columns,
)

# Rows that conflict with an existing row are skipped.
cur.execute('''
insert into dcc_publications (publication_id, dcc_id)
select publication_id, dcc_id
from dcc_publication_tmp
on conflict
do nothing
;
''')
cur.execute('drop table dcc_publication_tmp;')
153+
154+
155+
# Load the publication <-> partnership links through a staging table.
cur = connection.cursor()
cur.execute('''
create table partnership_publication_tmp
as table partnership_publications
with no data;
''')

# Write only the data rows: the column list is passed to copy_from
# explicitly, so no header line is needed in the buffer.
partnership_rows = io.StringIO()
partnership_publication_df.to_csv(partnership_rows, sep="\t", header=False, index=None)
partnership_rows.seek(0)
cur.copy_from(
    partnership_rows,
    'partnership_publication_tmp',
    sep='\t',
    null='',
    columns=partnership_publication_columns,
)

# Rows that conflict with an existing row are skipped.
cur.execute('''
insert into partnership_publications (publication_id, partnership_id)
select publication_id, partnership_id
from partnership_publication_tmp
on conflict
do nothing
;
''')
cur.execute('drop table partnership_publication_tmp;')
180+
181+
# Load the publication <-> R03 links through a staging table, then commit
# everything executed on this connection.
cur = connection.cursor()
cur.execute('''
create table r03_publication_tmp
as table r03_publications
with no data;
''')

# Write only the data rows: the column list is passed to copy_from
# explicitly, so no header line is needed in the buffer.
r03_rows = io.StringIO()
r03_publication_df.to_csv(r03_rows, sep="\t", header=False, index=None)
r03_rows.seek(0)
cur.copy_from(
    r03_rows,
    'r03_publication_tmp',
    sep='\t',
    null='',
    columns=r03_publication_columns,
)

# Rows that conflict with an existing row are skipped.
cur.execute('''
insert into r03_publications (publication_id, r03_id)
select publication_id, r03_id
from r03_publication_tmp
on conflict
do nothing
;
''')
cur.execute('drop table r03_publication_tmp;')

# Nothing above is committed until this point.
connection.commit()

print("ingested publications")

script/ingestion/s3_update.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import boto3
55
from botocore.exceptions import ClientError
66
import pandas as pd
7-
7+
import csv
88
def upload_file(file_obj, bucket, object_name=None):
99
"""Upload a file to an S3 bucket
1010
@@ -31,13 +31,15 @@ def upload_file(file_obj, bucket, object_name=None):
3131

3232
now = str(date.today()).replace("-", "")
3333

34-
def backup_file(df, suffix, include_index=True):
34+
def backup_file(df, suffix, include_index=True, quoting=True):
3535
print("backing up on s3...")
3636
s_buf = io.StringIO()
3737
# df.to_csv(s_buf, header=True, sep="\t")
3838
# print(s_buf.read())
3939
if include_index:
4040
df.to_csv(s_buf, header=True, sep="\t")
41+
elif not quoting:
42+
df.to_csv(s_buf, header=True, sep="\t", quoting=csv.QUOTE_NONE)
4143
else:
4244
df.to_csv(s_buf, header=True, sep="\t", index=None)
4345
object_name = "database/test/%s_%s.tsv"%(now, suffix)

src/pages/publications/DOI: 10.1158-1538-7445.AM2023-6576.md renamed to src/pages/publications/DOI:10.1158-1538-7445.AM2023-6576.md

0 commit comments

Comments
 (0)