Skip to content

Commit ee36d88

Browse files
committed
modified script
1 parent 92cc7d3 commit ee36d88

File tree

3 files changed

+88
-48
lines changed

3 files changed

+88
-48
lines changed

script/ingestion/centers.py

Lines changed: 45 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -7,42 +7,41 @@
77
from ingest_common import connection
88
import io
99
import csv
10+
import json
1011
dccs = pd.read_csv('https://cfde-drc.s3.amazonaws.com/database/files/current_dccs.tsv', sep="\t", index_col=0, header=0)
1112
# map dcc names to their respective ids
1213
dcc_mapper = {}
1314
for i, v in dccs.loc[:,'short_label'].items():
1415
dcc_mapper[v] = i
1516
data = {}
16-
center_publication = []
17+
# center_publication = []
1718
for filename in glob('../../src/pages/centers/*.md'):
1819
with open(filename) as o:
1920
markdown = o.read()
2021
m = markdown.split("---")
2122
row = yaml.safe_load(m[1])
2223
if "label" in row:
2324
label = row['label']
25+
description = m[-1].strip()
2426
uid = str(uuid5(NAMESPACE_URL, label))
25-
data[uid] = {
26-
"label": row["label"],
27-
"short_label": row.get("short_label"),
28-
"short_description": row.get("short_description"),
29-
"description": row.get("description"),
30-
"homepage": row.get("homepage"),
31-
"icon": row.get("icon"),
32-
"grant_num": row.get("grant_num"),
33-
"active": row.get("active"),
34-
}
35-
if row.get("publications"):
36-
for pub in set(row["publications"]):
37-
center_publication.append({"center_id": uid, "publication_id": pub})
27+
data[uid] = {"description": description}
28+
for k,v in row.items():
29+
if not k == "layout":
30+
if type(v) == dict or type(v) == list:
31+
data[uid][k] = json.dumps(v)
32+
else:
33+
data[uid][k] = v
34+
# if row.get("publications"):
35+
# for pub in set(row["publications"]):
36+
# center_publication.append({"center_id": uid, "publication_id": pub})
3837

3938
center_df = pd.DataFrame.from_dict(data, orient="index").fillna('')
4039
center_df.index.name = "id"
41-
center_publication_df = pd.DataFrame.from_records(center_publication, columns=['center_id', 'publication_id'])
40+
# center_publication_df = pd.DataFrame.from_records(center_publication, columns=['center_id', 'publication_id'])
4241

4342
## Update S3
4443
backup_file(center_df, "centers", quoting=False)
45-
backup_file(center_publication_df, "center_publication", False)
44+
# backup_file(center_publication_df, "center_publication", False)
4645

4746
## ingest
4847

@@ -65,7 +64,7 @@
6564
''')
6665

6766
p_buf = io.StringIO()
68-
center_df.to_csv(p_buf, header=True, quoting=csv.QUOTE_NONE, sep="\t")
67+
center_df.to_csv(p_buf, header=True, quoting=csv.QUOTE_NONE, sep="\t", escapechar='\\')
6968
p_buf.seek(0)
7069
columns = next(p_buf).strip().split('\t')
7170
cur.copy_from(p_buf, 'centers_tmp',
@@ -87,35 +86,35 @@
8786
cur.execute('drop table centers_tmp;')
8887

8988

90-
cur = connection.cursor()
91-
cur.execute('''
92-
create table center_publications_tmp
93-
as table center_publications
94-
with no data;
95-
''')
96-
97-
98-
cp_buf = io.StringIO()
99-
center_publication_df.to_csv(cp_buf, header=True, sep="\t", index=None)
100-
cp_buf.seek(0)
101-
columns = next(cp_buf).strip().split('\t')
102-
cur.copy_from(cp_buf, 'center_publications_tmp',
103-
columns=columns,
104-
null='',
105-
sep='\t',
106-
)
107-
108-
column_string = ", ".join(columns)
109-
110-
cur.execute('''
111-
insert into center_publications (%s)
112-
select %s
113-
from center_publications_tmp
114-
on conflict
115-
do nothing
116-
;
117-
'''%(column_string, column_string))
118-
cur.execute('drop table center_publications_tmp;')
89+
# cur = connection.cursor()
90+
# cur.execute('''
91+
# create table center_publications_tmp
92+
# as table center_publications
93+
# with no data;
94+
# ''')
95+
96+
97+
# cp_buf = io.StringIO()
98+
# center_publication_df.to_csv(cp_buf, header=True, sep="\t", index=None)
99+
# cp_buf.seek(0)
100+
# columns = next(cp_buf).strip().split('\t')
101+
# cur.copy_from(cp_buf, 'center_publications_tmp',
102+
# columns=columns,
103+
# null='',
104+
# sep='\t',
105+
# )
106+
107+
# column_string = ", ".join(columns)
108+
109+
# cur.execute('''
110+
# insert into center_publications (%s)
111+
# select %s
112+
# from center_publications_tmp
113+
# on conflict
114+
# do nothing
115+
# ;
116+
# '''%(column_string, column_string))
117+
# cur.execute('drop table center_publications_tmp;')
119118
connection.commit()
120119

121120
print("Ingested centers")

script/ingestion/publications.py

Lines changed: 41 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,13 @@
1313
dcc_mapper = {}
1414
for k,v in dccs.iterrows():
1515
dcc_mapper[v["short_label"]] = k
16+
17+
18+
centers = pd.read_csv('https://cfde-drc.s3.amazonaws.com/database/files/current_centers.tsv', sep="\t", index_col=0, header=0)
19+
# map center names to their respective ids
20+
center_mapper = {}
21+
for k,v in centers.iterrows():
22+
center_mapper[v["short_label"]] = k
1623

1724
partnerships = pd.read_csv('https://cfde-drc.s3.amazonaws.com/database/files/current_partnerships.tsv', sep="\t", index_col=0)
1825
partnership_mapper = {}
@@ -26,12 +33,14 @@
2633

2734
publication_columns = ["title", "journal", "authors", "year", "page", "volume", "issue", "pmid", "pmcid", "doi", "landmark", "tool_id", "carousel", "carousel_title", "carousel_link", "carousel_description", "image", "featured", "keywords" ]
2835
dcc_publication_columns = ["publication_id", "dcc_id"]
36+
center_publication_columns = ["publication_id", "center_id"]
2937
partnership_publication_columns = ["publication_id", "partnership_id"]
3038
r03_publication_columns = ["publication_id", "r03_id"]
3139

3240
publication_df = pd.DataFrame("-", index=[], columns=publication_columns)
3341
publication_df.index.name = "id"
3442
dcc_publication_df = pd.DataFrame("-", index=[], columns=dcc_publication_columns)
43+
center_publication_df = pd.DataFrame("-", index=[], columns=center_publication_columns)
3544
partnership_publication_df = pd.DataFrame("-", index=[], columns=partnership_publication_columns)
3645
r03_publication_df = pd.DataFrame("-", index=[], columns=r03_publication_columns)
3746
ind = 0
@@ -54,6 +63,13 @@
5463
dcc_id = dcc_mapper[dcc]
5564
dcc_publication_df.loc[ind] = [uid, dcc_mapper[dcc]]
5665
ind += 1
66+
if "centers" in yml:
67+
for center in yml["centers"]:
68+
center = center.strip()
69+
center_id = center_mapper[center]
70+
center_publication_df.loc[ind] = [uid, center_mapper[center]]
71+
ind += 1
72+
5773
if "partnerships" in yml:
5874
partnership = yml["partnerships"]
5975
partnership = partnership.strip()
@@ -71,6 +87,7 @@
7187
## Update S3
7288
backup_file(publication_df, "publications", quoting=False)
7389
backup_file(dcc_publication_df, "dcc_publications", False)
90+
backup_file(center_publication_df, "center_publications", False)
7491
backup_file(partnership_publication_df, "partnership_publications", False)
7592
backup_file(r03_publication_df, "r03_publications", False)
7693

@@ -151,6 +168,30 @@
151168
''')
152169
cur.execute('drop table dcc_publication_tmp;')
153170

171+
cur.execute('''
172+
create table center_publication_tmp
173+
as table center_publications
174+
with no data;
175+
''')
176+
center_buf = io.StringIO()
177+
center_publication_df.to_csv(center_buf, header=True, sep="\t", index=None)
178+
center_buf.seek(0)
179+
columns = next(center_buf).strip().split('\t')
180+
cur.copy_from(center_buf, 'center_publication_tmp',
181+
columns=center_publication_columns,
182+
null='',
183+
sep='\t',
184+
)
185+
cur.execute('''
186+
insert into center_publications (publication_id, center_id)
187+
select publication_id, center_id
188+
from center_publication_tmp
189+
on conflict
190+
do nothing
191+
;
192+
''')
193+
cur.execute('drop table center_publication_tmp;')
194+
154195

155196
cur = connection.cursor()
156197
cur.execute('''

script/ingestion/s3_update.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -42,7 +42,7 @@ def backup_file(df, suffix, include_index=True, quoting=True):
4242
df.to_csv(s_buf, header=True, sep="\t", quoting=csv.QUOTE_NONE)
4343
else:
4444
df.to_csv(s_buf, header=True, sep="\t", index=None)
45-
object_name = "database/test/%s_%s.tsv"%(now, suffix)
45+
object_name = "database/files/%s_%s.tsv"%(now, suffix)
4646
upload_file(s_buf.getvalue(), bucket, object_name)
47-
object_name = "database/test/current_%s.tsv"%(suffix)
47+
object_name = "database/files/current_%s.tsv"%(suffix)
4848
upload_file(s_buf.getvalue(), bucket, object_name)

0 commit comments

Comments
 (0)