|
7 | 7 | from ingest_common import connection |
8 | 8 | import io |
9 | 9 | import csv |
| 10 | +import json |
10 | 11 | dccs = pd.read_csv('https://cfde-drc.s3.amazonaws.com/database/files/current_dccs.tsv', sep="\t", index_col=0, header=0) |
11 | 12 | # map dcc names to their respective ids |
12 | 13 | dcc_mapper = {} |
13 | 14 | for i, v in dccs.loc[:,'short_label'].items(): |
14 | 15 | dcc_mapper[v] = i |
15 | 16 | data = {} |
16 | | -center_publication = [] |
| 17 | +# center_publication = [] |
17 | 18 | for filename in glob('../../src/pages/centers/*.md'): |
18 | 19 | with open(filename) as o: |
19 | 20 | markdown = o.read() |
20 | 21 | m = markdown.split("---") |
21 | 22 | row = yaml.safe_load(m[1]) |
22 | 23 | if "label" in row: |
23 | 24 | label = row['label'] |
| 25 | + description = m[-1].strip() |
24 | 26 | uid = str(uuid5(NAMESPACE_URL, label)) |
25 | | - data[uid] = { |
26 | | - "label": row["label"], |
27 | | - "short_label": row.get("short_label"), |
28 | | - "short_description": row.get("short_description"), |
29 | | - "description": row.get("description"), |
30 | | - "homepage": row.get("homepage"), |
31 | | - "icon": row.get("icon"), |
32 | | - "grant_num": row.get("grant_num"), |
33 | | - "active": row.get("active"), |
34 | | - } |
35 | | - if row.get("publications"): |
36 | | - for pub in set(row["publications"]): |
37 | | - center_publication.append({"center_id": uid, "publication_id": pub}) |
| 27 | + data[uid] = {"description": description} |
| 28 | + for k,v in row.items(): |
| 29 | + if not k == "layout": |
| 30 | + if type(v) == dict or type(v) == list: |
| 31 | + data[uid][k] = json.dumps(v) |
| 32 | + else: |
| 33 | + data[uid][k] = v |
| 34 | + # if row.get("publications"): |
| 35 | + # for pub in set(row["publications"]): |
| 36 | + # center_publication.append({"center_id": uid, "publication_id": pub}) |
38 | 37 |
|
39 | 38 | center_df = pd.DataFrame.from_dict(data, orient="index").fillna('') |
40 | 39 | center_df.index.name = "id" |
41 | | -center_publication_df = pd.DataFrame.from_records(center_publication, columns=['center_id', 'publication_id']) |
| 40 | +# center_publication_df = pd.DataFrame.from_records(center_publication, columns=['center_id', 'publication_id']) |
42 | 41 |
|
43 | 42 | ## Update S3 |
44 | 43 | backup_file(center_df, "centers", quoting=False) |
45 | | -backup_file(center_publication_df, "center_publication", False) |
| 44 | +# backup_file(center_publication_df, "center_publication", False) |
46 | 45 |
|
47 | 46 | ## ingest |
48 | 47 |
|
|
65 | 64 | ''') |
66 | 65 |
|
67 | 66 | p_buf = io.StringIO() |
68 | | -center_df.to_csv(p_buf, header=True, quoting=csv.QUOTE_NONE, sep="\t") |
| 67 | +center_df.to_csv(p_buf, header=True, quoting=csv.QUOTE_NONE, sep="\t", escapechar='\\') |
69 | 68 | p_buf.seek(0) |
70 | 69 | columns = next(p_buf).strip().split('\t') |
71 | 70 | cur.copy_from(p_buf, 'centers_tmp', |
|
87 | 86 | cur.execute('drop table centers_tmp;') |
88 | 87 |
|
89 | 88 |
|
90 | | -cur = connection.cursor() |
91 | | -cur.execute(''' |
92 | | - create table center_publications_tmp |
93 | | - as table center_publications |
94 | | - with no data; |
95 | | -''') |
96 | | - |
97 | | - |
98 | | -cp_buf = io.StringIO() |
99 | | -center_publication_df.to_csv(cp_buf, header=True, sep="\t", index=None) |
100 | | -cp_buf.seek(0) |
101 | | -columns = next(cp_buf).strip().split('\t') |
102 | | -cur.copy_from(cp_buf, 'center_publications_tmp', |
103 | | - columns=columns, |
104 | | - null='', |
105 | | - sep='\t', |
106 | | -) |
107 | | - |
108 | | -column_string = ", ".join(columns) |
109 | | - |
110 | | -cur.execute(''' |
111 | | - insert into center_publications (%s) |
112 | | - select %s |
113 | | - from center_publications_tmp |
114 | | - on conflict |
115 | | - do nothing |
116 | | - ; |
117 | | - '''%(column_string, column_string)) |
118 | | -cur.execute('drop table center_publications_tmp;') |
| 89 | +# cur = connection.cursor() |
| 90 | +# cur.execute(''' |
| 91 | +# create table center_publications_tmp |
| 92 | +# as table center_publications |
| 93 | +# with no data; |
| 94 | +# ''') |
| 95 | + |
| 96 | + |
| 97 | +# cp_buf = io.StringIO() |
| 98 | +# center_publication_df.to_csv(cp_buf, header=True, sep="\t", index=None) |
| 99 | +# cp_buf.seek(0) |
| 100 | +# columns = next(cp_buf).strip().split('\t') |
| 101 | +# cur.copy_from(cp_buf, 'center_publications_tmp', |
| 102 | +# columns=columns, |
| 103 | +# null='', |
| 104 | +# sep='\t', |
| 105 | +# ) |
| 106 | + |
| 107 | +# column_string = ", ".join(columns) |
| 108 | + |
| 109 | +# cur.execute(''' |
| 110 | +# insert into center_publications (%s) |
| 111 | +# select %s |
| 112 | +# from center_publications_tmp |
| 113 | +# on conflict |
| 114 | +# do nothing |
| 115 | +# ; |
| 116 | +# '''%(column_string, column_string)) |
| 117 | +# cur.execute('drop table center_publications_tmp;') |
119 | 118 | connection.commit() |
120 | 119 |
|
121 | 120 | print("Ingested centers") |
|
0 commit comments