Skip to content

Commit 6709923

Browse files
committed
updated outreach
1 parent caeaecf commit 6709923

13 files changed

+357
-57
lines changed

script/ingestion/outreach.py

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
import pandas as pd
2+
import yaml
3+
from glob import glob
4+
from uuid import uuid5, NAMESPACE_URL
5+
from uuid import uuid5, NAMESPACE_URL
6+
from s3_update import backup_file
7+
from ingest_common import connection
8+
import io
9+
import csv
10+
import json
11+
12+
# Current DCC table; the first TSV column becomes the index (the DCC ids).
dccs = pd.read_csv(
    'https://cfde-drc.s3.amazonaws.com/database/files/current_dccs.tsv',
    sep="\t",
    index_col=0,
    header=0,
)
# Lookup from each DCC's short label to the index value (id) of its row.
# If a label occurs more than once, the last row wins.
dcc_mapper = dict(zip(dccs["short_label"], dccs.index))
17+
18+
outreach_columns = ["title", "short_description", "description", "tags", "agenda", "featured", "active", "start_date", "end_date", "application_start", "application_end", "link", "image", "carousel", "cfde_specific", "flyer"]
dcc_outreach_columns = ["outreach_id", "dcc_id"]


def split_front_matter(markdown):
    """Split a markdown page into its YAML front matter text and its body.

    Only the first two ``---`` markers are treated as delimiters, so a body
    that itself contains ``---`` (a horizontal rule, a table separator row)
    is returned whole instead of being cut at its last ``---``.

    Returns a ``(front_matter, body)`` tuple of strings; the body is stripped
    and is empty when nothing follows the closing marker.
    Raises ``ValueError`` when the page contains no ``---`` marker at all.
    """
    parts = markdown.split("---", 2)
    if len(parts) < 2:
        raise ValueError("markdown page has no '---' delimited front matter")
    body = parts[2].strip() if len(parts) > 2 else ""
    return parts[1], body


def event_uid(title, start_date, end_date):
    """Return the deterministic id of an event.

    The id is the UUID5 (URL namespace) of the title followed by the string
    forms of the start and end date.  Both the outreach and the webinar
    loaders must go through this function so that their ids agree.
    """
    return str(uuid5(NAMESPACE_URL, str(title) + str(start_date) + str(end_date)))


def load_front_matter(filename):
    """Read a markdown page and return ``(metadata, body)``.

    ``metadata`` is the parsed YAML front matter; an empty front matter block
    yields an empty dict rather than ``None``.
    """
    with open(filename) as page:
        front_matter, body = split_front_matter(page.read())
    return yaml.safe_load(front_matter) or {}, body


outreach_df = pd.DataFrame("-", index=[], columns=outreach_columns)
outreach_df.index.name = 'id'
dcc_outreach_df = pd.DataFrame("-", index=[], columns=dcc_outreach_columns)
ind = 0

# outreach: one row per markdown page, keyed by the event uid
for filename in glob('../../src/pages/outreach/*.md'):
    val, description = load_front_matter(filename)
    if "title" not in val:
        # The uid is derived from the title; without one the page cannot be
        # identified, and reusing the previous page's title would be wrong.
        print("Skipping %s: no title in front matter" % filename)
        continue
    uid = event_uid(val["title"], val.get("start_date", ""), val.get("end_date", ""))

    # Only truthy front matter values are copied; anything else is left unset
    # for that row.  The description always comes from the page body.
    vdict = {c: val[c] for c in outreach_columns if c != "description" and val.get(c)}
    vdict["description"] = description
    outreach_df.loc[uid] = vdict

    # One link row per DCC short label listed on the page.  An unknown label
    # raises KeyError from dcc_mapper.
    for dcc in val.get('dcc') or []:
        dcc_outreach_df.loc[ind] = [uid, dcc_mapper[dcc]]
        ind += 1

# webinars: attach each webinar's agenda to the matching outreach row
outreach_df['agenda'] = ''
for filename in glob('../../src/pages/webinars/*.md'):
    val, _ = load_front_matter(filename)
    # Webinars are matched against outreach rows titled 'CFDE Webinar Series'
    # by date, so the id is built from that fixed title and with the same
    # missing-date defaults as in the outreach loop above.
    uid = event_uid('CFDE Webinar Series', val.get("start_date", ""), val.get("end_date", ""))
    if uid not in outreach_df.index:
        # No outreach row for this webinar: report the unmatched id.
        print(uid)
        continue
    if "agenda" in val:
        if not isinstance(val["agenda"], list):
            # Diagnostic only: the agenda is still stored below.
            print(val['agenda'])
        outreach_df.at[uid, 'agenda'] = json.dumps(val["agenda"])
77+
78+
79+
80+
# Normalise the flag columns to real booleans before export.
# A flag with no stored value is NaN, and NaN is truthy, so a plain
# astype(bool) would export every unset flag as True; map missing to False.
for flag_column in ("active", "featured", "carousel", "cfde_specific"):
    outreach_df[flag_column] = (
        outreach_df[flag_column]
        .map(lambda flag: False if pd.isna(flag) else bool(flag))
        .astype(bool)
    )
backup_file(outreach_df, "outreach", quoting=False)
backup_file(dcc_outreach_df, "dcc_outreach", False)
86+
87+
# Replace the contents of the outreach and dcc_outreach tables with the
# dataframes built above.  Each dataframe is bulk-copied into a scratch table
# and then inserted into the real table; everything is committed at the end.
cursor = connection.cursor()

# dcc_outreach is emptied before outreach (presumably because it references
# outreach rows -- confirm against the schema).
cursor.execute('''
    DELETE FROM dcc_outreach;
''')

cursor.execute('''
    DELETE FROM outreach;
''')

# Scratch table with the same columns as outreach and no rows.
cursor.execute('''
    create table outreach_tmp
    as table outreach
    with no data;
''')

# Keep a local TSV copy, then stream the same TSV from memory.
outreach_df.to_csv('outreach.tsv', sep="\t", quoting=csv.QUOTE_NONE)
outreach_buffer = io.StringIO()
outreach_df.to_csv(outreach_buffer, header=True, quoting=csv.QUOTE_NONE, sep="\t")
outreach_buffer.seek(0)
# Consuming the header line leaves only data rows for COPY and yields the
# column names (index name first) in file order.
outreach_fields = next(outreach_buffer).strip().split('\t')
cursor.copy_from(
    outreach_buffer,
    'outreach_tmp',
    columns=outreach_fields,
    null='',
    sep='\t',
)

field_list = ", ".join(outreach_fields)
assignments = ",\n".join(
    "{0} = excluded.{0}".format(field) for field in outreach_fields
)

# Upsert from the scratch table: a row whose id already exists is overwritten
# column by column.
cursor.execute('''
    insert into outreach ({fields})
    select {fields}
    from outreach_tmp
    on conflict (id)
    do update
    set {assignments}
    ;
'''.format(fields=field_list, assignments=assignments))
cursor.execute('drop table outreach_tmp;')

# Same staging approach for the outreach <-> DCC link rows.
link_cursor = connection.cursor()
link_cursor.execute('''
    create table dcc_outreach_tmp
    as table dcc_outreach
    with no data;
''')
link_buffer = io.StringIO()
dcc_outreach_df.to_csv(link_buffer, header=True, sep="\t", index=None)
link_buffer.seek(0)
# Skip the header line; the column list passed to COPY comes from
# dcc_outreach_columns instead.
next(link_buffer)
link_cursor.copy_from(
    link_buffer,
    'dcc_outreach_tmp',
    columns=dcc_outreach_columns,
    null='',
    sep='\t',
)

# Conflicting link rows are silently ignored.
link_cursor.execute('''
    insert into dcc_outreach (outreach_id, dcc_id)
    select outreach_id, dcc_id
    from dcc_outreach_tmp
    on conflict
    do nothing
    ;
''')
link_cursor.execute('drop table dcc_outreach_tmp;')
connection.commit()

print("Ingested outreach and webinars")

script/ingestion/outreach.tsv

Lines changed: 64 additions & 0 deletions
Large diffs are not rendered by default.

script/process_markdown.ipynb

Lines changed: 29 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"cells": [
33
{
44
"cell_type": "code",
5-
"execution_count": 1,
5+
"execution_count": 5,
66
"metadata": {},
77
"outputs": [],
88
"source": [
@@ -1121,6 +1121,7 @@
11211121
"\tyml = {\"layout\": \"../../layouts/UseCase.astro\"}\n",
11221122
"\tdescription = ''\n",
11231123
"\tfor k,v in row.items():\n",
1124+
"\t\t\n",
11241125
"\t\tif not v == '':\n",
11251126
"\t\t\tif not k == 'description':\n",
11261127
"\t\t\t\tif k == 'source_dcc':\n",
@@ -1141,7 +1142,7 @@
11411142
},
11421143
{
11431144
"cell_type": "code",
1144-
"execution_count": 33,
1145+
"execution_count": 31,
11451146
"metadata": {},
11461147
"outputs": [
11471148
{
@@ -1354,7 +1355,7 @@
13541355
"4 True "
13551356
]
13561357
},
1357-
"execution_count": 33,
1358+
"execution_count": 31,
13581359
"metadata": {},
13591360
"output_type": "execute_result"
13601361
}
@@ -1367,7 +1368,7 @@
13671368
},
13681369
{
13691370
"cell_type": "code",
1370-
"execution_count": 36,
1371+
"execution_count": 32,
13711372
"metadata": {},
13721373
"outputs": [
13731374
{
@@ -1376,7 +1377,7 @@
13761377
"'2024-05-20'"
13771378
]
13781379
},
1379-
"execution_count": 36,
1380+
"execution_count": 32,
13801381
"metadata": {},
13811382
"output_type": "execute_result"
13821383
}
@@ -1387,7 +1388,7 @@
13871388
},
13881389
{
13891390
"cell_type": "code",
1390-
"execution_count": 32,
1391+
"execution_count": 33,
13911392
"metadata": {},
13921393
"outputs": [
13931394
{
@@ -1396,7 +1397,7 @@
13961397
"np.int64(9)"
13971398
]
13981399
},
1399-
"execution_count": 32,
1400+
"execution_count": 33,
14001401
"metadata": {},
14011402
"output_type": "execute_result"
14021403
}
@@ -1407,19 +1408,20 @@
14071408
},
14081409
{
14091410
"cell_type": "code",
1410-
"execution_count": 40,
1411+
"execution_count": 34,
14111412
"metadata": {},
14121413
"outputs": [],
14131414
"source": [
14141415
"for i, row in outreach.iterrows():\n",
1415-
"\tyml = {\"layout\": \"../../layouts/Outreach.astro\"}\n",
1416+
"\tyml = {\"layout\": \"@/layouts/Outreach.astro\"}\n",
14161417
"\tdescription = ''\n",
14171418
"\tfor k,v in row.items():\n",
14181419
"\t\tif not v == '':\n",
14191420
"\t\t\tif not k == 'description':\n",
14201421
"\t\t\t\tif k == 'dcc':\n",
14211422
"\t\t\t\t\tv = v.split(\"; \")\n",
1422-
"\t\t\t\tyml[k] = v\n",
1423+
"\t\t\t\tif not k == 'agenda':\n",
1424+
"\t\t\t\t\tyml[k] = v\n",
14231425
"\t\t\telse:\n",
14241426
"\t\t\t\tdescription = v\n",
14251427
"\tdate = ''\n",
@@ -1430,9 +1432,9 @@
14301432
"\n",
14311433
"\tfilename = yml[\"title\"]\n",
14321434
"\tif not date == '':\n",
1433-
"\t\tfilename = \"%s (%s)\"%(yml['title'], date.split('T')[0])\n",
1435+
"\t\tfilename = \"%s %s\"%(yml['title'], date.split('T')[0])\n",
14341436
"\tfilename = filename.replace(\"/\", \"-\")\n",
1435-
"\twith open('out/outreach/%s.md'%filename, 'w') as o:\n",
1437+
"\twith open('../src/pages/outreach/%s.md'%filename, 'w') as o:\n",
14361438
"\t\to.write('---\\n')\n",
14371439
"\t\to.write(yaml.dump(yml))\n",
14381440
"\t\to.write('---\\n')\n",
@@ -1441,7 +1443,7 @@
14411443
},
14421444
{
14431445
"cell_type": "code",
1444-
"execution_count": 65,
1446+
"execution_count": 35,
14451447
"metadata": {},
14461448
"outputs": [
14471449
{
@@ -1661,7 +1663,7 @@
16611663
"4 NaN "
16621664
]
16631665
},
1664-
"execution_count": 65,
1666+
"execution_count": 35,
16651667
"metadata": {},
16661668
"output_type": "execute_result"
16671669
}
@@ -1673,7 +1675,7 @@
16731675
},
16741676
{
16751677
"cell_type": "code",
1676-
"execution_count": 67,
1678+
"execution_count": 36,
16771679
"metadata": {},
16781680
"outputs": [
16791681
{
@@ -1752,7 +1754,7 @@
17521754
" <td>NaN</td>\n",
17531755
" <td>NaN</td>\n",
17541756
" <td>/info/training_and_outreach/cfde-webinar-series</td>\n",
1755-
" <td>/img/cfde_webinar2.png</td>\n",
1757+
" <td>https://cfde-drc.s3.us-east-2.amazonaws.com/as...</td>\n",
17561758
" <td>False</td>\n",
17571759
" <td>True</td>\n",
17581760
" <td>/pdf/cfde_webinar2.pdf</td>\n",
@@ -1866,7 +1868,7 @@
18661868
"\n",
18671869
" image carousel \\\n",
18681870
"15 https://cfde-drc.s3.amazonaws.com/assets/img/C... False \n",
1869-
"19 /img/cfde_webinar2.png False \n",
1871+
"19 https://cfde-drc.s3.us-east-2.amazonaws.com/as... False \n",
18701872
"28 https://cfde-drc.s3.us-east-2.amazonaws.com/as... True \n",
18711873
"29 https://cfde-drc.s3.us-east-2.amazonaws.com/as... True \n",
18721874
"36 https://cfde-drc.s3.us-east-2.amazonaws.com/as... True \n",
@@ -1879,7 +1881,7 @@
18791881
"36 True NaN "
18801882
]
18811883
},
1882-
"execution_count": 67,
1884+
"execution_count": 36,
18831885
"metadata": {},
18841886
"output_type": "execute_result"
18851887
}
@@ -1891,17 +1893,17 @@
18911893
},
18921894
{
18931895
"cell_type": "code",
1894-
"execution_count": null,
1896+
"execution_count": 37,
18951897
"metadata": {},
18961898
"outputs": [],
18971899
"source": [
18981900
"webinars = webinars[['start_date', 'end_date', 'agenda']]\n",
1899-
"webinars = webinars.fillna()"
1901+
"webinars = webinars.fillna('')"
19001902
]
19011903
},
19021904
{
19031905
"cell_type": "code",
1904-
"execution_count": 74,
1906+
"execution_count": 38,
19051907
"metadata": {},
19061908
"outputs": [],
19071909
"source": [
@@ -1910,22 +1912,24 @@
19101912
},
19111913
{
19121914
"cell_type": "code",
1913-
"execution_count": 75,
1915+
"execution_count": 39,
19141916
"metadata": {},
19151917
"outputs": [],
19161918
"source": [
19171919
"for i, row in webinars.iterrows():\n",
1918-
"\tyml = {\"layout\": \"../../layouts/Webinars.astro\"}\n",
1920+
"\tyml = {\"layout\": \"@/layouts/Webinar.astro\"}\n",
19191921
"\tdescription = ''\n",
19201922
"\tfor k,v in row.items():\n",
19211923
"\t\tif not v == '':\n",
19221924
"\t\t\tif k == \"agenda\":\n",
19231925
"\t\t\t\tv = json.loads(v)\n",
1926+
"\t\t\tif v == \"estimateCommonDisp\":\n",
1927+
"\t\t\t\tprint(row)\n",
19241928
"\t\t\tyml[k] = v\t\t\n",
19251929
"\tdate = yml['start_date'] \n",
19261930
"\n",
19271931
"\tfilename = date.split('T')[0]\n",
1928-
"\twith open('out/webinars/%s.md'%filename, 'w') as o:\n",
1932+
"\twith open('../src/pages/webinars/%s.md'%filename, 'w') as o:\n",
19291933
"\t\to.write('---\\n')\n",
19301934
"\t\to.write(yaml.dump(yml))\n",
19311935
"\t\to.write('---\\n')\n",
@@ -2454,7 +2458,7 @@
24542458
"name": "python",
24552459
"nbconvert_exporter": "python",
24562460
"pygments_lexer": "ipython3",
2457-
"version": "3.12.2"
2461+
"version": "3.12.8"
24582462
}
24592463
},
24602464
"nbformat": 4,

0 commit comments

Comments
 (0)