1+ import pandas as pd
2+ import yaml
3+ from glob import glob
4+ from uuid import uuid5 , NAMESPACE_URL
5+ from uuid import uuid5 , NAMESPACE_URL
6+ from s3_update import backup_file
7+ from ingest_common import connection
8+ import io
9+ import csv
10+ import json
11+
def split_front_matter(markdown):
    """Split a Markdown page into ``(front_matter, body)``.

    The page is expected to open with a YAML front-matter section fenced by
    two ``---`` markers. Only the first two markers are treated as fences, so
    a later ``---`` (for example a horizontal rule) stays part of the body.

    Returns ``None`` when the text does not contain two fences; otherwise a
    tuple of the raw front-matter text and the body stripped of surrounding
    whitespace.
    """
    parts = markdown.split("---", 2)
    if len(parts) < 3:
        return None
    _, front_matter, body = parts
    return front_matter, body.strip()


# Collect one record per page, keyed by a UUID derived from the grant number.
data = {}
# Sorted so the row order of the generated TSV is reproducible between runs.
for filename in sorted(glob('../../src/pages/r03/*.md')):
    with open(filename, encoding="utf-8") as o:
        sections = split_front_matter(o.read())
    if sections is None:
        # No front matter: there is nothing to derive a record from.
        continue
    front_matter, description = sections
    row = yaml.safe_load(front_matter)
    # An empty or non-mapping front matter cannot carry a grant number.
    if not isinstance(row, dict) or "grant_num" not in row:
        continue
    # uuid5 needs a string name; YAML may have parsed a numeric grant number.
    uid = str(uuid5(NAMESPACE_URL, str(row['grant_num'])))
    data[uid] = {**row, "description": description}

# Rows are indexed by the generated id; missing fields become empty strings.
r03_df = pd.DataFrame.from_dict(data, orient="index").fillna('')
r03_df.index.name = "id"
# The separator must be exactly one character for the CSV writer.
r03_df.to_csv('r03.tsv', sep="\t")
# Update S3
backup_file(r03_df, "r03")

## ingest

# Characters that are structural in PostgreSQL's COPY text format and must be
# written as backslash sequences when they occur inside a field.
COPY_ESCAPES = str.maketrans({
    "\\": "\\\\",
    "\t": "\\t",
    "\n": "\\n",
    "\r": "\\r",
})


def copy_escape(value):
    """Return *value* with COPY-significant characters backslash-escaped.

    Non-string values are returned unchanged.
    """
    if isinstance(value, str):
        return value.translate(COPY_ESCAPES)
    return value


cur = connection.cursor()

# Stage the rows in an empty table with the same columns as r03, then merge
# them into r03 with a single upsert.
cur.execute('''
  create table r03_tmp
  as table r03
  with no data;
''')

# Escape string cells up front so that multi-line or tab-containing values
# stay inside their field instead of making the unquoted writer fail.
copy_df = r03_df.copy()
for name in copy_df.columns:
    copy_df[name] = copy_df[name].map(copy_escape)

# Column order of the payload: the index (named "id") first, then the columns.
columns = [str(name) for name in (r03_df.index.name, *r03_df.columns)]

s_buf = io.StringIO()
# No header row: COPY would load it as data. The separator must be a single
# character for both the CSV writer and COPY.
copy_df.to_csv(s_buf, header=False, sep="\t", quoting=csv.QUOTE_NONE)
s_buf.seek(0)
cur.copy_from(
    s_buf,
    'r03_tmp',
    columns=columns,
    null='',  # empty fields are loaded as NULL
    sep='\t',
)

# NOTE(review): column names are interpolated into the statement unquoted;
# they come from r03_df, so keep its column names to plain SQL identifiers.
column_string = ", ".join(columns)
set_string = ",\n".join(["%s = excluded.%s" % (i, i) for i in columns])
cur.execute('''
  insert into r03 (%s)
    select %s
    from r03_tmp
    on conflict (id)
      do update
      set %s
  ;
''' % (column_string, column_string, set_string))
cur.execute('drop table r03_tmp;')

connection.commit()
print("ingested r03")
0 commit comments