import csv
import io
from glob import glob
from uuid import uuid5, NAMESPACE_URL

import pandas as pd
import yaml

from ingest_common import connection
from s3_update import backup_file
# Reference tables from the DRC S3 bucket. Each mapper resolves the
# human-readable key used in the markdown front matter back to its database id.
# NOTE: the duplicate second fetch of current_dccs.tsv was removed, and the
# separator is a single tab ("\t " would be treated as a multi-char regex sep).
dccs = pd.read_csv('https://cfde-drc.s3.amazonaws.com/database/files/current_dccs.tsv',
                   sep="\t", index_col=0, header=0)
# map dcc names to their respective ids
dcc_mapper = {row["short_label"]: idx for idx, row in dccs.iterrows()}

partnerships = pd.read_csv('https://cfde-drc.s3.amazonaws.com/database/files/current_partnerships.tsv',
                           sep="\t", index_col=0)
partnership_mapper = {row["title"]: idx for idx, row in partnerships.iterrows()}

r03 = pd.read_csv('https://cfde-drc.s3.amazonaws.com/database/files/current_r03s.tsv',
                  sep="\t", index_col=0)
r03_mapper = {row["grant_num"]: idx for idx, row in r03.iterrows()}
27+
# Schemas for the four tables this script rebuilds.
publication_columns = [
    "title", "journal", "authors", "year", "page", "volume", "issue",
    "pmid", "pmcid", "doi", "landmark", "tool_id", "carousel",
    "carousel_title", "carousel_link", "carousel_description", "image",
    "featured", "keywords",
]
dcc_publication_columns = ["publication_id", "dcc_id"]
partnership_publication_columns = ["publication_id", "partnership_id"]
r03_publication_columns = ["publication_id", "r03_id"]


def _empty_table(cols):
    """Return an empty DataFrame with the given columns; rows are added later."""
    return pd.DataFrame("-", index=[], columns=cols)


publication_df = _empty_table(publication_columns)
publication_df.index.name = "id"
dcc_publication_df = _empty_table(dcc_publication_columns)
partnership_publication_df = _empty_table(partnership_publication_columns)
r03_publication_df = _empty_table(r03_publication_columns)

# Running row counters for the three link tables.
ind = 0
pind = 0
rind = 0
41+
# Parse every publication markdown file. The YAML front matter (between the
# first pair of "---" fences) carries the publication metadata plus
# semicolon-separated DCC / partnership / R03 affiliations.
for filename in glob('../../src/pages/publications/*.md'):
    with open(filename) as o:
        markdown = o.read()
    yml = yaml.safe_load(markdown.split("---")[1])
    if "title" not in yml:
        # Without a title there is no stable id to derive; skip the file.
        continue
    # uuid5 of the title gives a deterministic id, so re-running the script
    # updates existing publications instead of duplicating them.
    uid = str(uuid5(NAMESPACE_URL, yml["title"]))
    publication_df.loc[uid] = {c: yml[c] for c in publication_columns if c in yml}

    if isinstance(yml.get("dccs"), str) and yml["dccs"].strip():
        for dcc in yml["dccs"].split(";"):
            # A KeyError here means the markdown names a DCC absent from S3.
            dcc_publication_df.loc[ind] = [uid, dcc_mapper[dcc.strip()]]
            ind += 1

    if isinstance(yml.get("partnerships"), str) and yml["partnerships"].strip():
        for partnership in yml["partnerships"].split(";"):
            partnership_publication_df.loc[pind] = [uid, partnership_mapper[partnership.strip()]]
            pind += 1

    if isinstance(yml.get("r03"), str) and yml["r03"].strip():
        # Loop variable renamed from `r03`, which shadowed the r03
        # reference DataFrame loaded above.
        for grant in yml["r03"].split(";"):
            r03_publication_df.loc[rind] = [uid, r03_mapper[grant.strip()]]
            rind += 1
71+
## Update S3
# All backups are unquoted TSVs. Two fixes: r03_publications previously
# received the string "False" (truthy!) for `quoting`, and the
# partnership_publications table was built and ingested but never backed up.
backup_file(publication_df, "publications", quoting=False)
backup_file(dcc_publication_df, "dcc_publications", quoting=False)
backup_file(partnership_publication_df, "partnership_publications", quoting=False)
backup_file(r03_publication_df, "r03_publications", quoting=False)
76+
## ingest

cur = connection.cursor()

# Clear link tables first, parent table last, to satisfy FK constraints.
cur.execute('''
DELETE FROM dcc_publications;
''')

cur.execute('''
DELETE FROM partnership_publications;
''')

cur.execute('''
DELETE FROM r03_publications;
''')

cur.execute('''
DELETE FROM publications;
''')

# Stage rows in a scratch table with the same shape as publications, then
# upsert from it so `id` drives conflict resolution.
cur.execute('''
create table publication_tmp
as table publications
with no data;
''')

p_buf = io.StringIO()
# QUOTE_NONE keeps the TSV byte-compatible with COPY's text format.
# Separators fixed to a single tab: pandas' to_csv and psycopg2's copy_from
# both require a one-character delimiter ("\t " would raise).
publication_df.to_csv(p_buf, header=True, quoting=csv.QUOTE_NONE, sep="\t")
p_buf.seek(0)
# Consume the header line and capture the column order COPY must use.
columns = next(p_buf).strip().split('\t')
cur.copy_from(p_buf, 'publication_tmp',
              columns=columns,
              null='',
              sep='\t',
              )
column_string = ", ".join(columns)
set_string = ",\n".join(["%s = excluded.%s" % (i, i) for i in columns])

cur.execute('''
insert into publications (%s)
select %s
from publication_tmp
on conflict (id)
do update
set %s
;
''' % (column_string, column_string, set_string))
cur.execute('drop table publication_tmp;')
124+
125+
126+
cur = connection.cursor()
# Scratch table mirroring dcc_publications for the COPY + insert-on-conflict.
cur.execute('''
create table dcc_publication_tmp
as table dcc_publications
with no data;
''')

d_buf = io.StringIO()
# Single-char tab separator (was "\t ", which both pandas and copy_from reject).
dcc_publication_df.to_csv(d_buf, header=True, sep="\t", index=None)
d_buf.seek(0)
# Skip the header row; the column order is passed explicitly below.
next(d_buf)
cur.copy_from(d_buf, 'dcc_publication_tmp',
              columns=dcc_publication_columns,
              null='',
              sep='\t',
              )
cur.execute('''
insert into dcc_publications (publication_id, dcc_id)
select publication_id, dcc_id
from dcc_publication_tmp
on conflict
do nothing
;
''')
cur.execute('drop table dcc_publication_tmp;')
153+
154+
cur = connection.cursor()
# Scratch table mirroring partnership_publications.
cur.execute('''
create table partnership_publication_tmp
as table partnership_publications
with no data;
''')

part_buf = io.StringIO()
# Single-char tab separator (was "\t ", which both pandas and copy_from reject).
partnership_publication_df.to_csv(part_buf, header=True, sep="\t", index=None)
part_buf.seek(0)
# Skip the header row; the column order is passed explicitly below.
next(part_buf)
cur.copy_from(part_buf, 'partnership_publication_tmp',
              columns=partnership_publication_columns,
              null='',
              sep='\t',
              )
cur.execute('''
insert into partnership_publications (publication_id, partnership_id)
select publication_id, partnership_id
from partnership_publication_tmp
on conflict
do nothing
;
''')
cur.execute('drop table partnership_publication_tmp;')
180+
cur = connection.cursor()
# Scratch table mirroring r03_publications.
cur.execute('''
create table r03_publication_tmp
as table r03_publications
with no data;
''')

r_buf = io.StringIO()
# Single-char tab separator (was "\t ", which both pandas and copy_from reject).
r03_publication_df.to_csv(r_buf, header=True, sep="\t", index=None)
r_buf.seek(0)
# Skip the header row; the column order is passed explicitly below.
next(r_buf)
cur.copy_from(r_buf, 'r03_publication_tmp',
              columns=r03_publication_columns,
              null='',
              sep='\t',
              )

cur.execute('''
insert into r03_publications (publication_id, r03_id)
select publication_id, r03_id
from r03_publication_tmp
on conflict
do nothing
;
''')
cur.execute('drop table r03_publication_tmp;')
207+
208+
# Commit makes all the deletes and upserts above atomic: nothing is
# persisted unless every step succeeded.
connection.commit()

print("ingested publications")