1+ import pandas as pd
2+ import yaml
3+ from glob import glob
4+ from uuid import uuid5 , NAMESPACE_URL
5+ from uuid import uuid5 , NAMESPACE_URL
6+ from s3_update import backup_file
7+ from ingest_common import connection
8+ import io
9+ import csv
# Load the current publications table. The first TSV column becomes the
# index (presumably the publication id -- it is used as the row key below
# and when linking tools to publications).
publication_df = pd.read_csv(
    'https://cfde-drc.s3.amazonaws.com/database/files/current_publications.tsv',
    sep="\t",
    index_col=0,
)

# Map DOI -> index label of the publication carrying that DOI.
# Missing DOIs are read by pandas as NaN (a float), so only genuine strings
# are kept. If a DOI occurs more than once, the last row wins.
publication_mapper = {
    doi: pub_id
    for pub_id, doi in publication_df['doi'].items()
    if isinstance(doi, str)
}
16+
def split_front_matter(markdown):
    """Split a markdown document into its front matter and its body.

    The document is expected to look like ``---<front matter>---<body>``.
    Only the first two ``---`` delimiters are treated as structure, so a
    body that itself contains ``---`` (horizontal rule, table separator)
    is returned whole instead of being truncated to its last fragment.

    Args:
        markdown: full text of the markdown file.

    Returns:
        ``(front_matter, body)`` with the body stripped of surrounding
        whitespace, or ``None`` when the text contains no ``---`` at all.
        When only a single delimiter is present, the text after it is
        returned as both elements.
    """
    parts = markdown.split("---", 2)
    if len(parts) < 2:
        return None
    return parts[1], parts[-1].strip()


# tool uuid -> dict of column values for that tool
tools = {}
for filename in glob('../../src/pages/tools/*.md'):
    with open(filename, encoding="utf-8") as o:
        parsed = split_front_matter(o.read())
    if parsed is None:
        # No front matter, hence no label: nothing to ingest from this file.
        continue

    front_matter, description = parsed
    row = yaml.safe_load(front_matter)
    if not isinstance(row, dict) or "label" not in row:
        # Only pages that declare a label are treated as tools.
        continue

    # Deterministic id: the same label always yields the same UUID.
    uid = str(uuid5(NAMESPACE_URL, row['label']))

    val = {"description": description}
    for k, v in row.items():
        if k == "doi":
            # The DOI is not stored on the tool; it is used to point the
            # matching publication row at this tool instead.
            if isinstance(v, str):
                doi = v.replace("https://doi.org/", "")
                if doi in publication_mapper:
                    publication_df.at[publication_mapper[doi], 'tool_id'] = uid
                else:
                    # Keep ingesting the tool; just report the missing link.
                    print("No publication with DOI %s found for %s; tool left unlinked" % (doi, filename))
        elif k != "layout":
            val[k] = v

    tools[uid] = val
38+
# One row per tool keyed by its uuid; keys a tool does not define are
# filled with empty strings, and the index is labelled "id".
tools_df = (
    pd.DataFrame.from_dict(tools, orient="index")
    .fillna('')
    .rename_axis("id")
)

# Hand both tables to backup_file before the database is touched
# (presumably it stores a snapshot -- see s3_update for the details).
backup_file(tools_df, "tools", quoting=False)
backup_file(publication_df, "publications", quoting=False)
43+
def _upsert_dataframe(cursor, frame, table):
    """Load *frame* into *table*, updating rows whose ``id`` already exists.

    The frame is bulk-copied into an empty staging table cloned from
    *table* (named ``<table>_tmp``), merged into *table* with
    ``insert ... on conflict (id) do update`` and the staging table is
    dropped again.

    Args:
        cursor: open database cursor providing ``execute`` and ``copy_from``.
        frame: DataFrame whose index name and column names match column
            names of *table*.
        table: name of the destination table; ``on conflict (id)`` requires
            it to have a unique ``id`` column.
    """
    staging = "%s_tmp" % table
    cursor.execute('''
        create table %s
        as table %s
        with no data;
    ''' % (staging, table))

    # NOTE(review): QUOTE_NONE writes values verbatim, so a value containing
    # a tab or newline cannot be serialised -- confirm inputs are single-line.
    buf = io.StringIO()
    frame.to_csv(buf, header=True, quoting=csv.QUOTE_NONE, sep="\t")
    buf.seek(0)

    # Consume the header line: it supplies the column list, and the buffer
    # is left positioned on the first data row for the copy.
    columns = next(buf).strip().split('\t')
    cursor.copy_from(
        buf,
        staging,
        columns=columns,
        null='',  # empty fields are loaded as NULL
        sep='\t',
    )

    column_string = ", ".join(columns)
    set_string = ",\n".join("%s = excluded.%s" % (c, c) for c in columns)
    cursor.execute('''
        insert into %s (%s)
            select %s
            from %s
            on conflict (id)
                do update
                set %s
        ;
    ''' % (table, column_string, column_string, staging, set_string))
    cursor.execute('drop table %s;' % staging)


cur = connection.cursor()
try:
    # Clear the tool reference on every publication before emptying the
    # tools table (presumably publications.tool_id references tools).
    cur.execute('''
        UPDATE publications
        SET tool_id=NULL;
    ''')

    # Start from an empty tools table so removed tools do not linger.
    cur.execute('''
        DELETE FROM tools;
    ''')

    # Tools go in first; publications carry the tool_id values set earlier.
    _upsert_dataframe(cur, tools_df, 'tools')
    _upsert_dataframe(cur, publication_df, 'publications')

    connection.commit()
except Exception:
    # Leave the database exactly as it was if any step fails.
    connection.rollback()
    raise
finally:
    cur.close()

print("Ingested Tools")
0 commit comments