2020KEY_PATTERN = 'publications/{migdar_id}'
2121PAGE_TITLE_PATTERN = '{title}'
2222
23- SCOPES = ['https://www.googleapis.com/auth/drive ' ]
# OAuth scope requested for the service-account credentials: read-only access
# to Google Sheets. The string must match the scope URL exactly, so it must
# not carry leading or trailing whitespace.
SCOPES = ['https://www.googleapis.com/auth/spreadsheets.readonly']
# Service-account key files, tried in this order: the absolute path first,
# then a file relative to the current working directory.
_PRIMARY_CREDS_FILE = '/migdar-gdrive/secret-g-service-account.json'
_FALLBACK_CREDS_FILE = 'gdrive_creds.json'

try:
    credentials = Credentials.from_service_account_file(
        _PRIMARY_CREDS_FILE, scopes=SCOPES)
except Exception:
    # Best-effort fallback: log the failure with its traceback, then try the
    # second key file. An error from this second attempt is not caught and
    # propagates to whoever imports the module.
    logging.exception('Failed to open creds!')
    credentials = Credentials.from_service_account_file(
        _FALLBACK_CREDS_FILE, scopes=SCOPES)
31- drive_service = build ('drive' , 'v3' , credentials = credentials )
3231
32+ service = build ('sheets' , 'v4' , credentials = credentials )
3333
34- def list_gdrive ():
35- results = drive_service .files ().list (
36- q = "'16bSopg9nlQDBN8gsjW712xuBWy16gPW0' in parents" ,
37- fields = 'files(id,kind,name,mimeType,modifiedTime)' ).execute ()
38- yield from results .get ('files' , [])
34+ GOOGLE_SHEETS_ID = '1IPRvpogUZ06R9zVRPdZeYfAwdrs9hx0iRB8zSFubl_o'
3935
40-
41- def download_files ():
42- os .makedirs ('pubfiles' , exist_ok = True )
43-
44- def func (row ):
45- filename = row ['filename' ]
46- if not os .path .exists (filename ):
47- print ('Downloading' , filename )
48- with open (filename , 'wb' ) as f :
49- request = drive_service .files ().get_media (fileId = row ['id' ])
50- downloader = MediaIoBaseDownload (f , request )
51- done = False
52- while done is False :
53- status , done = downloader .next_chunk (num_retries = 3 )
54-
55- return func
56-
57-
58- def one (i ):
59- return len (list (filter (lambda x : x , i ))) == 1
60-
61-
62- def get_sheets ():
63- def func (rows ):
64- total = 0
65- for row in rows :
66- print ('Attempting with %r' % row )
67- wb = load_workbook (row ['filename' ])
68- for sheet_name in wb .sheetnames :
69- if 'deleted' in sheet_name .strip ().lower ():
70- continue
71- row = copy (row )
72- row ['sheet' ] = sheet_name
73- row ['headers' ] = None
74- sheet = wb [sheet_name ]
75- for i , cells in enumerate (sheet .rows , start = 1 ):
76- headers = [x .value for x in cells ]
77- if not any (headers ):
78- continue
79- assert one (x in headers
80- for x in ['Domain' , 'Life Domains' ]),\
81- 'DOMAIN %r' % list (zip (headers , [x .value for x in list (sheet .rows )[i + 1 ]]))
82- if 'migdar_id' not in headers :
83- print ('BAD HEADERS' , row ['name' ], sheet_name )
84- continue
85- if i > 3 :
86- break
87- migdar_id_col = headers .index ('migdar_id' )
88- row ['headers' ] = i
89- j = i + 1
90- while sheet .cell (row = j , column = migdar_id_col ).value :
91- j += 1
92- print ('%s // %s: Found %r ROWS' % (row ['filename' ], sheet_name , j - i - 1 ))
93- total += j - i - 1
94- break
95- if row .get ('headers' ) is not None :
96- yield row
97- break
98- print ('TOTAL ROWS' , total )
99- return func
def list_all_sheet_ids(google_doc_id):
    """Return one edit URL per sheet (tab) of a Google spreadsheet.

    Queries the Sheets API through the module-level ``service`` client for the
    ``sheetId`` and ``title`` of every sheet in the document, prints a
    ``title → gid`` line for each, and builds the matching URL.

    Args:
        google_doc_id: ID of the Google spreadsheet document.

    Returns:
        A list of strings shaped like
        ``https://docs.google.com/spreadsheets/d/<doc id>/edit#gid=<sheet id>``,
        in the order the API response lists the sheets.
    """
    # Restrict the response to the two properties that are actually used.
    spreadsheet = service.spreadsheets().get(
        spreadsheetId=google_doc_id,
        fields="sheets(properties(sheetId,title))",
    ).execute()

    ret = []
    for sheet in spreadsheet['sheets']:
        props = sheet['properties']
        print(f"{props['title']} → gid={props['sheetId']}")
        # The URL must not contain any whitespace: it is used as a load
        # source, and the gid is parsed back out of the '#gid=' fragment.
        ret.append(
            f"https://docs.google.com/spreadsheets/d/{google_doc_id}"
            f"/edit#gid={props['sheetId']}"
        )
    return ret
10049
10150
10251years = re .compile ('[12][0-9]{3}' )
@@ -132,31 +81,14 @@ def func(row):
13281
13382
13483def base_flow ():
135- sources , * _ = Flow (
136- list_gdrive (),
137- filter_rows (lambda row : (
138- row ['kind' ] == 'drive#file' and
139- row ['mimeType' ] == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
140- )),
141- add_field ('filename' , 'string' ,
142- default = lambda row : 'pubfiles/{modifiedTime}-{id}.xlsx' .format (** row )),
143- parallelize (
144- download_files (),
145- num_processors = 8 ,
146- ),
147- add_field ('sheet' , 'string' ),
148- add_field ('headers' , 'integer' , 1 ),
149- get_sheets (),
150- ).results ()
84+ sources = list_all_sheet_ids (GOOGLE_SHEETS_ID )
15185 return Flow (
15286 * [
153- load (source ['filename' ],
154- sheet = source ['sheet' ],
155- headers = source ['headers' ],
87+ load (source ,
15688 infer_strategy = load .INFER_STRINGS ,
15789 cast_strategy = load .CAST_TO_STRINGS ,
158- name = source [ 'filename' ])
159- for source in sources [ 0 ]
90+ name = source . split ( '#' )[ 1 ]. split ( '=' )[ 1 ])
91+ for source in sources
16092 ],
16193 filter_rows (lambda row : row .get ('migdar_id' ) not in ('' , 'None' , None )),
16294 load ('data/zotero/zotero.csv' ),
@@ -170,13 +102,13 @@ def base_flow():
170102 'notes' : [],
171103 'tags' : ['Tags' ],
172104 'publisher' : [],
173- 'languages' : ['language_code' ],
174- 'item_kind' : ['Item Type' , 'Item type' , 'item_type' ],
175- 'pubyear' : ['pubyear/pubdate' ],
176- 'life_areas' : ['Life Domains' , 'Domain' ],
177- 'source_kind' : ['Resource Type' , 'Resource type' ],
178- 'authors' : ['author' ],
179- 'url' : ['URL' ],
105+ 'languages' : [],
106+ 'item_kind' : [],
107+ 'pubyear' : [],
108+ 'life_areas' : [],
109+ 'source_kind' : [],
110+ 'authors' : [],
111+ 'url' : [],
180112
181113 },
182114 target = dict (
0 commit comments