
Commit e1cb0ff

restore publications flow
1 parent 0acb239 commit e1cb0ff

File tree

2 files changed: +32, -101 lines

datapackage_pipelines_migdar/flows/publications.py

Lines changed: 27 additions & 95 deletions
@@ -20,83 +20,32 @@
 KEY_PATTERN = 'publications/{migdar_id}'
 PAGE_TITLE_PATTERN = '{title}'
 
-SCOPES = ['https://www.googleapis.com/auth/drive']
+SCOPES = ['https://www.googleapis.com/auth/spreadsheets.readonly']
 try:
     credentials = Credentials.from_service_account_file(
         '/migdar-gdrive/secret-g-service-account.json', scopes=SCOPES)
 except Exception:
     logging.exception('Failed to open creds!')
     credentials = Credentials.from_service_account_file(
         'gdrive_creds.json', scopes=SCOPES)
-drive_service = build('drive', 'v3', credentials=credentials)
 
+service = build('sheets', 'v4', credentials=credentials)
 
-def list_gdrive():
-    results = drive_service.files().list(
-        q="'16bSopg9nlQDBN8gsjW712xuBWy16gPW0' in parents",
-        fields='files(id,kind,name,mimeType,modifiedTime)').execute()
-    yield from results.get('files', [])
+GOOGLE_SHEETS_ID = '1IPRvpogUZ06R9zVRPdZeYfAwdrs9hx0iRB8zSFubl_o'
 
-
-def download_files():
-    os.makedirs('pubfiles', exist_ok=True)
-
-    def func(row):
-        filename = row['filename']
-        if not os.path.exists(filename):
-            print('Downloading', filename)
-            with open(filename, 'wb') as f:
-                request = drive_service.files().get_media(fileId=row['id'])
-                downloader = MediaIoBaseDownload(f, request)
-                done = False
-                while done is False:
-                    status, done = downloader.next_chunk(num_retries=3)
-
-    return func
-
-
-def one(i):
-    return len(list(filter(lambda x: x, i))) == 1
-
-
-def get_sheets():
-    def func(rows):
-        total = 0
-        for row in rows:
-            print('Attempting with %r' % row)
-            wb = load_workbook(row['filename'])
-            for sheet_name in wb.sheetnames:
-                if 'deleted' in sheet_name.strip().lower():
-                    continue
-                row = copy(row)
-                row['sheet'] = sheet_name
-                row['headers'] = None
-                sheet = wb[sheet_name]
-                for i, cells in enumerate(sheet.rows, start=1):
-                    headers = [x.value for x in cells]
-                    if not any(headers):
-                        continue
-                    assert one(x in headers
-                               for x in ['Domain', 'Life Domains']),\
-                        'DOMAIN %r' % list(zip(headers, [x.value for x in list(sheet.rows)[i+1]]))
-                    if 'migdar_id' not in headers:
-                        print('BAD HEADERS', row['name'], sheet_name)
-                        continue
-                    if i > 3:
-                        break
-                    migdar_id_col = headers.index('migdar_id')
-                    row['headers'] = i
-                    j = i + 1
-                    while sheet.cell(row=j, column=migdar_id_col).value:
-                        j += 1
-                    print('%s // %s: Found %r ROWS' % (row['filename'], sheet_name, j - i - 1))
-                    total += j - i - 1
-                    break
-                if row.get('headers') is not None:
-                    yield row
-            break
-        print('TOTAL ROWS', total)
-    return func
+def list_all_sheet_ids(google_doc_id):
+    # Get all 'gid' numbers of the google spreadsheet:
+    spreadsheet = service.spreadsheets().get(
+        spreadsheetId=google_doc_id,
+        fields="sheets(properties(sheetId,title))"
+    ).execute()
+    ret = []
+    for sheet in spreadsheet['sheets']:
+        props = sheet['properties']
+        print(f"{props['title']} → gid={props['sheetId']}")
+        # append the full URL of the sheet to ret
+        ret.append(f"https://docs.google.com/spreadsheets/d/{google_doc_id}/edit#gid={props['sheetId']}")
+    return ret
 
 
 years = re.compile('[12][0-9]{3}')
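
Editor's note: the replacement helper is a single Sheets API call with a partial-response field mask. A minimal standalone sketch of the same call (not part of the commit; it assumes google-api-python-client is installed and reuses the fallback credentials path and scope from the diff above):

    # Sketch: enumerate every tab's gid and title in a spreadsheet via the
    # Google Sheets v4 API, mirroring list_all_sheet_ids() above.
    from google.oauth2.service_account import Credentials
    from googleapiclient.discovery import build

    SCOPES = ['https://www.googleapis.com/auth/spreadsheets.readonly']
    GOOGLE_SHEETS_ID = '1IPRvpogUZ06R9zVRPdZeYfAwdrs9hx0iRB8zSFubl_o'

    credentials = Credentials.from_service_account_file(
        'gdrive_creds.json', scopes=SCOPES)  # fallback path from the diff
    service = build('sheets', 'v4', credentials=credentials)

    # The fields mask trims the response to each tab's sheetId and title.
    spreadsheet = service.spreadsheets().get(
        spreadsheetId=GOOGLE_SHEETS_ID,
        fields='sheets(properties(sheetId,title))',
    ).execute()

    for sheet in spreadsheet['sheets']:
        props = sheet['properties']
        print(f"{props['title']} -> gid={props['sheetId']}")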
@@ -132,31 +81,14 @@ def func(row):
 
 
 def base_flow():
-    sources, *_ = Flow(
-        list_gdrive(),
-        filter_rows(lambda row: (
-            row['kind'] == 'drive#file' and
-            row['mimeType'] == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
-        )),
-        add_field('filename', 'string',
-                  default=lambda row: 'pubfiles/{modifiedTime}-{id}.xlsx'.format(**row)),
-        parallelize(
-            download_files(),
-            num_processors=8,
-        ),
-        add_field('sheet', 'string'),
-        add_field('headers', 'integer', 1),
-        get_sheets(),
-    ).results()
+    sources = list_all_sheet_ids(GOOGLE_SHEETS_ID)
     return Flow(
         *[
-            load(source['filename'],
-                 sheet=source['sheet'],
-                 headers=source['headers'],
+            load(source,
                  infer_strategy=load.INFER_STRINGS,
                  cast_strategy=load.CAST_TO_STRINGS,
-                 name=source['filename'])
-            for source in sources[0]
+                 name=source.split('#')[1].split('=')[1])
+            for source in sources
         ],
         filter_rows(lambda row: row.get('migdar_id') not in ('', 'None', None)),
         load('data/zotero/zotero.csv'),
@@ -170,13 +102,13 @@ def base_flow():
             'notes': [],
             'tags': ['Tags'],
             'publisher': [],
-            'languages': ['language_code'],
-            'item_kind': ['Item Type', 'Item type', 'item_type'],
-            'pubyear': ['pubyear/pubdate'],
-            'life_areas': ['Life Domains', 'Domain'],
-            'source_kind': ['Resource Type', 'Resource type'],
-            'authors': ['author'],
-            'url': ['URL'],
+            'languages': [],
+            'item_kind': [],
+            'pubyear': [],
+            'life_areas': [],
+            'source_kind': [],
+            'authors': [],
+            'url': [],
 
         },
         target=dict(
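
Editor's note: base_flow() now hands load() full docs.google.com URLs and names each resource after the gid taken from the URL fragment. A minimal sketch of that pattern for a single tab (an illustration, not the commit's code; it assumes the spreadsheet is readable by the caller, that dataflows resolves Google Sheets URLs, and the gid=0 URL is hypothetical):

    # Sketch: load one per-tab sheet URL with dataflows, naming the resource
    # after its gid the way base_flow() does.
    from dataflows import Flow, load, printer

    sheet_url = ('https://docs.google.com/spreadsheets/d/'
                 '1IPRvpogUZ06R9zVRPdZeYfAwdrs9hx0iRB8zSFubl_o/edit#gid=0')

    gid = sheet_url.split('#')[1].split('=')[1]  # everything after '#gid='

    Flow(
        load(sheet_url,
             infer_strategy=load.INFER_STRINGS,   # keep values as strings,
             cast_strategy=load.CAST_TO_STRINGS,  # as the flow above does
             name=gid),
        printer(),
    ).process()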

pipeline-spec.yaml

Lines changed: 5 additions & 6 deletions
@@ -24,12 +24,11 @@ zotero_fetch:
   - flow: datapackage_pipelines_migdar.flows.zotero
 
 
-# google drive was deleted, we rely on existing data in elasticsearch restored by notebooks/restore_publications.ipynb
-#publications:
-#  dependencies:
-#  - pipeline: ./zotero_fetch
-#  pipeline:
-#  - flow: datapackage_pipelines_migdar.flows.publications
+publications:
+  dependencies:
+  - pipeline: ./zotero_fetch
+  pipeline:
+  - flow: datapackage_pipelines_migdar.flows.publications
 
 
 sitemap:
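
Editor's note: with the commented-out block restored, publications is scheduled again after zotero_fetch. Locally it should be runnable with the standard datapackage-pipelines CLI (assuming dpp is installed and invoked from the directory holding pipeline-spec.yaml):

    dpp run ./publications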
