-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_scraper.py
More file actions
94 lines (72 loc) · 2.09 KB
/
data_scraper.py
File metadata and controls
94 lines (72 loc) · 2.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: juanitagomez
"""
import json
import pickle
import time
from pathlib import Path
from scholarly import scholarly
# Root directory where each scraped publication is cached as <query>/<idx>.pickle.
PICKLE_CACHE_DIR = Path("pickle_cache")
def load_config(config_file):
    """
    Load queries and year range from a JSON config file.

    Parameters
    ----------
    config_file : str
        Path to the JSON configuration file.

    Returns
    -------
    tuple
        ``(queries, start_year, end_year)`` where ``queries`` is a list of
        search strings (``[]`` if absent) and each year is an int or ``None``
        when missing from the config.
    """
    # Explicit encoding: JSON is UTF-8 by spec; the platform default may not be.
    with open(config_file, 'r', encoding='utf-8') as file:
        config = json.load(file)
    return (
        config.get("queries", []),
        config.get("start_year"),
        config.get("end_year"),
    )
def search_papers(queries):
    """
    Search Google Scholar for papers matching each query and cache them.

    For every query, each publication found is filled via ``scholarly.fill``
    and pickled to ``PICKLE_CACHE_DIR/<query>/<idx>.pickle``. Progress (the
    last query and index processed, whether the run completed, and any error
    message) is always written to ``progress.json``, even when the run is
    interrupted, so a later run can tell where this one stopped.

    Parameters
    ----------
    queries : list
        List of search query strings.

    Returns
    -------
    None
        Results are persisted to disk rather than returned.
    """
    # Pre-bind progress markers so the `finally` block can always write them,
    # even if `queries` is empty or an error occurs before the loop starts
    # (the original raised NameError in those cases).
    query = None
    idx = -1
    try:
        for query in queries:
            query_cache_dir = PICKLE_CACHE_DIR / query
            query_cache_dir.mkdir(parents=True, exist_ok=True)
            search_results = scholarly.search_pubs(query)
            for idx, paper in enumerate(search_results):
                time.sleep(5)  # throttle to avoid Google Scholar rate limiting
                scholarly.fill(paper)
                with open(query_cache_dir / (str(idx) + '.pickle'), 'wb') as pickle_f:
                    pickle.dump(paper, pickle_f)
                time.sleep(5)
    # BaseException (not Exception) is deliberate: KeyboardInterrupt must also
    # land in progress.json. The error is recorded rather than re-raised —
    # this is a best-effort scraper.
    except BaseException as e:
        completed = False
        error = str(e)
    else:
        completed = True
        error = None  # explicit "no error" instead of the old "dummy" sentinel
    finally:
        with open("progress.json", 'w', encoding="UTF-8") as f:
            json.dump({"query": query, "idx": idx, "completed": completed, "error": error}, f, indent=4)
def scrape():
    """Entry point: read queries from ``config.json`` and scrape each one."""
    # Year bounds are loaded but not used by search_papers at present.
    queries, _start_year, _end_year = load_config("config.json")
    search_papers(queries)
# Guard the entry point so importing this module does not trigger a scrape.
if __name__ == "__main__":
    scrape()