-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_scraper.py
More file actions
94 lines (72 loc) · 2.09 KB
/
data_scraper.py
File metadata and controls
94 lines (72 loc) · 2.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: juanitagomez
"""
import json
import pickle
import time
from pathlib import Path
from scholarly import scholarly
# Root directory where each scraped publication is cached as <query>/<idx>.pickle.
PICKLE_CACHE_DIR = Path("pickle_cache")
def load_config(config_file):
    """
    Load queries and year range from a JSON config file.

    Parameters
    ----------
    config_file : str
        Path to the JSON configuration file.

    Returns
    -------
    tuple
        ``(queries, start_year, end_year)`` where ``queries`` is a list of
        search strings (``[]`` if absent) and each year is an int or ``None``
        when missing from the config.
    """
    # Explicit encoding: JSON is UTF-8 by spec; the platform default may not be.
    with open(config_file, 'r', encoding='utf-8') as file:
        config = json.load(file)
    return (
        config.get("queries", []),
        config.get("start_year"),
        config.get("end_year"),
    )
def search_papers(queries):
    """
    Search Google Scholar for papers matching each query and cache them.

    For every query, each publication found is filled via ``scholarly.fill``
    and pickled to ``PICKLE_CACHE_DIR/<query>/<idx>.pickle``. Progress (the
    last query and index processed, whether the run completed, and any error
    message) is always written to ``progress.json``, even when the run is
    interrupted, so a later run can tell where this one stopped.

    Parameters
    ----------
    queries : list
        List of search query strings.

    Returns
    -------
    None
        Results are persisted to disk rather than returned.
    """
    # Pre-bind progress markers so the `finally` block can always write them,
    # even if `queries` is empty or an error occurs before the loop starts
    # (the original raised NameError in those cases).
    query = None
    idx = -1
    try:
        for query in queries:
            query_cache_dir = PICKLE_CACHE_DIR / query
            query_cache_dir.mkdir(parents=True, exist_ok=True)
            search_results = scholarly.search_pubs(query)
            for idx, paper in enumerate(search_results):
                time.sleep(5)  # throttle to avoid Google Scholar rate limiting
                scholarly.fill(paper)
                with open(query_cache_dir / (str(idx) + '.pickle'), 'wb') as pickle_f:
                    pickle.dump(paper, pickle_f)
                time.sleep(5)
    # BaseException (not Exception) is deliberate: KeyboardInterrupt must also
    # land in progress.json. The error is recorded rather than re-raised —
    # this is a best-effort scraper.
    except BaseException as e:
        completed = False
        error = str(e)
    else:
        completed = True
        error = None  # explicit "no error" instead of the old "dummy" sentinel
    finally:
        with open("progress.json", 'w', encoding="UTF-8") as f:
            json.dump({"query": query, "idx": idx, "completed": completed, "error": error}, f, indent=4)
def scrape():
    """Entry point: read queries from ``config.json`` and scrape each one."""
    # Year bounds are loaded but not used by search_papers at present.
    queries, _start_year, _end_year = load_config("config.json")
    search_papers(queries)
# Guard the entry point so importing this module does not trigger a scrape.
if __name__ == "__main__":
    scrape()