-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path1-arxiv_scrape.py
More file actions
138 lines (80 loc) · 4.33 KB
/
1-arxiv_scrape.py
File metadata and controls
138 lines (80 loc) · 4.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 3 13:04:14 2020
@author: codyotoole
"""
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
# --- Fetch arXiv advanced-search result pages -------------------------------
# The query matches COVID-19 / SARS-CoV-2 / coronavirus in the title or
# abstract; arXiv serves 200 results per page, paged via the `start` offset.
SEARCH_URL = 'https://arxiv.org/search/advanced?advanced=&terms-0-operator=AND&terms-0-term=COVID-19&terms-0-field=title&terms-1-operator=OR&terms-1-term=SARS-CoV-2&terms-1-field=abstract&terms-3-operator=OR&terms-3-term=COVID-19&terms-3-field=abstract&terms-4-operator=OR&terms-4-term=SARS-CoV-2&terms-4-field=title&terms-5-operator=OR&terms-5-term=coronavirus&terms-5-field=title&terms-6-operator=OR&terms-6-term=coronavirus&terms-6-field=abstract&classification-physics_archives=all&classification-include_cross_list=include&date-filter_by=all_dates&date-year=&date-from_date=&date-to_date=&date-date_type=submitted_date&abstracts=show&size=200&order=-announced_date_first&source=home-covid-19'

# One <li class="arxiv-result"> element per paper, accumulated across pages.
covid_news = []
for start in range(0, 600, 200):
    # The first page carries no explicit offset, matching the original query.
    url = SEARCH_URL if start == 0 else SEARCH_URL + '&start=' + str(start)
    response = requests.get(url)
    response.raise_for_status()  # fail loudly instead of parsing an error page
    page_html = BeautifulSoup(response.text, 'html5lib')
    covid_news.extend(page_html.find_all('li', class_='arxiv-result'))

print(len(covid_news))  # sanity check: how many results were collected
# --- Extract one value per field from every search result -------------------
# Parallel lists, one entry per paper, later zipped into a DataFrame.
list_links = []
list_titles = []
list_dates = []
list_authors = []
list_abstracts = []

# Iterate the result elements directly instead of indexing with np.arange.
for result in covid_news:
    # The abstract-page link lives inside the "list-title" paragraph.
    link_data = result.find('p', class_='list-title is-inline-block')
    list_links.append(link_data.find('a')['href'])
    # Paper title.
    list_titles.append(result.find('p', class_='title is-5 mathjax').get_text())
    # The is-size-7 paragraph holds the "Submitted <date>; ..." line.
    list_dates.append(result.find('p', class_='is-size-7').get_text())
    # Raw "Authors: ..." text; the label is stripped during cleaning below.
    list_authors.append(result.find('p', class_='authors').get_text())
    # Full abstract, including the "△ Less" expander suffix removed later.
    list_abstracts.append(result.find('span', class_='abstract-full has-text-grey-dark mathjax').get_text())
# Assemble the parallel lists into one DataFrame row per article.
# All lists were filled in lockstep above, so their lengths agree.
columns = {
    'Link': list_links,
    'Title': list_titles,
    'Authors': list_authors,
    'Abstract': list_abstracts,
    'Date': list_dates,
}
df_arxiv = pd.DataFrame(columns)
# --- Clean the scraped text columns -----------------------------------------
# Vectorized .str operations replace the original per-row Python loops and
# their chained-assignment writes (df['col'][i] = ...), which trigger
# pandas' SettingWithCopy hazard and are deprecated in modern pandas.

# Keep only the submission date before the first ';'
# (the raw text is "Submitted <date>; originally announced ...").
df_arxiv['Date'] = df_arxiv['Date'].str.split(';', n=1).str[0]
# Remove literal newlines everywhere in the frame.
df_arxiv = df_arxiv.replace('\n', '', regex=True)
# Trim surrounding whitespace from titles.
df_arxiv['Title'] = df_arxiv['Title'].str.strip()
# Trim the abstract, then cut the "△ Less" expander suffix.
df_arxiv['Abstract'] = df_arxiv['Abstract'].str.strip().str.split('△', n=1).str[0]
# Drop the leading "Authors:" label and trim what remains.
df_arxiv['Authors'] = df_arxiv['Authors'].str.split(':', n=1).str[1].str.strip()
# Point links at the PDF rather than the abstract page.
df_arxiv['Link'] = df_arxiv['Link'].str.replace('abs', 'pdf', regex=False)
# Write the cleaned metadata next to the script (a relative path) instead of
# a hard-coded absolute path to one user's Desktop, so the scraper is
# portable across machines.
df_arxiv.to_csv('arxiv_meta.csv')