-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path1-arxiv_scrape.py
More file actions
138 lines (80 loc) · 4.33 KB
/
1-arxiv_scrape.py
File metadata and controls
138 lines (80 loc) · 4.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 3 13:04:14 2020
@author: codyotoole
"""
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
# --- Fetch arXiv advanced-search result pages -------------------------------
# The query matches COVID-19 / SARS-CoV-2 / coronavirus in the title or
# abstract; arXiv serves 200 results per page, paged via the `start` offset.
SEARCH_URL = 'https://arxiv.org/search/advanced?advanced=&terms-0-operator=AND&terms-0-term=COVID-19&terms-0-field=title&terms-1-operator=OR&terms-1-term=SARS-CoV-2&terms-1-field=abstract&terms-3-operator=OR&terms-3-term=COVID-19&terms-3-field=abstract&terms-4-operator=OR&terms-4-term=SARS-CoV-2&terms-4-field=title&terms-5-operator=OR&terms-5-term=coronavirus&terms-5-field=title&terms-6-operator=OR&terms-6-term=coronavirus&terms-6-field=abstract&classification-physics_archives=all&classification-include_cross_list=include&date-filter_by=all_dates&date-year=&date-from_date=&date-to_date=&date-date_type=submitted_date&abstracts=show&size=200&order=-announced_date_first&source=home-covid-19'

# One <li class="arxiv-result"> element per paper, accumulated across pages.
covid_news = []
for start in range(0, 600, 200):
    # The first page carries no explicit offset, matching the original query.
    url = SEARCH_URL if start == 0 else SEARCH_URL + '&start=' + str(start)
    response = requests.get(url)
    response.raise_for_status()  # fail loudly instead of parsing an error page
    page_html = BeautifulSoup(response.text, 'html5lib')
    covid_news.extend(page_html.find_all('li', class_='arxiv-result'))

print(len(covid_news))  # sanity check: how many results were collected
# --- Extract one value per field from every search result -------------------
# Parallel lists, one entry per paper, later zipped into a DataFrame.
list_links = []
list_titles = []
list_dates = []
list_authors = []
list_abstracts = []

# Iterate the result elements directly instead of indexing with np.arange.
for result in covid_news:
    # The abstract-page link lives inside the "list-title" paragraph.
    link_data = result.find('p', class_='list-title is-inline-block')
    list_links.append(link_data.find('a')['href'])
    # Paper title.
    list_titles.append(result.find('p', class_='title is-5 mathjax').get_text())
    # The is-size-7 paragraph holds the "Submitted <date>; ..." line.
    list_dates.append(result.find('p', class_='is-size-7').get_text())
    # Raw "Authors: ..." text; the label is stripped during cleaning below.
    list_authors.append(result.find('p', class_='authors').get_text())
    # Full abstract, including the "△ Less" expander suffix removed later.
    list_abstracts.append(result.find('span', class_='abstract-full has-text-grey-dark mathjax').get_text())
# Assemble the parallel lists into one DataFrame row per article.
# All lists were filled in lockstep above, so their lengths agree.
columns = {
    'Link': list_links,
    'Title': list_titles,
    'Authors': list_authors,
    'Abstract': list_abstracts,
    'Date': list_dates,
}
df_arxiv = pd.DataFrame(columns)
# --- Clean the scraped text columns -----------------------------------------
# Vectorized .str operations replace the original per-row Python loops and
# their chained-assignment writes (df['col'][i] = ...), which trigger
# pandas' SettingWithCopy hazard and are deprecated in modern pandas.

# Keep only the submission date before the first ';'
# (the raw text is "Submitted <date>; originally announced ...").
df_arxiv['Date'] = df_arxiv['Date'].str.split(';', n=1).str[0]
# Remove literal newlines everywhere in the frame.
df_arxiv = df_arxiv.replace('\n', '', regex=True)
# Trim surrounding whitespace from titles.
df_arxiv['Title'] = df_arxiv['Title'].str.strip()
# Trim the abstract, then cut the "△ Less" expander suffix.
df_arxiv['Abstract'] = df_arxiv['Abstract'].str.strip().str.split('△', n=1).str[0]
# Drop the leading "Authors:" label and trim what remains.
df_arxiv['Authors'] = df_arxiv['Authors'].str.split(':', n=1).str[1].str.strip()
# Point links at the PDF rather than the abstract page.
df_arxiv['Link'] = df_arxiv['Link'].str.replace('abs', 'pdf', regex=False)
# Write the cleaned metadata next to the script (a relative path) instead of
# a hard-coded absolute path to one user's Desktop, so the scraper is
# portable across machines.
df_arxiv.to_csv('arxiv_meta.csv')