get_speeches.py
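"""Scrape Federal Reserve speeches from 2006 through last year and save
them to data/fed_speeches.json."""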
import pandas as pd
import datetime as d
import os
import re
import requests
from bs4 import BeautifulSoup

# Create the data directory if it doesn't exist
os.makedirs('data/', exist_ok=True)

# Scrape the speeches from the Fed website
def scrape_speeches():
    # Set the current year
    current_year = d.datetime.now().year
    # Create a list of years from 2006 through last year
    years = list(range(2006, current_year))
    dfs = []
    for year in years:
        # Choose the proper webpage to access (the URL scheme changed in 2011)
        print('Collecting data for year {}...'.format(year))
        if year < 2011:
            page = 'https://www.federalreserve.gov/newsevents/speech/{}speech.htm'.format(year)
        else:
            page = 'https://www.federalreserve.gov/newsevents/speech/{}-speeches.htm'.format(year)
        # Access the page
        page_response = requests.get(page)
        page_content = BeautifulSoup(page_response.content, 'html.parser')
        # Collect all of the rows; the tenth row holds the speech listing
        rows = page_content.find_all('div', attrs={'class': 'row'})
        speeches = rows[9]
        # Use a regular expression to keep only links that point to speech pages
        links = speeches.find_all('a', attrs={'href': re.compile(r'\w+\d+\w\.htm')})
        # Get all of the links to the articles
        urls = [link['href'] for link in links]
        # Begin the process of scraping and collecting the data
        dates = []
        speakers = []
        locations = []
        speech_text = []
        base_url = 'https://www.federalreserve.gov'
        print('Scraping speeches for {}...'.format(year))
        for url in urls:
            link = base_url + url
            # Access the speech page
            page_response = requests.get(link)
            page_content = BeautifulSoup(page_response.content, 'html.parser')
            # Collect data and append to the lists
            # date
            date = page_content.find('p', attrs={'class': 'article__time'}).text
            dates.append(date)
            # speaker
            speaker = page_content.find('p', attrs={'class': 'speaker'}).text
            speakers.append(speaker)
            # location
            location = page_content.find('p', attrs={'class': 'location'}).text
            locations.append(location)
            # speech text: join all paragraphs in the main content block
            block = page_content.find('div', attrs={'class': 'col-xs-12 col-sm-8 col-md-8'})
            paragraphs = block.find_all('p')
            text = ' '.join(paragraph.text for paragraph in paragraphs)
            speech_text.append(text)
        # Save the year's speeches as a dataframe
        df = pd.DataFrame(data={'Date': dates, 'Speaker': speakers, 'Location': locations, 'Speech': speech_text})
        # Append the dataframe to the main list
        dfs.append(df)
        print('{} is done!'.format(year))
    # Combine every year to form the dataset
    data = pd.concat(dfs, ignore_index=True)
    # Save the dataset
    data.to_json('data/fed_speeches.json', orient='records')

# Execute the function from the command line
if __name__ == '__main__':
    scrape_speeches()
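
# Usage: python get_speeches.py
# The resulting dataset can then be loaded with, for example:
#   pd.read_json('data/fed_speeches.json', orient='records')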