Skip to content

Commit 3d17eeb

Browse files
committed
Scraping, Plotly example rewrite
1 parent 51d1127 commit 3d17eeb

File tree

2 files changed

+66
-46
lines changed

2 files changed

+66
-46
lines changed

README.md

Lines changed: 32 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -2510,10 +2510,9 @@ document = bs4.BeautifulSoup(response.text, 'html.parser')
25102510
table = document.find('table', class_='infobox vevent')
25112511
python_url = table.find('th', text='Website').next_sibling.a['href']
25122512
logo_url = table.find('img')['src']
2513-
logo = requests.get(f'https:{logo_url}').content
25142513
filename = os.path.basename(logo_url)
25152514
with open(filename, 'wb') as file:
2516-
file.write(logo)
2515+
file.write(requests.get(f'https:{logo_url}').content)
25172516
print(f'{python_url}, file://{os.path.abspath(filename)}')
25182517
```
25192518

@@ -2525,6 +2524,7 @@ from selenium import webdriver
25252524

25262525
<WebDrv> = webdriver.Chrome/Firefox/Safari/Edge() # Opens a browser. Also <WebDrv>.quit().
25272526
<WebDrv>.get('<url>') # Also <WebDrv>.implicitly_wait(seconds).
2527+
<str> = <WebDrv>.page_source # Returns HTML of the page as currently rendered.
25282528
<El> = <WebDrv/El>.find_element('css selector', …) # '<tag>#<id>.<class>[<attr>="<val>"]…'.
25292529
<list> = <WebDrv/El>.find_elements('xpath', …) # '//<tag>[@<attr>="<val>"]…'. See XPath.
25302530
<str> = <El>.get_attribute(<str>) # Property if exists. Also <El>.text.
@@ -3457,32 +3457,41 @@ px.line(df, x='Date', y='Total Deaths per Million', color='Continent').show()
34573457
<div id="e23ccacc-a456-478b-b467-7282a2165921" class="plotly-graph-div" style="height:287px; width:935px;"></div>
34583458

34593459
```python
3460-
import pandas as pd, plotly.graph_objects as go
3460+
# $ pip3 install pandas selenium plotly lxml
3461+
import pandas as pd, selenium.webdriver, plotly.graph_objects as go
3462+
34613463

34623464
def main():
3463-
covid, bitcoin, gold, dow = scrape_data()
3465+
covid, (bitcoin, gold, dow) = get_covid_cases(), get_tickers()
34643466
df = wrangle_data(covid, bitcoin, gold, dow)
34653467
display_data(df)
34663468

3467-
def scrape_data():
3468-
def get_covid_cases():
3469-
url = 'https://covid.ourworldindata.org/data/owid-covid-data.csv'
3470-
df = pd.read_csv(url, usecols=['location', 'date', 'total_cases'])
3471-
df = df[df.location == 'World']
3472-
return df.set_index('date').total_cases
3473-
def get_ticker(symbol):
3474-
url = (f'https://query1.finance.yahoo.com/v7/finance/download/{symbol}?'
3475-
'period1=1579651200&period2=9999999999&interval=1d&events=history')
3476-
df = pd.read_csv(url, usecols=['Date', 'Close'])
3477-
return df.set_index('Date').Close
3478-
out = get_covid_cases(), get_ticker('BTC-USD'), get_ticker('GC=F'), get_ticker('^DJI')
3479-
names = ['Total Cases', 'Bitcoin', 'Gold', 'Dow Jones']
3480-
return map(pd.Series.rename, out, names)
3469+
def get_covid_cases():
3470+
url = 'https://covid.ourworldindata.org/data/owid-covid-data.csv'
3471+
df = pd.read_csv(url, usecols=['location', 'date', 'total_cases'], parse_dates=['date'])
3472+
df = df[df.location == 'World']
3473+
s = df.set_index('date').total_cases
3474+
return s.rename('Total Cases')
3475+
3476+
def get_tickers():
3477+
with selenium.webdriver.Chrome() as driver:
3478+
symbols = {'Bitcoin': 'BTC-USD', 'Gold': 'GC=F', 'Dow Jones': '%5EDJI'}
3479+
for name, symbol in symbols.items():
3480+
yield get_ticker(driver, name, symbol)
3481+
3482+
def get_ticker(driver, name, symbol):
3483+
url = f'https://finance.yahoo.com/quote/{symbol}/history/'
3484+
driver.get(url + '?period1=1579651200&period2=9999999999')
3485+
if buttons := driver.find_elements('xpath', '//button[@name="reject"]'):
3486+
buttons[0].click()
3487+
dataframes = pd.read_html(driver.page_source, parse_dates=['Date'])
3488+
s = dataframes[0].set_index('Date').Open
3489+
return s.rename(name)
34813490

34823491
def wrangle_data(covid, bitcoin, gold, dow):
34833492
df = pd.concat([bitcoin, gold, dow], axis=1) # Creates table by joining columns on dates.
34843493
df = df.sort_index().interpolate() # Sorts rows by date and interpolates NaNs.
3485-
df = df.loc['2020-02-23':] # Discards rows before '2020-02-23'.
3494+
df = df.loc['2020-02-23':'2021-12-20'] # Keeps rows between specified dates.
34863495
df = (df / df.iloc[0]) * 100 # Calculates percentages relative to day 1.
34873496
df = df.join(covid) # Adds column with covid cases.
34883497
return df.sort_values(df.index[-1], axis=1) # Sorts columns by last day's value.
@@ -3494,11 +3503,12 @@ def display_data(df):
34943503
trace = go.Scatter(x=df.index, y=df[col_name], name=col_name, yaxis=yaxis)
34953504
figure.add_trace(trace)
34963505
figure.update_layout(
3506+
width=944,
3507+
height=423,
34973508
yaxis1=dict(title='Total Cases', rangemode='tozero'),
34983509
yaxis2=dict(title='%', rangemode='tozero', overlaying='y', side='right'),
3499-
legend=dict(x=1.08),
3500-
width=944,
3501-
height=423
3510+
colorway=['#EF553B', '#636EFA', '#00CC96', '#FFA152'],
3511+
legend=dict(x=1.08)
35023512
)
35033513
figure.show()
35043514

index.html

Lines changed: 34 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@
5555

5656
<body>
5757
<header>
58-
<aside>December 20, 2024</aside>
58+
<aside>December 24, 2024</aside>
5959
<a href="https://gto76.github.io" rel="author">Jure Šorn</a>
6060
</header>
6161

@@ -2052,10 +2052,9 @@ <h3 id="format-2">Format</h3><div><h4 id="forstandardtypesizesandmanualalignment
20522052
table = document.find(<span class="hljs-string">'table'</span>, class_=<span class="hljs-string">'infobox vevent'</span>)
20532053
python_url = table.find(<span class="hljs-string">'th'</span>, text=<span class="hljs-string">'Website'</span>).next_sibling.a[<span class="hljs-string">'href'</span>]
20542054
logo_url = table.find(<span class="hljs-string">'img'</span>)[<span class="hljs-string">'src'</span>]
2055-
logo = requests.get(<span class="hljs-string">f'https:<span class="hljs-subst">{logo_url}</span>'</span>).content
20562055
filename = os.path.basename(logo_url)
20572056
<span class="hljs-keyword">with</span> open(filename, <span class="hljs-string">'wb'</span>) <span class="hljs-keyword">as</span> file:
2058-
file.write(logo)
2057+
file.write(requests.get(<span class="hljs-string">f'https:<span class="hljs-subst">{logo_url}</span>'</span>).content)
20592058
print(<span class="hljs-string">f'<span class="hljs-subst">{python_url}</span>, file://<span class="hljs-subst">{os.path.abspath(filename)}</span>'</span>)
20602059
</code></pre></div></div>
20612060

@@ -2065,6 +2064,7 @@ <h3 id="format-2">Format</h3><div><h4 id="forstandardtypesizesandmanualalignment
20652064

20662065
&lt;WebDrv&gt; = webdriver.Chrome/Firefox/Safari/Edge() <span class="hljs-comment"># Opens a browser. Also &lt;WebDrv&gt;.quit().</span>
20672066
&lt;WebDrv&gt;.get(<span class="hljs-string">'&lt;url&gt;'</span>) <span class="hljs-comment"># Also &lt;WebDrv&gt;.implicitly_wait(seconds).</span>
2067+
&lt;str&gt; = &lt;WebDrv&gt;.page_source <span class="hljs-comment"># Returns HTML of the page as currently rendered.</span>
20682068
&lt;El&gt; = &lt;WebDrv/El&gt;.find_element(<span class="hljs-string">'css selector'</span>, …) <span class="hljs-comment"># '&lt;tag&gt;#&lt;id&gt;.&lt;class&gt;[&lt;attr&gt;="&lt;val&gt;"]…'.</span>
20692069
&lt;list&gt; = &lt;WebDrv/El&gt;.find_elements(<span class="hljs-string">'xpath'</span>, …) <span class="hljs-comment"># '//&lt;tag&gt;[@&lt;attr&gt;="&lt;val&gt;"]…'. See XPath.</span>
20702070
&lt;str&gt; = &lt;El&gt;.get_attribute(&lt;str&gt;) <span class="hljs-comment"># Property if exists. Also &lt;El&gt;.text.</span>
@@ -2805,32 +2805,41 @@ <h3 id="format-2">Format</h3><div><h4 id="forstandardtypesizesandmanualalignment
28052805

28062806

28072807

2808-
<div><h4 id="displaysamultiaxislinechartoftotalcoronaviruscasesandchangesinpricesofbitcoindowjonesandgold">Displays a multi-axis line chart of total coronavirus cases and changes in prices of Bitcoin, Dow Jones and gold:</h4><p></p><div id="e23ccacc-a456-478b-b467-7282a2165921" class="plotly-graph-div" style="height:287px; width:935px;"></div><pre><code class="python language-python hljs"><span class="hljs-keyword">import</span> pandas <span class="hljs-keyword">as</span> pd, plotly.graph_objects <span class="hljs-keyword">as</span> go
2808+
<div><h4 id="displaysamultiaxislinechartoftotalcoronaviruscasesandchangesinpricesofbitcoindowjonesandgold">Displays a multi-axis line chart of total coronavirus cases and changes in prices of Bitcoin, Dow Jones and gold:</h4><p></p><div id="e23ccacc-a456-478b-b467-7282a2165921" class="plotly-graph-div" style="height:287px; width:935px;"></div><pre><code class="python language-python hljs"><span class="hljs-comment"># $ pip3 install pandas selenium plotly lxml</span>
2809+
<span class="hljs-keyword">import</span> pandas <span class="hljs-keyword">as</span> pd, selenium.webdriver, plotly.graph_objects <span class="hljs-keyword">as</span> go
2810+
28092811

28102812
<span class="hljs-function"><span class="hljs-keyword">def</span> <span class="hljs-title">main</span><span class="hljs-params">()</span>:</span>
2811-
covid, bitcoin, gold, dow = scrape_data()
2813+
covid, (bitcoin, gold, dow) = get_covid_cases(), get_tickers()
28122814
df = wrangle_data(covid, bitcoin, gold, dow)
28132815
display_data(df)
28142816

2815-
<span class="hljs-function"><span class="hljs-keyword">def</span> <span class="hljs-title">scrape_data</span><span class="hljs-params">()</span>:</span>
2816-
<span class="hljs-function"><span class="hljs-keyword">def</span> <span class="hljs-title">get_covid_cases</span><span class="hljs-params">()</span>:</span>
2817-
url = <span class="hljs-string">'https://covid.ourworldindata.org/data/owid-covid-data.csv'</span>
2818-
df = pd.read_csv(url, usecols=[<span class="hljs-string">'location'</span>, <span class="hljs-string">'date'</span>, <span class="hljs-string">'total_cases'</span>])
2819-
df = df[df.location == <span class="hljs-string">'World'</span>]
2820-
<span class="hljs-keyword">return</span> df.set_index(<span class="hljs-string">'date'</span>).total_cases
2821-
<span class="hljs-function"><span class="hljs-keyword">def</span> <span class="hljs-title">get_ticker</span><span class="hljs-params">(symbol)</span>:</span>
2822-
url = (<span class="hljs-string">f'https://query1.finance.yahoo.com/v7/finance/download/<span class="hljs-subst">{symbol}</span>?'</span>
2823-
<span class="hljs-string">'period1=1579651200&amp;period2=9999999999&amp;interval=1d&amp;events=history'</span>)
2824-
df = pd.read_csv(url, usecols=[<span class="hljs-string">'Date'</span>, <span class="hljs-string">'Close'</span>])
2825-
<span class="hljs-keyword">return</span> df.set_index(<span class="hljs-string">'Date'</span>).Close
2826-
out = get_covid_cases(), get_ticker(<span class="hljs-string">'BTC-USD'</span>), get_ticker(<span class="hljs-string">'GC=F'</span>), get_ticker(<span class="hljs-string">'^DJI'</span>)
2827-
names = [<span class="hljs-string">'Total Cases'</span>, <span class="hljs-string">'Bitcoin'</span>, <span class="hljs-string">'Gold'</span>, <span class="hljs-string">'Dow Jones'</span>]
2828-
<span class="hljs-keyword">return</span> map(pd.Series.rename, out, names)
2817+
<span class="hljs-function"><span class="hljs-keyword">def</span> <span class="hljs-title">get_covid_cases</span><span class="hljs-params">()</span>:</span>
2818+
url = <span class="hljs-string">'https://covid.ourworldindata.org/data/owid-covid-data.csv'</span>
2819+
df = pd.read_csv(url, usecols=[<span class="hljs-string">'location'</span>, <span class="hljs-string">'date'</span>, <span class="hljs-string">'total_cases'</span>], parse_dates=[<span class="hljs-string">'date'</span>])
2820+
df = df[df.location == <span class="hljs-string">'World'</span>]
2821+
s = df.set_index(<span class="hljs-string">'date'</span>).total_cases
2822+
<span class="hljs-keyword">return</span> s.rename(<span class="hljs-string">'Total Cases'</span>)
2823+
2824+
<span class="hljs-function"><span class="hljs-keyword">def</span> <span class="hljs-title">get_tickers</span><span class="hljs-params">()</span>:</span>
2825+
<span class="hljs-keyword">with</span> selenium.webdriver.Chrome() <span class="hljs-keyword">as</span> driver:
2826+
symbols = {<span class="hljs-string">'Bitcoin'</span>: <span class="hljs-string">'BTC-USD'</span>, <span class="hljs-string">'Gold'</span>: <span class="hljs-string">'GC=F'</span>, <span class="hljs-string">'Dow Jones'</span>: <span class="hljs-string">'%5EDJI'</span>}
2827+
<span class="hljs-keyword">for</span> name, symbol <span class="hljs-keyword">in</span> symbols.items():
2828+
<span class="hljs-keyword">yield</span> get_ticker(driver, name, symbol)
2829+
2830+
<span class="hljs-function"><span class="hljs-keyword">def</span> <span class="hljs-title">get_ticker</span><span class="hljs-params">(driver, name, symbol)</span>:</span>
2831+
url = <span class="hljs-string">f'https://finance.yahoo.com/quote/<span class="hljs-subst">{symbol}</span>/history/'</span>
2832+
driver.get(url + <span class="hljs-string">'?period1=1579651200&amp;period2=9999999999'</span>)
2833+
<span class="hljs-keyword">if</span> buttons := driver.find_elements(<span class="hljs-string">'xpath'</span>, <span class="hljs-string">'//button[@name="reject"]'</span>):
2834+
buttons[<span class="hljs-number">0</span>].click()
2835+
dataframes = pd.read_html(driver.page_source, parse_dates=[<span class="hljs-string">'Date'</span>])
2836+
s = dataframes[<span class="hljs-number">0</span>].set_index(<span class="hljs-string">'Date'</span>).Open
2837+
<span class="hljs-keyword">return</span> s.rename(name)
28292838

28302839
<span class="hljs-function"><span class="hljs-keyword">def</span> <span class="hljs-title">wrangle_data</span><span class="hljs-params">(covid, bitcoin, gold, dow)</span>:</span>
28312840
df = pd.concat([bitcoin, gold, dow], axis=<span class="hljs-number">1</span>) <span class="hljs-comment"># Creates table by joining columns on dates.</span>
28322841
df = df.sort_index().interpolate() <span class="hljs-comment"># Sorts rows by date and interpolates NaNs.</span>
2833-
df = df.loc[<span class="hljs-string">'2020-02-23'</span>:] <span class="hljs-comment"># Discards rows before '2020-02-23'.</span>
2842+
df = df.loc[<span class="hljs-string">'2020-02-23'</span>:<span class="hljs-string">'2021-12-20'</span>] <span class="hljs-comment"># Keeps rows between specified dates.</span>
28342843
df = (df / df.iloc[<span class="hljs-number">0</span>]) * <span class="hljs-number">100</span> <span class="hljs-comment"># Calculates percentages relative to day 1.</span>
28352844
df = df.join(covid) <span class="hljs-comment"># Adds column with covid cases.</span>
28362845
<span class="hljs-keyword">return</span> df.sort_values(df.index[<span class="hljs-number">-1</span>], axis=<span class="hljs-number">1</span>) <span class="hljs-comment"># Sorts columns by last day's value.</span>
@@ -2842,11 +2851,12 @@ <h3 id="format-2">Format</h3><div><h4 id="forstandardtypesizesandmanualalignment
28422851
trace = go.Scatter(x=df.index, y=df[col_name], name=col_name, yaxis=yaxis)
28432852
figure.add_trace(trace)
28442853
figure.update_layout(
2854+
width=<span class="hljs-number">944</span>,
2855+
height=<span class="hljs-number">423</span>,
28452856
yaxis1=dict(title=<span class="hljs-string">'Total Cases'</span>, rangemode=<span class="hljs-string">'tozero'</span>),
28462857
yaxis2=dict(title=<span class="hljs-string">'%'</span>, rangemode=<span class="hljs-string">'tozero'</span>, overlaying=<span class="hljs-string">'y'</span>, side=<span class="hljs-string">'right'</span>),
2847-
legend=dict(x=<span class="hljs-number">1.08</span>),
2848-
width=<span class="hljs-number">944</span>,
2849-
height=<span class="hljs-number">423</span>
2858+
colorway=[<span class="hljs-string">'#EF553B'</span>, <span class="hljs-string">'#636EFA'</span>, <span class="hljs-string">'#00CC96'</span>, <span class="hljs-string">'#FFA152'</span>],
2859+
legend=dict(x=<span class="hljs-number">1.08</span>)
28502860
)
28512861
figure.show()
28522862

@@ -2924,7 +2934,7 @@ <h3 id="format-2">Format</h3><div><h4 id="forstandardtypesizesandmanualalignment
29242934

29252935

29262936
<footer>
2927-
<aside>December 20, 2024</aside>
2937+
<aside>December 24, 2024</aside>
29282938
<a href="https://gto76.github.io" rel="author">Jure Šorn</a>
29292939
</footer>
29302940

0 commit comments

Comments
 (0)