@@ -3155,7 +3155,7 @@ import pandas as pd, matplotlib.pyplot as plt
31553155** Ordered dictionary with a name.**
31563156
31573157``` python
3158- >> > pd.Series([1 , 2 ], index = [' x' , ' y' ], name = ' a' )
3158+ >> > sr = pd.Series([1 , 2 ], index = [' x' , ' y' ], name = ' a' ); sr
31593159x 1
31603160y 2
31613161Name: a, dtype: int64
@@ -3203,7 +3203,7 @@ plt.show() # Displays the plot. Also plt.sav
32033203```
32043204
32053205``` python
3206- >> > sr = pd.Series([2 , 3 ], index = [' x' , ' y' ])
3206+ >> > sr = pd.Series([2 , 3 ], index = [' x' , ' y' ]); sr
32073207x 2
32083208y 3
32093209```
@@ -3234,7 +3234,7 @@ y 3
32343234** Table with labeled rows and columns.**
32353235
32363236``` python
3237- >> > pd.DataFrame([[1 , 2 ], [3 , 4 ]], index = [' a' , ' b' ], columns = [' x' , ' y' ])
3237+ >> > l = pd.DataFrame([[1 , 2 ], [3 , 4 ]], index = [' a' , ' b' ], columns = [' x' , ' y' ]); l
32383238 x y
32393239a 1 2
32403240b 3 4
@@ -3270,13 +3270,14 @@ b 3 4
32703270< DF > = < DF > .sort_values(col_key/ s) # Sorts rows by passed column/s. Also `axis=1`.
32713271```
32723272
3273+ ``` python
3274+ < DF > .plot.line/ area/ bar/ scatter(x = col_key, …) # `y=col_key/s`. Also hist/box(by=col_key).
3275+ plt.show() # Displays the plot. Also plt.savefig(<path>).
3276+ ```
3277+
32733278#### DataFrame — Merge, Join, Concat:
32743279``` python
3275- >> > l = pd.DataFrame([[1 , 2 ], [3 , 4 ]], index = [' a' , ' b' ], columns = [' x' , ' y' ])
3276- x y
3277- a 1 2
3278- b 3 4
3279- >> > r = pd.DataFrame([[4 , 5 ], [6 , 7 ]], index = [' b' , ' c' ], columns = [' y' , ' z' ])
3280+ >> > r = pd.DataFrame([[4 , 5 ], [6 , 7 ]], index = [' b' , ' c' ], columns = [' y' , ' z' ]); r
32803281 y z
32813282b 4 5
32823283c 6 7
@@ -3323,7 +3324,7 @@ c 6 7
33233324* ** All operations operate on columns by default. Pass ` 'axis=1' ` to process the rows instead.**
33243325
33253326``` python
3326- >> > df = pd.DataFrame([[1 , 2 ], [3 , 4 ]], index = [' a' , ' b' ], columns = [' x' , ' y' ])
3327+ >> > df = pd.DataFrame([[1 , 2 ], [3 , 4 ]], index = [' a' , ' b' ], columns = [' x' , ' y' ]); df
33273328 x y
33283329a 1 2
33293330b 3 4
@@ -3350,15 +3351,11 @@ b 3 4
33503351```
33513352* ** Use ` '<DF>[col_key_1, col_key_2][row_key]' ` to get the fifth result's values.**
33523353
3353- #### DataFrame — Plot, Encode, Decode:
3354- ``` python
3355- < DF > .plot.line/ area/ bar/ scatter(x = col_key, …) # `y=col_key/s`. Also hist/box(by=col_key).
3356- plt.show() # Displays the plot. Also plt.savefig(<path>).
3357- ```
3354+ #### DataFrame — Encode, Decode:
33583355
33593356``` python
33603357< DF > = pd.read_json/ html(' <str/path/url>' ) # Run `$ pip3 install beautifulsoup4 lxml`.
3361- < DF > = pd.read_csv(' <path/url>' ) # `header/index_col/dtype/parse_dates /…=<obj>`.
3358+ < DF > = pd.read_csv(' <path/url>' ) # `header/index_col/dtype/usecols /…=<obj>`.
33623359< DF > = pd.read_pickle/ excel(' <path/url>' ) # Use `sheet_name=None` to get all Excel sheets.
33633360< DF > = pd.read_sql(' <table/query>' , < conn.> ) # SQLite3/SQLAlchemy connection (see #SQLite).
33643361```
@@ -3369,23 +3366,29 @@ plt.show() # Displays the plot. Also plt.sav
33693366< DF > .to_pickle/ excel(< path> ) # Run `$ pip3 install "pandas[excel]" odfpy`.
33703367< DF > .to_sql(' <table_name>' , < connection> ) # Also `if_exists='fail/replace/append'`.
33713368```
3369+ * ** Read\_ csv() only parses dates of columns that were specified by the 'parse\_ dates' argument. It automatically tries to detect the format, but it can be helped with 'date\_ format' or 'dayfirst' arguments. Both dates and datetimes get stored as pd.Timestamp objects.**
3370+ * ** If a column contains even a single invalid date, read\_ csv() returns the whole column as a series of strings, unlike ` '<Sr> = pd.to_datetime(<Sr>, errors="coerce")' ` , which replaces invalid dates with pd.NaT.**
3371+ * ** To get specific attributes from a series of Timestamps use ` '<Sr>.dt.year/date/…' ` .**
33723372
33733373### GroupBy
33743374** Object that groups together rows of a dataframe based on the value of the passed column.**
33753375
33763376``` python
33773377>> > df = pd.DataFrame([[1 , 2 , 3 ], [4 , 5 , 6 ], [7 , 8 , 6 ]], list (' abc' ), list (' xyz' ))
3378- >> > df.groupby(' z' ).get_group(6 )
3378+ >> > gb = df.groupby(' z' ); gb.apply(print )
3379+ x y z
3380+ a 1 2 3
33793381 x y z
33803382b 4 5 6
33813383c 7 8 6
33823384```
33833385
33843386``` python
3385- < GB > = < DF > .groupby(column_key / s) # Splits DF into groups based on passed column.
3387+ < GB > = < DF > .groupby(col_key / s) # Splits DF into groups based on passed column.
33863388< DF > = < GB > .apply(< func> ) # Maps each group. Func can return DF, Sr or el.
3387- < GB > = < GB > [column_key] # Single column GB. All operations return a Sr .
3389+ < DF > = < GB > .get_group( < num > ) # Selects a group by grouping column's value .
33883390< Sr> = < GB > .size() # A Sr of group sizes. Same keys as get_group().
3391+ < GB > = < GB > [col_key] # Single column GB. All operations return a Sr.
33893392```
33903393
33913394#### GroupBy — Aggregate, Transform, Map:
@@ -3396,37 +3399,20 @@ c 7 8 6
33963399```
33973400
33983401``` python
3399- >> > gb = df.groupby(' z' ); gb.apply(print )
3400- x y z
3401- a 1 2 3
3402- x y z
3403- b 4 5 6
3404- c 7 8 6
3405- ```
3406-
3407- ``` text
3408- +-----------------+-------------+-------------+-------------+---------------+
3409- | | 'sum' | 'rank' | ['rank'] | {'x': 'rank'} |
3410- +-----------------+-------------+-------------+-------------+---------------+
3411- | gb.agg(…) | x y | | x y | |
3412- | | z | x y | rank rank | x |
3413- | | 3 1 2 | a 1 1 | a 1 1 | a 1 |
3414- | | 6 11 13 | b 1 1 | b 1 1 | b 1 |
3415- | | | c 2 2 | c 2 2 | c 2 |
3416- +-----------------+-------------+-------------+-------------+---------------+
3417- | gb.transform(…) | x y | x y | | |
3418- | | a 1 2 | a 1 1 | | |
3419- | | b 11 13 | b 1 1 | | |
3420- | | c 11 13 | c 2 2 | | |
3421- +-----------------+-------------+-------------+-------------+---------------+
3402+ >> > gb.sum()
3403+ x y
3404+ z
3405+ 3 1 2
3406+ 6 11 13
34223407```
3408+ * ** Result has a named index that creates column ` 'z' ` instead of ` 'index' ` on reset_index().**
34233409
34243410### Rolling
34253411** Object for rolling window calculations.**
34263412
34273413``` python
34283414< RSr/ RDF / RGB > = < Sr/ DF / GB > .rolling(win_size) # Also: `min_periods=None, center=False`.
3429- < RSr/ RDF / RGB > = < RDF / RGB > [column_key / s] # Or: <RDF/RGB>.column_key
3415+ < RSr/ RDF / RGB > = < RDF / RGB > [col_key / s] # Or: <RDF/RGB>.col_key
34303416< Sr/ DF > = < R> .mean/ sum / max () # Or: <R>.apply/agg(<agg_func/str>)
34313417```
34323418
@@ -3435,10 +3421,20 @@ Plotly
34353421------
34363422``` python
34373423# $ pip3 install pandas plotly kaleido
3438- import pandas as pd, plotly.express as ex
3439- < Figure> = ex.line(< DF > , x = < col_name> , y = < col_name> ) # Or: ex.line(x=<list>, y=<list>)
3440- < Figure> .update_layout(margin = dict (t = 0 , r = 0 , b = 0 , l = 0 ), …) # `paper_bgcolor='rgb(0, 0, 0)'`.
3441- < Figure> .write_html/ json/ image(' <path>' ) # Also <Figure>.show().
3424+ import pandas as pd, plotly.express as px
3425+ ```
3426+
3427+ ``` python
3428+ < Fig> = px.line(< DF > , x = col_key, y = col_key) # Or: px.line(x=<list>, y=<list>)
3429+ < Fig> .update_layout(margin = dict (t = 0 , r = 0 , b = 0 , l = 0 )) # Also `paper_bgcolor='rgb(0, 0, 0)'`.
3430+ < Fig> .write_html/ json/ image(' <path>' ) # Also <Fig>.show().
3431+ ```
3432+
3433+ ``` python
3434+ < Fig> = px.area/ bar/ box(< DF > , x = col_key, y = col_key) # Also `color=col_key`.
3435+ < Fig> = px.scatter(< DF > , x = col_key, y = col_key) # Also `color/size/symbol=col_key`.
3436+ < Fig> = px.scatter_3d(< DF > , x = col_key, y = col_key, …) # `z=col_key`. Also color/size/symbol.
3437+ < Fig> = px.histogram(< DF > , x = col_key [, nbins=< int > ]) # Number of bins depends on DF size.
34423438```
34433439
34443440#### Displays a line chart of total coronavirus deaths per million grouped by continent:
@@ -3457,7 +3453,7 @@ df = df.groupby(['Continent_Name', 'date']).sum().reset_index()
34573453df[' Total Deaths per Million' ] = df.total_deaths * 1e6 / df.population
34583454df = df[df.date > ' 2020-03-14' ]
34593455df = df.rename({' date' : ' Date' , ' Continent_Name' : ' Continent' }, axis = ' columns' )
3460- ex .line(df, x = ' Date' , y = ' Total Deaths per Million' , color = ' Continent' ).show()
3456+ px .line(df, x = ' Date' , y = ' Total Deaths per Million' , color = ' Continent' ).show()
34613457```
34623458
34633459#### Displays a multi-axis line chart of total coronavirus cases and changes in prices of Bitcoin, Dow Jones and gold:
@@ -3470,20 +3466,23 @@ import pandas as pd, plotly.graph_objects as go
34703466
34713467def main ():
34723468 covid, bitcoin, gold, dow = scrape_data()
3473- display_data(wrangle_data(covid, bitcoin, gold, dow))
3469+ df = wrangle_data(covid, bitcoin, gold, dow)
3470+ display_data(df)
34743471
34753472def scrape_data ():
34763473 def get_covid_cases ():
34773474 url = ' https://covid.ourworldindata.org/data/owid-covid-data.csv'
34783475 df = pd.read_csv(url, usecols = [' location' , ' date' , ' total_cases' ])
3479- return df[df.location == ' World' ].set_index(' date' ).total_cases
3476+ df = df[df.location == ' World' ]
3477+ return df.set_index(' date' ).total_cases
34803478 def get_ticker (symbol ):
34813479 url = (f ' https://query1.finance.yahoo.com/v7/finance/download/ { symbol} ? '
34823480 ' period1=1579651200&period2=9999999999&interval=1d&events=history' )
34833481 df = pd.read_csv(url, usecols = [' Date' , ' Close' ])
34843482 return df.set_index(' Date' ).Close
34853483 out = get_covid_cases(), get_ticker(' BTC-USD' ), get_ticker(' GC=F' ), get_ticker(' ^DJI' )
3486- return map (pd.Series.rename, out, [' Total Cases' , ' Bitcoin' , ' Gold' , ' Dow Jones' ])
3484+ names = [' Total Cases' , ' Bitcoin' , ' Gold' , ' Dow Jones' ]
3485+ return map (pd.Series.rename, out, names)
34873486
34883487def wrangle_data (covid , bitcoin , gold , dow ):
34893488 df = pd.concat([bitcoin, gold, dow], axis = 1 ) # Creates table by joining columns on dates.
0 commit comments