Skip to content

Commit c87f3ee

Browse files
authored
Merge pull request #227 from kori73/join-example
add join example to dataframe tutorial
2 parents 1b7a0f1 + cf873d9 commit c87f3ee

File tree

4 files changed

+84
-2
lines changed

4 files changed

+84
-2
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ data/random.hdf5
1010
data/weather-big
1111
data/myfile.hdf5
1212
data/flightjson
13+
data/holidays
1314
data/nycflights
1415
data/myfile.zarr
1516
data/accounts.parquet

04_dataframe.ipynb

Lines changed: 58 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,33 @@
241241
"df.tail() # now works"
242242
]
243243
},
244+
{
245+
"cell_type": "markdown",
246+
"metadata": {},
247+
"source": [
248+
"Let's also read the holidays data which will use in the exercises"
249+
]
250+
},
251+
{
252+
"cell_type": "code",
253+
"execution_count": null,
254+
"metadata": {},
255+
"outputs": [],
256+
"source": [
257+
"holidays = dd.read_parquet(os.path.join('data', \"holidays\"))"
258+
]
259+
},
260+
{
261+
"cell_type": "code",
262+
"execution_count": null,
263+
"metadata": {
264+
"scrolled": true
265+
},
266+
"outputs": [],
267+
"source": [
268+
"holidays.head()"
269+
]
270+
},
244271
{
245272
"cell_type": "markdown",
246273
"metadata": {},
@@ -460,6 +487,35 @@
460487
"df.groupby(\"DayOfWeek\").DepDelay.mean().compute()"
461488
]
462489
},
490+
{
491+
"cell_type": "markdown",
492+
"metadata": {},
493+
"source": [
494+
"### 6.) What holiday has the worst average departure delay?\n",
495+
"\n",
496+
"*Hint*: use [`df.merge`](https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html) to bring holiday information.\n",
497+
"\n",
498+
"*Note*: If you have prepared the dataset with `--small` argument or set the `DASK_TUTORIAL_SMALL` environment variable to `True`, you might see only a couple of holidays. This is because the small dataset contains a limited number of rows."
499+
]
500+
},
501+
{
502+
"cell_type": "code",
503+
"execution_count": null,
504+
"metadata": {},
505+
"outputs": [],
506+
"source": [
507+
"# Your code here"
508+
]
509+
},
510+
{
511+
"cell_type": "code",
512+
"execution_count": null,
513+
"metadata": {},
514+
"outputs": [],
515+
"source": [
516+
"df.merge(holidays, on=[\"Date\"], how=\"left\").groupby(\"holiday\").DepDelay.mean().compute()"
517+
]
518+
},
463519
{
464520
"cell_type": "markdown",
465521
"metadata": {},
@@ -817,7 +873,7 @@
817873
"metadata": {
818874
"anaconda-cloud": {},
819875
"kernelspec": {
820-
"display_name": "Python 3",
876+
"display_name": "Python 3 (ipykernel)",
821877
"language": "python",
822878
"name": "python3"
823879
},
@@ -831,7 +887,7 @@
831887
"name": "python",
832888
"nbconvert_exporter": "python",
833889
"pygments_lexer": "ipython3",
834-
"version": "3.7.6"
890+
"version": "3.8.12"
835891
}
836892
},
837893
"nbformat": 4,

binder/environment.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ dependencies:
77
- jupyterlab>=2.0.0,<3
88
- numpy>=1.18.1
99
- h5py
10+
- holidays
1011
- scipy>=1.3.0
1112
- toolz
1213
- bokeh>=2.0.0

prep.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import h5py
1212
import numpy as np
1313
import pandas as pd
14+
import holidays
1415
from skimage.transform import resize
1516

1617
from accounts import account_entries, account_params, json_entries
@@ -40,6 +41,28 @@ def parse_args(args=None):
4041
'directory.')
4142

4243

44+
def holiday():
    """Write a US holiday lookup table to ``<data_dir>/holidays`` as parquet.

    The table has two columns: ``Date`` (cast to ``datetime64[ns]``) and
    ``holiday`` (the value ``holidays.US`` maps each date to), covering the
    years 1990 through 1999. If the target path already exists, nothing is
    written and the function returns immediately.
    """
    target = os.path.join(data_dir, "holidays")
    if not os.path.exists(target):
        # Calendar years 1990..1999 inclusive.
        calendar = holidays.US(years=list(range(1990, 2000)))
        table = pd.DataFrame(
            data={
                "Date": calendar.keys(),
                "holiday": calendar.values(),
            },
        )
        # Cast the Date column to datetime64[ns] before writing it out.
        table["Date"] = table["Date"].astype("datetime64[ns]")
        table.to_parquet(target)
        print("Created holidays data.")
64+
65+
4366
def flights(small=None):
4467
start = time.time()
4568
flights_raw = os.path.join(data_dir, 'nycflights.tar.gz')
@@ -224,6 +247,7 @@ def main(args=None):
224247
accounts_json(args.small)
225248
if args.dataset == "flights" or args.dataset == "all":
226249
flights(args.small)
250+
holiday()
227251

228252

229253
if __name__ == '__main__':

0 commit comments

Comments
 (0)