Skip to content

Commit 3043db5

Browse files
Merge pull request #144 from TomAugspurger/hidden
Run the tutorial on CI
2 parents 4690e84 + 241e1fe commit 3043db5

38 files changed

+251
-293
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ data/myfile.hdf5
1212
data/flightjson
1313
data/nycflights
1414
data/myfile.zarr
15+
data/accounts.parquet
1516
profile.html
1617
log
1718
.idea/
19+
_build/

00_overview.ipynb

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@
141141
"Whereas there is a wealth of information in the documentation, linked above, here we aim to give practical advice to aid your understanding and application of Dask in everyday situations. This means that you should not expect every feature of Dask to be covered, but the examples hopefully are similar to the kinds of work-flows that you have in mind.\n",
142142
"\n",
143143
"## Exercise: Print `Hello, world!`\n",
144-
"Each notebook will have exercises for you to solve. You'll be given a blank or partially completed cell, followed by a \"magic\" cell that will load the solution. For example\n",
144+
"Each notebook will have exercises for you to solve. You'll be given a blank or partially completed cell, followed by a hidden cell with a solution. For example:\n",
145145
"\n",
146146
"\n",
147147
"Print the text \"Hello, world!\"."
@@ -157,19 +157,24 @@
157157
]
158158
},
159159
{
160-
"cell_type": "code",
161-
"execution_count": null,
160+
"cell_type": "markdown",
162161
"metadata": {},
163-
"outputs": [],
164162
"source": [
165-
"%load solutions/00-hello-world.py"
163+
"The next cell has the solution. Click the ellipsis to expand the solution, and always make sure to run the solution cell,\n",
164+
"in case later sections of the notebook depend on the output from the solution."
166165
]
167166
},
168167
{
169-
"cell_type": "markdown",
170-
"metadata": {},
168+
"cell_type": "code",
169+
"execution_count": null,
170+
"metadata": {
171+
"jupyter": {
172+
"source_hidden": true
173+
}
174+
},
175+
"outputs": [],
171176
"source": [
172-
"The above cell needs to be executed twice, once to load the solution and once to run it."
177+
"print(\"Hello, world!\")"
173178
]
174179
}
175180
],
@@ -194,5 +199,5 @@
194199
}
195200
},
196201
"nbformat": 4,
197-
"nbformat_minor": 2
202+
"nbformat_minor": 4
198203
}

01_dask.delayed.ipynb

Lines changed: 63 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -243,10 +243,23 @@
243243
{
244244
"cell_type": "code",
245245
"execution_count": null,
246-
"metadata": {},
246+
"metadata": {
247+
"jupyter": {
248+
"source_hidden": true
249+
}
250+
},
247251
"outputs": [],
248252
"source": [
249-
"%load solutions/01-delayed-loop.py"
253+
"results = []\n",
254+
"\n",
255+
"for x in data:\n",
256+
" y = delayed(inc)(x)\n",
257+
" results.append(y)\n",
258+
" \n",
259+
"total = delayed(sum)(results)\n",
260+
"print(\"Before computing:\", total) # Let's see what type of thing total is\n",
261+
"result = total.compute()\n",
262+
"print(\"After computing :\", result) # After it's computed"
250263
]
251264
},
252265
{
@@ -322,10 +335,22 @@
322335
{
323336
"cell_type": "code",
324337
"execution_count": null,
325-
"metadata": {},
338+
"metadata": {
339+
"jupyter": {
340+
"source_hidden": true
341+
}
342+
},
326343
"outputs": [],
327344
"source": [
328-
"%load solutions/01-delayed-control-flow.py"
345+
"results = []\n",
346+
"for x in data:\n",
347+
" if is_even(x): # even\n",
348+
" y = delayed(double)(x)\n",
349+
" else: # odd\n",
350+
" y = delayed(inc)(x)\n",
351+
" results.append(y)\n",
352+
" \n",
353+
"total = delayed(sum)(results)"
329354
]
330355
},
331356
{
@@ -616,10 +641,42 @@
616641
{
617642
"cell_type": "code",
618643
"execution_count": null,
619-
"metadata": {},
644+
"metadata": {
645+
"jupyter": {
646+
"source_hidden": true
647+
}
648+
},
620649
"outputs": [],
621650
"source": [
622-
"%load solutions/01-delayed-groupby.py"
651+
"# This is just one possible solution, there are\n",
652+
"# several ways to do this using `delayed`\n",
653+
"\n",
654+
"sums = []\n",
655+
"counts = []\n",
656+
"for fn in filenames:\n",
657+
" # Read in file\n",
658+
" df = delayed(pd.read_csv)(fn)\n",
659+
"\n",
660+
" # Groupby origin airport\n",
661+
" by_origin = df.groupby('Origin')\n",
662+
"\n",
663+
" # Sum of all departure delays by origin\n",
664+
" total = by_origin.DepDelay.sum()\n",
665+
"\n",
666+
" # Number of flights by origin\n",
667+
" count = by_origin.DepDelay.count()\n",
668+
"\n",
669+
" # Save the intermediates\n",
670+
" sums.append(total)\n",
671+
" counts.append(count)\n",
672+
"\n",
673+
"# Compute the intermediates\n",
674+
"sums, counts = compute(sums, counts)\n",
675+
"\n",
676+
"# Combine intermediates to get total mean-delay-per-origin\n",
677+
"total_delays = sum(sums)\n",
678+
"n_flights = sum(counts)\n",
679+
"mean = total_delays / n_flights"
623680
]
624681
},
625682
{

01x_lazy.ipynb

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -251,10 +251,35 @@
251251
{
252252
"cell_type": "code",
253253
"execution_count": null,
254-
"metadata": {},
254+
"metadata": {
255+
"jupyter": {
256+
"source_hidden": true
257+
}
258+
},
255259
"outputs": [],
256260
"source": [
257-
"%load solutions/Foundations-03.py"
261+
"## verbose version\n",
262+
"delayed_read_csv = delayed(pd.read_csv)\n",
263+
"a = delayed_read_csv(filenames[0])\n",
264+
"b = delayed_read_csv(filenames[1])\n",
265+
"c = delayed_read_csv(filenames[2])\n",
266+
"\n",
267+
"delayed_len = delayed(len)\n",
268+
"na = delayed_len(a)\n",
269+
"nb = delayed_len(b)\n",
270+
"nc = delayed_len(c)\n",
271+
"\n",
272+
"delayed_sum = delayed(sum)\n",
273+
"\n",
274+
"total = delayed_sum([na, nb, nc])\n",
275+
"%time print(total.compute())\n",
276+
"\n",
277+
"\n",
278+
"## concise version\n",
279+
"csvs = [delayed(pd.read_csv)(fn) for fn in filenames]\n",
280+
"lens = [delayed(len)(csv) for csv in csvs]\n",
281+
"total = delayed(sum)(lens)\n",
282+
"%time print(total.compute())\n"
258283
]
259284
},
260285
{
@@ -551,9 +576,9 @@
551576
"name": "python",
552577
"nbconvert_exporter": "python",
553578
"pygments_lexer": "ipython3",
554-
"version": "3.7.2"
579+
"version": "3.7.3"
555580
}
556581
},
557582
"nbformat": 4,
558-
"nbformat_minor": 2
583+
"nbformat_minor": 4
559584
}

03_array.ipynb

Lines changed: 55 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,16 @@
159159
"metadata": {},
160160
"outputs": [],
161161
"source": [
162-
"%load solutions/Array-01.py"
162+
"sums = []\n",
163+
"lengths = []\n",
164+
"for i in range(0, 1000000000, 1000000):\n",
165+
" chunk = dset[i: i + 1000000] # pull out numpy array\n",
166+
" sums.append(chunk.sum())\n",
167+
" lengths.append(len(chunk))\n",
168+
"\n",
169+
"total = sum(sums)\n",
170+
"length = sum(lengths)\n",
171+
"print(total / length)\n"
163172
]
164173
},
165174
{
@@ -570,7 +579,8 @@
570579
"metadata": {},
571580
"outputs": [],
572581
"source": [
573-
"load solutions/02-dask-arrays-make-arrays.py"
582+
"arrays = [da.from_array(dset, chunks=(500, 500)) for dset in dsets]\n",
583+
"arrays"
574584
]
575585
},
576586
{
@@ -600,7 +610,8 @@
600610
"metadata": {},
601611
"outputs": [],
602612
"source": [
603-
"%load solutions/02-dask-arrays-stacked.py"
613+
"x = da.stack(arrays, axis=0)\n",
614+
"x"
604615
]
605616
},
606617
{
@@ -613,7 +624,11 @@
613624
{
614625
"cell_type": "code",
615626
"execution_count": null,
616-
"metadata": {},
627+
"metadata": {
628+
"tags": [
629+
"raises-exception"
630+
]
631+
},
617632
"outputs": [],
618633
"source": [
619634
"# complete the following\n",
@@ -624,10 +639,16 @@
624639
{
625640
"cell_type": "code",
626641
"execution_count": null,
627-
"metadata": {},
642+
"metadata": {
643+
"jupyter": {
644+
"source_hidden": true
645+
}
646+
},
628647
"outputs": [],
629648
"source": [
630-
"%load solutions/02-dask-arrays-weather-mean.py"
649+
"result = x.mean(axis=0)\n",
650+
"fig = plt.figure(figsize=(16, 8))\n",
651+
"plt.imshow(result, cmap='RdBu_r')"
631652
]
632653
},
633654
{
@@ -647,10 +668,16 @@
647668
{
648669
"cell_type": "code",
649670
"execution_count": null,
650-
"metadata": {},
671+
"metadata": {
672+
"jupyter": {
673+
"source_hidden": true
674+
}
675+
},
651676
"outputs": [],
652677
"source": [
653-
"%load solutions/02-dask-arrays-weather-difference.py"
678+
"result = x[0] - x.mean(axis=0)\n",
679+
"fig = plt.figure(figsize=(16, 8))\n",
680+
"plt.imshow(result, cmap='RdBu_r')"
654681
]
655682
},
656683
{
@@ -705,10 +732,28 @@
705732
{
706733
"cell_type": "code",
707734
"execution_count": null,
708-
"metadata": {},
735+
"metadata": {
736+
"jupyter": {
737+
"source_hidden": true
738+
}
739+
},
709740
"outputs": [],
710741
"source": [
711-
"%load solutions/Array-03.py"
742+
"import h5py\n",
743+
"from glob import glob\n",
744+
"import os\n",
745+
"import dask.array as da\n",
746+
"\n",
747+
"filenames = sorted(glob(os.path.join('data', 'weather-big', '*.hdf5')))\n",
748+
"dsets = [h5py.File(filename, mode='r')['/t2m'] for filename in filenames]\n",
749+
"\n",
750+
"arrays = [da.from_array(dset, chunks=(500, 500)) for dset in dsets]\n",
751+
"\n",
752+
"x = da.stack(arrays, axis=0)\n",
753+
"\n",
754+
"result = x[:, ::2, ::2]\n",
755+
"\n",
756+
"da.to_zarr(result, os.path.join('data', 'myfile.zarr'), overwrite=True)\n"
712757
]
713758
},
714759
{

0 commit comments

Comments
 (0)