dask
diff --git a/‎.gitignore‎
Lines changed: 2 additions & 0 deletions b/‎.gitignore‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎00_overview.ipynb‎
Lines changed: 14 additions & 9 deletions b/‎00_overview.ipynb‎
Lines changed: 14 additions & 9 deletions
diff --git a/‎01_dask.delayed.ipynb‎
Lines changed: 63 additions & 6 deletions b/‎01_dask.delayed.ipynb‎
Lines changed: 63 additions & 6 deletions
diff --git a/‎01x_lazy.ipynb‎
Lines changed: 29 additions & 4 deletions b/‎01x_lazy.ipynb‎
Lines changed: 29 additions & 4 deletions
diff --git a/‎03_array.ipynb‎
Lines changed: 55 additions & 10 deletions b/‎03_array.ipynb‎
Lines changed: 55 additions & 10 deletions
@@ -12,6 +12,8 @@ data/myfile.hdf5
 data/flightjson
 data/nycflights
 data/myfile.zarr
+data/accounts.parquet
 profile.html
 log
 .idea/
+_build/
@@ -141,7 +141,7 @@
     "Whereas there is a wealth of information in the documentation, linked above, here we aim to give practical advice to aid your understanding and application of Dask in everyday situations. This means that you should not expect every feature of Dask to be covered, but the examples hopefully are similar to the kinds of work-flows that you have in mind.\n",
     "\n",
     "## Exercise: Print `Hello, world!`\n",
-    "Each notebook will have exercises for you to solve. You'll be given a blank or partially completed cell, followed by a \"magic\" cell that will load the solution. For example\n",
+    "Each notebook will have exercises for you to solve. You'll be given a blank or partially completed cell, followed by a hidden cell with a solution. For example:\n",
     "\n",
     "\n",
     "Print the text \"Hello, world!\"."
@@ -157,19 +157,24 @@
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "%load solutions/00-hello-world.py"
+    "The next cell has the solution. Click the ellipsis to expand the solution, and always make sure to run the solution cell,\n",
+    "in case later sections of the notebook depend on the output from the solution."
    ]
   },
   {
-   "cell_type": "markdown",
-   "metadata": {},
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "jupyter": {
+     "source_hidden": true
+    }
+   },
+   "outputs": [],
    "source": [
-    "The above cell needs to be executed twice, once to load the solution and once to run it."
+    "print(\"Hello, world!\")"
    ]
   }
  ],
@@ -194,5 +199,5 @@
   }
  },
  "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }
@@ -243,10 +243,23 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "jupyter": {
+     "source_hidden": true
+    }
+   },
    "outputs": [],
    "source": [
-    "%load solutions/01-delayed-loop.py"
+    "results = []\n",
+    "\n",
+    "for x in data:\n",
+    "    y = delayed(inc)(x)\n",
+    "    results.append(y)\n",
+    "    \n",
+    "total = delayed(sum)(results)\n",
+    "print(\"Before computing:\", total)  # Let's see what type of thing total is\n",
+    "result = total.compute()\n",
+    "print(\"After computing :\", result)  # After it's computed"
    ]
   },
   {
@@ -322,10 +335,22 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "jupyter": {
+     "source_hidden": true
+    }
+   },
    "outputs": [],
    "source": [
-    "%load solutions/01-delayed-control-flow.py"
+    "results = []\n",
+    "for x in data:\n",
+    "    if is_even(x):  # even\n",
+    "        y = delayed(double)(x)\n",
+    "    else:          # odd\n",
+    "        y = delayed(inc)(x)\n",
+    "    results.append(y)\n",
+    "    \n",
+    "total = delayed(sum)(results)"
    ]
   },
   {
@@ -616,10 +641,42 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "jupyter": {
+     "source_hidden": true
+    }
+   },
    "outputs": [],
    "source": [
-    "%load solutions/01-delayed-groupby.py"
+    "# This is just one possible solution, there are\n",
+    "# several ways to do this using `delayed`\n",
+    "\n",
+    "sums = []\n",
+    "counts = []\n",
+    "for fn in filenames:\n",
+    "    # Read in file\n",
+    "    df = delayed(pd.read_csv)(fn)\n",
+    "\n",
+    "    # Groupby origin airport\n",
+    "    by_origin = df.groupby('Origin')\n",
+    "\n",
+    "    # Sum of all departure delays by origin\n",
+    "    total = by_origin.DepDelay.sum()\n",
+    "\n",
+    "    # Number of flights by origin\n",
+    "    count = by_origin.DepDelay.count()\n",
+    "\n",
+    "    # Save the intermediates\n",
+    "    sums.append(total)\n",
+    "    counts.append(count)\n",
+    "\n",
+    "# Compute the intermediates\n",
+    "sums, counts = compute(sums, counts)\n",
+    "\n",
+    "# Combine intermediates to get total mean-delay-per-origin\n",
+    "total_delays = sum(sums)\n",
+    "n_flights = sum(counts)\n",
+    "mean = total_delays / n_flights"
    ]
   },
   {
 
@@ -251,10 +251,35 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "jupyter": {
+     "source_hidden": true
+    }
+   },
    "outputs": [],
    "source": [
-    "%load solutions/Foundations-03.py"
+    "## verbose version\n",
+    "delayed_read_csv = delayed(pd.read_csv)\n",
+    "a = delayed_read_csv(filenames[0])\n",
+    "b = delayed_read_csv(filenames[1])\n",
+    "c = delayed_read_csv(filenames[2])\n",
+    "\n",
+    "delayed_len = delayed(len)\n",
+    "na = delayed_len(a)\n",
+    "nb = delayed_len(b)\n",
+    "nc = delayed_len(c)\n",
+    "\n",
+    "delayed_sum = delayed(sum)\n",
+    "\n",
+    "total = delayed_sum([na, nb, nc])\n",
+    "%time print(total.compute())\n",
+    "\n",
+    "\n",
+    "## concise version\n",
+    "csvs = [delayed(pd.read_csv)(fn) for fn in filenames]\n",
+    "lens = [delayed(len)(csv) for csv in csvs]\n",
+    "total = delayed(sum)(lens)\n",
+    "%time print(total.compute())\n"
    ]
   },
   {
@@ -551,9 +576,9 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.2"
+   "version": "3.7.3"
   }
  },
  "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }
@@ -159,7 +159,16 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%load solutions/Array-01.py"
+    "sums = []\n",
+    "lengths = []\n",
+    "for i in range(0, 1000000000, 1000000):\n",
+    "    chunk = dset[i: i + 1000000]  # pull out numpy array\n",
+    "    sums.append(chunk.sum())\n",
+    "    lengths.append(len(chunk))\n",
+    "\n",
+    "total = sum(sums)\n",
+    "length = sum(lengths)\n",
+    "print(total / length)\n"
    ]
   },
   {
@@ -570,7 +579,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "load solutions/02-dask-arrays-make-arrays.py"
+    "arrays = [da.from_array(dset, chunks=(500, 500)) for dset in dsets]\n",
+    "arrays"
    ]
   },
   {
@@ -600,7 +610,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%load solutions/02-dask-arrays-stacked.py"
+    "x = da.stack(arrays, axis=0)\n",
+    "x"
    ]
   },
   {
@@ -613,7 +624,11 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "tags": [
+     "raises-exception"
+    ]
+   },
    "outputs": [],
    "source": [
     "# complete the following\n",
@@ -624,10 +639,16 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "jupyter": {
+     "source_hidden": true
+    }
+   },
    "outputs": [],
    "source": [
-    "%load solutions/02-dask-arrays-weather-mean.py"
+    "result = x.mean(axis=0)\n",
+    "fig = plt.figure(figsize=(16, 8))\n",
+    "plt.imshow(result, cmap='RdBu_r')"
    ]
   },
   {
@@ -647,10 +668,16 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "jupyter": {
+     "source_hidden": true
+    }
+   },
    "outputs": [],
    "source": [
-    "%load solutions/02-dask-arrays-weather-difference.py"
+    "result = x[0] - x.mean(axis=0)\n",
+    "fig = plt.figure(figsize=(16, 8))\n",
+    "plt.imshow(result, cmap='RdBu_r')"
    ]
   },
   {
@@ -705,10 +732,28 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "jupyter": {
+     "source_hidden": true
+    }
+   },
    "outputs": [],
    "source": [
-    "%load solutions/Array-03.py"
+    "import h5py\n",
+    "from glob import glob\n",
+    "import os\n",
+    "import dask.array as da\n",
+    "\n",
+    "filenames = sorted(glob(os.path.join('data', 'weather-big', '*.hdf5')))\n",
+    "dsets = [h5py.File(filename, mode='r')['/t2m'] for filename in filenames]\n",
+    "\n",
+    "arrays = [da.from_array(dset, chunks=(500, 500)) for dset in dsets]\n",
+    "\n",
+    "x = da.stack(arrays, axis=0)\n",
+    "\n",
+    "result = x[:, ::2, ::2]\n",
+    "\n",
+    "da.to_zarr(result, os.path.join('data', 'myfile.zarr'), overwrite=True)\n"
    ]
   },
   {