
Commit 9aa6e88

docs: Add introduction to flox (#206)

* Copy styling from cf-xarray
* Add sphinx-codeautolink
* Add intro docs
* Add codespell
* Add histogram
* cache executed notebooks
* Fix
* IntervalIndex over isbin
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent ed51c19 commit 9aa6e88

9 files changed: 237 additions, 11 deletions

.pre-commit-config.yaml (8 additions, 0 deletions)

```diff
@@ -46,6 +46,14 @@ repos:
     hooks:
       - id: nbstripout
         args: [--extra-keys=metadata.kernelspec metadata.language_info.version]
+
+  - repo: https://github.com/codespell-project/codespell
+    rev: v2.2.2
+    hooks:
+      - id: codespell
+        additional_dependencies:
+          - tomli
+
   - repo: https://github.com/asottile/pyupgrade
     rev: v3.3.1
     hooks:
```
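With this hook in place, `pre-commit run codespell --all-files` should spell-check the repository; the `tomli` dependency lets codespell read its settings from the `[tool.codespell]` table added in `pyproject.toml` below.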

ci/docs.yml (1 addition, 0 deletions)

```diff
@@ -16,5 +16,6 @@ dependencies:
   - furo
   - ipykernel
   - jupyter
+  - sphinx-codeautolink
   - pip:
     - git+https://github.com/xarray-contrib/flox
```

docs/source/conf.py (26 additions, 1 deletion)

```diff
@@ -39,8 +39,11 @@
     "numpydoc",
     "sphinx.ext.napoleon",
     "myst_nb",
+    "sphinx_codeautolink",
 ]

+codeautolink_concat_default = True
+
 extlinks = {
     "issue": ("https://github.com/xarray-contrib/flox/issues/%s", "GH#%s"),
     "pr": ("https://github.com/xarray-contrib/flox/pull/%s", "PR#%s"),
@@ -60,6 +63,7 @@
 # Myst_nb options
 nb_execution_excludepatterns = ["climatology-hourly.ipynb"]
 nb_execution_raise_on_error = True
+nb_execution_mode = "cache"

 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
@@ -94,13 +98,34 @@
 # show_authors = False

 # The name of the Pygments (syntax highlighting) style to use.
-pygments_style = "sphinx"
+pygments_style = "igor"


 # -- Options for HTML output ---------------------------------------------------

 html_theme = "furo"

+# Theme options are theme-specific and customize the look and feel of a theme
+# further. For a list of options available for each theme, see the
+# documentation.
+css_vars = {
+    "admonition-font-size": "0.9rem",
+    "font-size--small": "92%",
+    "font-size--small--2": "87.5%",
+}
+html_theme_options = dict(
+    sidebar_hide_name=True,
+    light_css_variables=css_vars,
+    dark_css_variables=css_vars,
+)
+
+html_context = {
+    "github_user": "xarray-contrib",
+    "github_repo": "flox",
+    "github_version": "main",
+    "doc_path": "doc",
+}
+
 # Theme options are theme-specific and customize the look and feel of a theme
 # further. For a list of options available for each theme, see the
 # documentation.
```
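Of the new settings: `nb_execution_mode = "cache"` has myst-nb cache executed notebook outputs (via jupyter-cache) and re-execute a page only when its code content changes, so repeat doc builds stay fast; `codeautolink_concat_default = True` tells sphinx-codeautolink to treat the code blocks on a page as one continuous session when resolving links.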

docs/source/implementation.md (5 additions, 5 deletions)

````diff
@@ -136,7 +136,7 @@ width: 100%
 1. Currently the rechunking is only implemented for 1D arrays (being motivated by time resampling),
    but a nD generalization seems possible.
 1. Only can use the `blockwise` strategy for grouping by `nD` arrays.
-1. Works better when multiple groups are already in a single block; so that the intial
+1. Works better when multiple groups are already in a single block; so that the initial
    rechunking only involves a small amount of communication.

 (method-cohorts)=
@@ -198,8 +198,8 @@ width: 100%

 1. Group labels must be known at graph construction time, so this only works for numpy arrays.
 1. This does require more tasks and a more complicated graph, but the communication overhead can be significantly lower.
-1. The detection of "cohorts" is currrently slow but could be improved.
-1. The extra effort of detecting cohorts and mutiple copying of intermediate blocks may be worthwhile only if the chunk sizes are small
+1. The detection of "cohorts" is currently slow but could be improved.
+1. The extra effort of detecting cohorts and multiple copying of intermediate blocks may be worthwhile only if the chunk sizes are small
    relative to the approximate period of group labels, or small relative to the size of spatially localized groups.

 ### Example : sensitivity to chunking
@@ -211,15 +211,15 @@ Consider our earlier example, `groupby("time.month")` with monthly frequency dat
 `flox` can find these cohorts, below it identifies the cohorts with labels `1,2,3,4`; `5,6,7,8`, and `9,10,11,12`.

 ```python
->>> flox.find_group_cohorts(labels, array.chunks[-1]))
+>>> flox.find_group_cohorts(labels, array.chunks[-1]).values()
 [[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]] # 3 cohorts
 ```

 Now consider `chunksize=5`.
 ![cohorts-schematic](/../diagrams/cohorts-month-chunk5.png)

 ```python
->>> flox.core.find_group_cohorts(labels, array.chunks[-1]))
+>>> flox.core.find_group_cohorts(labels, array.chunks[-1]).values()
 [[1], [2, 3], [4, 5], [6], [7, 8], [9, 10], [11], [12]] # 8 cohorts
 ```

````
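For a concrete feel, here is a small self-contained sketch of the example above; `find_group_cohorts` is the function documented in this repo, while the label and chunk values are illustrative:

```python
import dask.array as da
import numpy as np

import flox

# Two years of monthly labels, four labels per chunk, so each block
# holds exactly one of the cohorts {1-4}, {5-8}, {9-12}
labels = np.tile(np.arange(1, 13), 2)
array = da.ones((24,), chunks=4)

# As in the snippet above, .values() lists the detected cohorts
cohorts = flox.core.find_group_cohorts(labels, array.chunks[-1])

# Opt in to the cohorts strategy: each cohort's blocks are reduced separately
result, groups = flox.groupby_reduce(array, labels, func="sum", method="cohorts")
```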

docs/source/index.md (1 addition, 0 deletions)

```diff
@@ -62,6 +62,7 @@ It was motivated by many discussions in the [Pangeo](https://pangeo.io) communit
 .. toctree::
    :maxdepth: 1

+   intro.md
    aggregations.md
    engines.md
    arrays.md
```

docs/source/intro.md (new file, 186 additions)

---
jupytext:
  text_representation:
    format_name: myst
kernelspec:
  display_name: Python 3
  name: python3
---

```{eval-rst}
.. currentmodule:: flox
```

# 10 minutes to flox

## GroupBy single variable

```{code-cell}
import numpy as np
import xarray as xr

from flox.xarray import xarray_reduce

labels = xr.DataArray(
    [1, 2, 3, 1, 2, 3, 0, 0, 0],
    dims="x",
    name="label",
)
labels
```

### With numpy

```{code-cell}
da = xr.DataArray(
    np.ones((9,)), dims="x", name="array"
)
```

Apply the reduction using {py:func}`flox.xarray.xarray_reduce`, specifying the reduction operation in `func`:

```{code-cell}
xarray_reduce(da, labels, func="sum")
```

### With dask

Let's first chunk `da` and `labels`:

```{code-cell}
da_chunked = da.chunk(x=2)
labels_chunked = labels.chunk(x=3)
```

Grouping a dask array by a numpy array is unchanged:

```{code-cell}
xarray_reduce(da_chunked, labels, func="sum")
```

When grouping **by** a dask array, we need to specify the "expected group labels" on the output so we can construct the result DataArray.
Without the `expected_groups` kwarg, an error is raised:

```{code-cell}
---
tags: [raises-exception]
---
xarray_reduce(da_chunked, labels_chunked, func="sum")
```

Now we specify `expected_groups`:

```{code-cell}
dask_result = xarray_reduce(
    da_chunked, labels_chunked, func="sum", expected_groups=[0, 1, 2, 3],
)
dask_result
```

Note that any group labels not present in `expected_groups` will be ignored.
You can also provide `expected_groups` for the pure numpy GroupBy:

```{code-cell}
numpy_result = xarray_reduce(
    da, labels, func="sum", expected_groups=[0, 1, 2, 3],
)
numpy_result
```

The two are identical:

```{code-cell}
numpy_result.identical(dask_result)
```
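For instance, leaving label `3` out of `expected_groups` drops that group from the result entirely; a minimal sketch reusing `da` and `labels` from above:

```python
# Only labels 0, 1, 2 appear in the output; values where label == 3 are ignored
xarray_reduce(da, labels, func="sum", expected_groups=[0, 1, 2])
```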
## Binning by a single variable

For binning, specify the bin edges in `expected_groups` using {py:class}`pandas.IntervalIndex`:

```{code-cell}
import pandas as pd

xarray_reduce(
    da,
    labels,
    func="sum",
    expected_groups=pd.IntervalIndex.from_breaks([0.5, 1.5, 2.5, 6]),
)
```

Similarly for dask inputs:

```{code-cell}
xarray_reduce(
    da_chunked,
    labels_chunked,
    func="sum",
    expected_groups=pd.IntervalIndex.from_breaks([0.5, 1.5, 2.5, 6]),
)
```

For more control over the binning (which edge is closed), pass the appropriate kwarg to {py:class}`pandas.IntervalIndex`:

```{code-cell}
xarray_reduce(
    da_chunked,
    labels_chunked,
    func="sum",
    expected_groups=pd.IntervalIndex.from_breaks([0.5, 1.5, 2.5, 6], closed="left"),
)
```
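Since bin membership follows pandas conventions, `pandas.cut` with the same `IntervalIndex` is a handy way to sanity-check which bin each label should land in; a sketch (this mirrors, rather than calls, flox's internal binning):

```python
bins = pd.IntervalIndex.from_breaks([0.5, 1.5, 2.5, 6], closed="left")
# NaN marks values falling outside every bin; such points drop out of the groupby
pd.cut(labels.data, bins=bins)
```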
## Grouping by multiple variables

```{code-cell}
arr = np.ones((4, 12))
labels1 = np.array(["a", "a", "c", "c", "c", "b", "b", "c", "c", "b", "b", "f"])
labels2 = np.array([1, 2, 2, 1])

da = xr.DataArray(
    arr, dims=("x", "y"), coords={"labels2": ("x", labels2), "labels1": ("y", labels1)}
)
da
```

To group by multiple variables, simply pass them as `*args`:

```{code-cell}
xarray_reduce(da, "labels1", "labels2", func="sum")
```

## Histogramming (Binning by multiple variables)

An unweighted histogram is simply a groupby over multiple variables with `count`:

```{code-cell} python
arr = np.ones((4, 12))
labels1 = np.array(np.linspace(0, 10, 12))
labels2 = np.array([1, 2, 2, 1])

da = xr.DataArray(
    arr, dims=("x", "y"), coords={"labels2": ("x", labels2), "labels1": ("y", labels1)}
)
da
```

Specify bins in `expected_groups`:

```{code-cell} python
xarray_reduce(
    da,
    "labels1",
    "labels2",
    func="count",
    expected_groups=(
        pd.IntervalIndex.from_breaks([-0.5, 4.5, 6.5, 8.9]),  # labels1
        pd.IntervalIndex.from_breaks([0.5, 1.5, 1.9]),  # labels2
    ),
)
```
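A weighted histogram then differs only in summing a weights array instead of counting; a sketch, where the `weights` values are illustrative and the grouping coordinates are passed explicitly:

```python
weights = xr.DataArray(
    np.random.default_rng(0).random((4, 12)), dims=("x", "y")
)
# Same bins as above, but func="sum" accumulates weights per bin
xarray_reduce(
    weights,
    da.labels1,
    da.labels2,
    func="sum",
    expected_groups=(
        pd.IntervalIndex.from_breaks([-0.5, 4.5, 6.5, 8.9]),
        pd.IntervalIndex.from_breaks([0.5, 1.5, 1.9]),
    ),
)
```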
## Resampling

Use the xarray interface, i.e. `da.resample(time="M").mean()`.

Optionally pass [`method="blockwise"`](method-blockwise): `da.resample(time="M").mean(method="blockwise")`.
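A minimal illustration (with flox installed, a recent xarray dispatches this resampling reduction to flox automatically):

```python
time = pd.date_range("2001-01-01", periods=365, freq="D")
ts = xr.DataArray(np.arange(365.0), dims="time", coords={"time": time})

# Monthly means via xarray's resample; flox does the groupby under the hood
ts.resample(time="M").mean()
```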

flox/core.py (3 additions, 3 deletions)

```diff
@@ -258,7 +258,7 @@ def rechunk_for_cohorts(
         Labels at which we always start a new chunk. For
         the example ``labels`` array, this would be `1`.
     chunksize : int, optional
-        nominal chunk size. Chunk size is exceded when the label
+        nominal chunk size. Chunk size is exceeded when the label
         in ``force_new_chunk_at`` is less than ``chunksize//2`` elements away.
         If None, uses median chunksize along axis.

@@ -447,7 +447,7 @@ def factorize_(
     for groupvar, expect in zip(by, expected_groups):
         flat = groupvar.reshape(-1)
         if isinstance(expect, pd.RangeIndex):
-            # idx is a view of the original `by` aray
+            # idx is a view of the original `by` array
             # copy here so we don't have a race condition with the
             # group_idx[nanmask] = nan_sentinel assignment later
             # this is important in shared-memory parallelism with dask
@@ -861,7 +861,7 @@ def _simple_combine(
     2. _expand_dims was used to insert an extra axis DUMMY_AXIS
     3. Here we concatenate along DUMMY_AXIS, and then call the combine function along
        DUMMY_AXIS
-    4. At the final agggregate step, we squeeze out DUMMY_AXIS
+    4. At the final aggregate step, we squeeze out DUMMY_AXIS
     """
     from dask.array.core import deepfirst
     from dask.utils import deepmap
```
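For orientation, a hedged sketch of a call driven by exactly these documented parameters (names follow the docstring; values are illustrative):

```python
import dask.array as da
import numpy as np

import flox

labels = np.tile(np.arange(1, 13), 2)  # repeating monthly labels
array = da.ones((24,), chunks=5)

# Start a new chunk whenever label 1 reappears; the nominal chunksize may be
# exceeded when a forced boundary is fewer than chunksize//2 elements away
rechunked = flox.core.rechunk_for_cohorts(
    array, axis=-1, labels=labels, force_new_chunk_at=1, chunksize=4
)
rechunked.chunks
```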

flox/xarray.py (2 additions, 2 deletions)

```diff
@@ -347,7 +347,7 @@ def wrapper(array, *by, func, skipna, core_dims, **kwargs):
         array, *by = _broadcast_size_one_dims(array, *by, core_dims=core_dims)

         # Handle skipna here because I need to know dtype to make a good default choice.
-        # We cannnot handle this easily for xarray Datasets in xarray_reduce
+        # We cannot handle this easily for xarray Datasets in xarray_reduce
         if skipna and func in ["all", "any", "count"]:
             raise ValueError(f"skipna cannot be truthy for {func} reductions.")

@@ -511,7 +511,7 @@ def rechunk_for_cohorts(
         Labels at which we always start a new chunk. For
         the example ``labels`` array, this would be `1`.
     chunksize : int, optional
-        nominal chunk size. Chunk size is exceded when the label
+        nominal chunk size. Chunk size is exceeded when the label
         in ``force_new_chunk_at`` is less than ``chunksize//2`` elements away.
         If None, uses median chunksize along ``dim``.
```

pyproject.toml (5 additions, 0 deletions)

```diff
@@ -53,3 +53,8 @@ ignore_missing_imports = true

 [tool.pytest.ini_options]
 addopts = "--tb=short"
+
+
+[tool.codespell]
+ignore-words-list = "nd,nax"
+skip = "*.html"
```
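The `ignore-words-list` keeps codespell from flagging intentional identifiers such as `nd` and `nax`, and `skip = "*.html"` excludes generated pages; with the `tomli` hook dependency added above, codespell picks this table up automatically when run via pre-commit.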
