
Commit 235b2e5

Optimize slice_slice for faster isel of huge datasets (#4560)
* optimize slice_slice for faster isel of huge datasets
* satisfy isort
* update whats-new.rst
* add benchmark for huge axis small index
* add some more comments
* fix benchmark
* lint
* Update doc/whats-new.rst
* use range instead of computing slice length
* compare to 0

Co-authored-by: Maximilian Roos <[email protected]>
1 parent 0b0fb40 commit 235b2e5
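The core of this commit is composing two slices arithmetically instead of materializing the first slice as an index array. Below is a minimal self-contained sketch of that idea in plain Python; the name `compose_slices` and the demo values are illustrative, not xarray's API:

```python
# Sketch of the slice-composition idea behind this commit: instead of
# expanding the first slice into a concrete index array (O(n) in the
# dimension size), compute the composed slice arithmetically (O(1)).
# `compose_slices` is an illustrative name, not part of xarray.

def compose_slices(old, applied, size):
    # slice.indices() clamps start/stop into range and fills in defaults,
    # so the arithmetic below only sees normalized values.
    old = slice(*old.indices(size))
    n = len(range(old.start, old.stop, old.step))
    if n == 0:
        return slice(0)  # first slice selects nothing
    applied = slice(*applied.indices(n))
    start = old.start + applied.start * old.step
    if start < 0:
        return slice(0)  # second slice selects nothing (negative-step case)
    stop = old.start + applied.stop * old.step
    if stop < 0:
        stop = None  # a negative stop would wrap around; None means "to the end"
    return slice(start, stop, old.step * applied.step)

data = list(range(1000))
composed = compose_slices(slice(3, 900, 7), slice(10, 50, 2), len(data))
assert data[3:900:7][10:50:2] == data[composed]
```

Applying the composed slice in one step gives the same elements as slicing twice, without ever building the intermediate index array.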

3 files changed: +50 −15 lines


asv_bench/benchmarks/indexing.py

Lines changed: 21 additions & 0 deletions
@@ -1,3 +1,5 @@
+import os
+
 import numpy as np
 import pandas as pd

@@ -138,3 +140,22 @@ def setup(self):

     def time_indexing(self):
         self.ds.isel(time=self.time_filter)
+
+
+class HugeAxisSmallSliceIndexing:
+    # https://github.com/pydata/xarray/pull/4560
+    def setup(self):
+        self.filepath = "test_indexing_huge_axis_small_slice.nc"
+        if not os.path.isfile(self.filepath):
+            xr.Dataset(
+                {"a": ("x", np.arange(10_000_000))},
+                coords={"x": np.arange(10_000_000)},
+            ).to_netcdf(self.filepath, format="NETCDF4")
+
+        self.ds = xr.open_dataset(self.filepath)
+
+    def time_indexing(self):
+        self.ds.isel(x=slice(100))
+
+    def cleanup(self):
+        self.ds.close()
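For context on what this benchmark measures, here is a rough standalone comparison (not the asv harness) of the old expand-to-array approach against slice arithmetic. `old_style` and `new_style` are illustrative simplifications; `new_style` omits the empty-result and negative-step edge cases the real patch handles, and the timings are machine-dependent.

```python
import timeit

SIZE = 1_000_000  # stand-in for the benchmark's 10-million-element axis

def old_style(old, applied, size):
    # Old approach (simplified): expand the first slice into a concrete
    # index sequence, then index that sequence -- O(size) work.
    return list(range(size))[old][applied]

def new_style(old, applied, size):
    # New approach (simplified): pure slice arithmetic -- O(1) work.
    old = slice(*old.indices(size))
    n = len(range(old.start, old.stop, old.step))
    applied = slice(*applied.indices(n))
    return slice(old.start + applied.start * old.step,
                 old.start + applied.stop * old.step,
                 old.step * applied.step)

# Both select the first 100 elements of the full axis.
t_old = timeit.timeit(lambda: old_style(slice(None), slice(100), SIZE), number=5)
t_new = timeit.timeit(lambda: new_style(slice(None), slice(100), SIZE), number=5)
print(f"expand-to-array: {t_old:.4f}s  slice arithmetic: {t_new:.6f}s")
```

The gap grows linearly with axis size, which is why the benchmark uses a deliberately huge dimension and a tiny slice.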

doc/whats-new.rst

Lines changed: 2 additions & 1 deletion
@@ -64,6 +64,7 @@ Bug fixes
   By `Mathias Hauser <https://github.com/mathause>`_.
 - :py:func:`combine_by_coords` now raises an informative error when passing coordinates
   with differing calendars (:issue:`4495`). By `Mathias Hauser <https://github.com/mathause>`_.
+- Improve performance where reading small slices from huge dimensions was slower than necessary (:pull:`4560`). By `Dion Häfner <https://github.com/dionhaefner>`_.

 Documentation
 ~~~~~~~~~~~~~
@@ -72,7 +73,7 @@ Documentation
   (:pull:`4532`);
   By `Jimmy Westling <https://github.com/illviljan>`_.
 - Raise a more informative error when :py:meth:`DataArray.to_dataframe` is
-    is called on a scalar, (:issue:`4228`);
+  is called on a scalar, (:issue:`4228`);
   By `Pieter Gijsbers <https://github.com/pgijsbers>`_.
 - Fix grammar and typos in the :doc:`contributing` guide (:pull:`4545`).
   By `Sahid Velji <https://github.com/sahidvelji>`_.

xarray/core/indexing.py

Lines changed: 27 additions & 14 deletions
@@ -275,25 +275,38 @@ def remap_label_indexers(data_obj, indexers, method=None, tolerance=None):
     return pos_indexers, new_indexes


+def _normalize_slice(sl, size):
+    """Ensure that given slice only contains positive start and stop values
+    (stop can be -1 for full-size slices with negative steps, e.g. [-10::-1])"""
+    return slice(*sl.indices(size))
+
+
 def slice_slice(old_slice, applied_slice, size):
     """Given a slice and the size of the dimension to which it will be applied,
     index it with another slice to return a new slice equivalent to applying
     the slices sequentially
     """
-    step = (old_slice.step or 1) * (applied_slice.step or 1)
-
-    # For now, use the hack of turning old_slice into an ndarray to reconstruct
-    # the slice start and stop. This is not entirely ideal, but it is still
-    # definitely better than leaving the indexer as an array.
-    items = _expand_slice(old_slice, size)[applied_slice]
-    if len(items) > 0:
-        start = items[0]
-        stop = items[-1] + int(np.sign(step))
-        if stop < 0:
-            stop = None
-    else:
-        start = 0
-        stop = 0
+    old_slice = _normalize_slice(old_slice, size)
+
+    size_after_old_slice = len(range(old_slice.start, old_slice.stop, old_slice.step))
+    if size_after_old_slice == 0:
+        # nothing left after applying first slice
+        return slice(0)
+
+    applied_slice = _normalize_slice(applied_slice, size_after_old_slice)
+
+    start = old_slice.start + applied_slice.start * old_slice.step
+    if start < 0:
+        # nothing left after applying second slice
+        # (can only happen for old_slice.step < 0, e.g. [10::-1], [20:])
+        return slice(0)
+
+    stop = old_slice.start + applied_slice.stop * old_slice.step
+    if stop < 0:
+        stop = None
+
+    step = old_slice.step * applied_slice.step
+
     return slice(start, stop, step)
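One way to sanity-check the new arithmetic is to exhaustively compare it against sequential slicing on a small list, which exercises negative steps, clamped bounds, and empty results. The sketch below restates the patched function standalone (without importing xarray), under the assumption that plain-list slicing is the ground truth:

```python
import itertools

def _normalize_slice(sl, size):
    # Same trick as the patch: slice.indices() resolves None and
    # negative start/stop values against the given size.
    return slice(*sl.indices(size))

def slice_slice(old_slice, applied_slice, size):
    # Standalone re-statement of the patched function, for checking only.
    old_slice = _normalize_slice(old_slice, size)
    size_after = len(range(old_slice.start, old_slice.stop, old_slice.step))
    if size_after == 0:
        return slice(0)  # nothing left after the first slice
    applied_slice = _normalize_slice(applied_slice, size_after)
    start = old_slice.start + applied_slice.start * old_slice.step
    if start < 0:
        return slice(0)  # nothing left after the second slice
    stop = old_slice.start + applied_slice.stop * old_slice.step
    if stop < 0:
        stop = None  # negative stop would wrap; None means "to the end"
    return slice(start, stop, old_slice.step * applied_slice.step)

# Exhaustively compare against sequential slicing on a small axis.
size = 7
data = list(range(size))
vals = [None, -9, -3, -1, 0, 2, 5, 9]
steps = [None, -3, -1, 1, 2]
cases = [slice(*c) for c in itertools.product(vals, vals, steps)]
for old, applied in itertools.product(cases, repeat=2):
    composed = slice_slice(old, applied, size)
    assert data[old][applied] == data[composed], (old, applied)
print("all cases agree")
```

Because `slice(0)` is empty for every step sign, collapsing both "nothing left" branches to it keeps the composed result correct without tracking orientation.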