Skip to content

Commit 370d992

Browse files
authored
rerun_py.dataframe: add support for .filter_index_values (#7670)
### What - Add a new type helper for IndexValues (in general some variation of this will be useful for other APIs) - Added a dependency on `numpy` package for ArrayLike functionality - `numpy` has an old dep on ndarray. - This has already been fixed but not yet released: PyO3/rust-numpy#439 - ChunkedArrays are sadly more complicated (see the note in: https://docs.rs/arrow/latest/arrow/pyarrow/index.html) ### Checklist * [x] I have read and agree to [Contributor Guide](https://github.com/rerun-io/rerun/blob/main/CONTRIBUTING.md) and the [Code of Conduct](https://github.com/rerun-io/rerun/blob/main/CODE_OF_CONDUCT.md) * [x] I've included a screenshot or gif (if applicable) * [x] I have tested the web demo (if applicable): * Using examples from latest `main` build: [rerun.io/viewer](https://rerun.io/viewer/pr/7670?manifest_url=https://app.rerun.io/version/main/examples_manifest.json) * Using full set of examples from `nightly` build: [rerun.io/viewer](https://rerun.io/viewer/pr/7670?manifest_url=https://app.rerun.io/version/nightly/examples_manifest.json) * [x] The PR title and labels are set such as to maximize their usefulness for the next release's CHANGELOG * [x] If applicable, add a new check to the [release checklist](https://github.com/rerun-io/rerun/blob/main/tests/python/release_checklist)! * [x] I have noted any breaking changes to the log API in `CHANGELOG.md` and the migration guide - [PR Build Summary](https://build.rerun.io/pr/7670) - [Recent benchmark results](https://build.rerun.io/graphs/crates.html) - [Wasm size tracking](https://build.rerun.io/graphs/sizes.html) To run all checks from `main`, comment on the PR with `@rerun-bot full-check`.
1 parent 8df29e2 commit 370d992

File tree

8 files changed

+280
-45
lines changed

8 files changed

+280
-45
lines changed

Cargo.lock

Lines changed: 38 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3588,6 +3588,19 @@ version = "1.0.9"
35883588
source = "registry+https://github.com/rust-lang/crates.io-index"
35893589
checksum = "308d96db8debc727c3fd9744aac51751243420e46edf401010908da7f8d5e57c"
35903590

3591+
[[package]]
3592+
name = "ndarray"
3593+
version = "0.15.6"
3594+
source = "registry+https://github.com/rust-lang/crates.io-index"
3595+
checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32"
3596+
dependencies = [
3597+
"matrixmultiply",
3598+
"num-complex",
3599+
"num-integer",
3600+
"num-traits",
3601+
"rawpointer",
3602+
]
3603+
35913604
[[package]]
35923605
name = "ndarray"
35933606
version = "0.16.1"
@@ -3609,7 +3622,7 @@ version = "0.15.0"
36093622
source = "registry+https://github.com/rust-lang/crates.io-index"
36103623
checksum = "f093b3db6fd194718dcdeea6bd8c829417deae904e3fcc7732dabcd4416d25d8"
36113624
dependencies = [
3612-
"ndarray",
3625+
"ndarray 0.16.1",
36133626
"rand",
36143627
"rand_distr",
36153628
]
@@ -3846,6 +3859,21 @@ version = "0.4.0"
38463859
source = "registry+https://github.com/rust-lang/crates.io-index"
38473860
checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
38483861

3862+
[[package]]
3863+
name = "numpy"
3864+
version = "0.21.0"
3865+
source = "registry+https://github.com/rust-lang/crates.io-index"
3866+
checksum = "ec170733ca37175f5d75a5bea5911d6ff45d2cd52849ce98b685394e4f2f37f4"
3867+
dependencies = [
3868+
"libc",
3869+
"ndarray 0.15.6",
3870+
"num-complex",
3871+
"num-integer",
3872+
"num-traits",
3873+
"pyo3",
3874+
"rustc-hash",
3875+
]
3876+
38493877
[[package]]
38503878
name = "objc"
38513879
version = "0.2.7"
@@ -5457,7 +5485,7 @@ dependencies = [
54575485
"document-features",
54585486
"itertools 0.13.0",
54595487
"libc",
5460-
"ndarray",
5488+
"ndarray 0.16.1",
54615489
"ndarray-rand",
54625490
"once_cell",
54635491
"parking_lot",
@@ -5647,7 +5675,7 @@ dependencies = [
56475675
"bytemuck",
56485676
"egui",
56495677
"half 2.3.1",
5650-
"ndarray",
5678+
"ndarray 0.16.1",
56515679
"re_chunk_store",
56525680
"re_data_ui",
56535681
"re_log_types",
@@ -5798,7 +5826,7 @@ dependencies = [
57985826
"linked-hash-map",
57995827
"mime_guess2",
58005828
"mint",
5801-
"ndarray",
5829+
"ndarray 0.16.1",
58025830
"nohash-hasher",
58035831
"once_cell",
58045832
"ply-rs",
@@ -6015,7 +6043,7 @@ dependencies = [
60156043
"indexmap 2.1.0",
60166044
"itertools 0.13.0",
60176045
"linked-hash-map",
6018-
"ndarray",
6046+
"ndarray 0.16.1",
60196047
"nohash-hasher",
60206048
"once_cell",
60216049
"parking_lot",
@@ -6289,6 +6317,7 @@ dependencies = [
62896317
"infer",
62906318
"itertools 0.13.0",
62916319
"mimalloc",
6320+
"numpy",
62926321
"once_cell",
62936322
"parking_lot",
62946323
"pyo3",
@@ -6461,7 +6490,7 @@ dependencies = [
64616490
"clap",
64626491
"half 2.3.1",
64636492
"image",
6464-
"ndarray",
6493+
"ndarray 0.16.1",
64656494
"re_log",
64666495
"rerun",
64676496
]
@@ -6533,7 +6562,7 @@ version = "0.19.0-alpha.1+dev"
65336562
dependencies = [
65346563
"anyhow",
65356564
"clap",
6536-
"ndarray",
6565+
"ndarray 0.16.1",
65376566
"re_log",
65386567
"rerun",
65396568
]
@@ -7016,7 +7045,7 @@ name = "snippets"
70167045
version = "0.19.0-alpha.1+dev"
70177046
dependencies = [
70187047
"itertools 0.13.0",
7019-
"ndarray",
7048+
"ndarray 0.16.1",
70207049
"rand",
70217050
"rand_distr",
70227051
"re_build_tools",
@@ -7188,7 +7217,7 @@ dependencies = [
71887217
"clap",
71897218
"glam",
71907219
"itertools 0.13.0",
7191-
"ndarray",
7220+
"ndarray 0.16.1",
71927221
"ndarray-rand",
71937222
"rand",
71947223
"re_log",

Cargo.toml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,7 @@ memory-stats = "1.1"
195195
# This version is not pinned to avoid creating version requirement conflicts,
196196
# but other packages pin it to exactly "=0.1.37"
197197
mimalloc = "0.1.37"
198-
mime_guess2 = "2.0" # infer MIME type by file extension, and map mime to file extension
198+
mime_guess2 = "2.0" # infer MIME type by file extension, and map mime to file extension
199199
mint = "0.5.9"
200200
re_mp4 = "0.1.0"
201201
natord = "1.0.9"
@@ -206,6 +206,9 @@ nohash-hasher = "0.2"
206206
notify = { version = "6.1.1", features = ["macos_kqueue"] }
207207
num-derive = "0.4"
208208
num-traits = "0.2"
209+
# TODO(#7676) This pulls in an older ndarray. Remove it from the skip list in `deny.toml` and
210+
# close the issue when updating to 0.22.
211+
numpy = "0.21"
209212
once_cell = "1.17" # No lazy_static - use `std::sync::OnceLock` or `once_cell` instead
210213
ordered-float = "4.2"
211214
parking_lot = "0.12"

deny.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ skip = [
5555
{ name = "hashbrown" }, # Old version used by polar-rs
5656
{ name = "libloading" }, # Old version used by ash (vulkan binding), newer version used by khronos-egl
5757
{ name = "memoffset" }, # Small crate
58+
{ name = "ndarray" }, # Needed by `numpy<0.22` in `rerun_py`
5859
{ name = "prettyplease" }, # Old version being used by prost
5960
{ name = "pulldown-cmark" }, # Build-dependency via `ply-rs` (!). TODO(emilk): use a better crate for .ply parsing
6061
{ name = "raw-window-handle" }, # Pretty small crate; some crates still on old version

rerun_py/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ infer.workspace = true
6767
# TODO(#5875): `mimalloc` starts leaking OS pages starting with `0.1.38`.
6868
# When the bug is fixed, change this back to `mimalloc = { workspace = true, …`.
6969
mimalloc = { version = "=0.1.37", features = ["local_dynamic_tls"] }
70+
numpy.workspace = true
7071
once_cell.workspace = true
7172
parking_lot.workspace = true
7273
pyo3 = { workspace = true, features = ["abi3-py38"] }

rerun_py/rerun_bindings/rerun_bindings.pyi

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ from typing import Optional, Sequence
33

44
import pyarrow as pa
55

6-
from .types import AnyColumn, ComponentLike, ViewContentsLike
6+
from .types import AnyColumn, ComponentLike, IndexValuesLike, ViewContentsLike
77

88
class IndexColumnDescriptor:
99
"""A column containing the index values for when the component data was updated."""
@@ -57,6 +57,16 @@ class RecordingView:
5757
"""Filter the view to only include data between the given index time values."""
5858
...
5959

60+
def filter_index_values(self, values: IndexValuesLike) -> RecordingView:
61+
"""
62+
Filter the view to only include data at the given index values.
63+
64+
This requires index values to be a precise match. Index values in Rerun are
65+
represented as i64 sequence counts or nanoseconds. This API does not expose an interface
66+
in floating point seconds, as the numerical conversion would risk false mismatches.
67+
"""
68+
...
69+
6070
def select(self, *args: AnyColumn, columns: Optional[Sequence[AnyColumn]] = None) -> pa.RecordBatchReader: ...
6171

6272
class Recording:

rerun_py/rerun_bindings/types.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,23 +2,27 @@
22

33
from typing import TYPE_CHECKING, Sequence, TypeAlias, Union
44

5+
import numpy as np
6+
import numpy.typing as npt
7+
import pyarrow as pa
8+
59
if TYPE_CHECKING:
610
from rerun._baseclasses import ComponentMixin
711

812
from .rerun_bindings import (
913
ComponentColumnDescriptor as ComponentColumnDescriptor,
1014
ComponentColumnSelector as ComponentColumnSelector,
11-
TimeColumnDescriptor as TimeColumnDescriptor,
12-
TimeColumnSelector as TimeColumnSelector,
15+
IndexColumnSelector as IndexColumnDescriptor,
16+
IndexColumnSelector as IndexColumnSelector,
1317
)
1418

1519
ComponentLike: TypeAlias = Union[str, type["ComponentMixin"]]
1620

1721
AnyColumn: TypeAlias = Union[
18-
"TimeColumnDescriptor",
1922
"ComponentColumnDescriptor",
20-
"TimeColumnSelector",
2123
"ComponentColumnSelector",
24+
"IndexColumnDescriptor",
25+
"IndexColumnSelector",
2226
]
2327

2428
AnyComponentColumn: TypeAlias = Union[
@@ -30,3 +34,5 @@
3034
str,
3135
dict[str, Union[AnyColumn, Sequence[ComponentLike]]],
3236
]
37+
38+
IndexValuesLike: TypeAlias = Union[npt.NDArray[np.int_], pa.Int64Array]

rerun_py/src/dataframe.rs

Lines changed: 116 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,10 @@
55
use std::collections::{BTreeMap, BTreeSet};
66

77
use arrow::{
8-
array::{RecordBatchIterator, RecordBatchReader},
8+
array::{make_array, Array, ArrayData, Int64Array, RecordBatchIterator, RecordBatchReader},
99
pyarrow::PyArrowType,
1010
};
11+
use numpy::PyArrayMethods as _;
1112
use pyo3::{
1213
exceptions::{PyRuntimeError, PyTypeError, PyValueError},
1314
prelude::*,
@@ -195,6 +196,108 @@ impl AnyComponentColumn {
195196
}
196197
}
197198

199+
#[derive(FromPyObject)]
200+
enum IndexValuesLike<'py> {
201+
PyArrow(PyArrowType<ArrayData>),
202+
NumPy(numpy::PyArrayLike1<'py, i64>),
203+
204+
// Catch all to support ChunkedArray and other types
205+
#[pyo3(transparent)]
206+
CatchAll(Bound<'py, PyAny>),
207+
}
208+
209+
impl<'py> IndexValuesLike<'py> {
210+
fn to_index_values(&self) -> PyResult<BTreeSet<re_chunk_store::TimeInt>> {
211+
match self {
212+
Self::PyArrow(array) => {
213+
let array = make_array(array.0.clone());
214+
215+
let int_array = array.as_any().downcast_ref::<Int64Array>().ok_or_else(|| {
216+
PyTypeError::new_err("pyarrow.Array for IndexValuesLike must be of type int64.")
217+
})?;
218+
219+
let values: BTreeSet<re_chunk_store::TimeInt> = int_array
220+
.iter()
221+
.map(|v| {
222+
v.map_or_else(
223+
|| re_chunk_store::TimeInt::STATIC,
224+
// The use of temporal here should be fine even if the data is
225+
// not actually temporal. The important thing is we are converting
226+
// from an i64 input
227+
re_chunk_store::TimeInt::new_temporal,
228+
)
229+
})
230+
.collect();
231+
232+
if values.len() != int_array.len() {
233+
return Err(PyValueError::new_err("Index values must be unique."));
234+
}
235+
236+
Ok(values)
237+
}
238+
Self::NumPy(array) => {
239+
let values: BTreeSet<re_chunk_store::TimeInt> = array
240+
.readonly()
241+
.as_array()
242+
.iter()
243+
// The use of temporal here should be fine even if the data is
244+
// not actually temporal. The important thing is we are converting
245+
// from an i64 input
246+
.map(|v| re_chunk_store::TimeInt::new_temporal(*v))
247+
.collect();
248+
249+
if values.len() != array.len()? {
250+
return Err(PyValueError::new_err("Index values must be unique."));
251+
}
252+
253+
Ok(values)
254+
}
255+
Self::CatchAll(any) => {
256+
// If any has the `.chunks` attribute, we can try each chunk as a pyarrow array
257+
if let Ok(chunks) = any.getattr("chunks") {
258+
let mut values = BTreeSet::new();
259+
for chunk in chunks.iter()? {
260+
let chunk = chunk?.extract::<PyArrowType<ArrayData>>()?;
261+
let array = make_array(chunk.0.clone());
262+
263+
let int_array =
264+
array.as_any().downcast_ref::<Int64Array>().ok_or_else(|| {
265+
PyTypeError::new_err(
266+
"pyarrow.Array for IndexValuesLike must be of type int64.",
267+
)
268+
})?;
269+
270+
values.extend(
271+
int_array
272+
.iter()
273+
.map(|v| {
274+
v.map_or_else(
275+
|| re_chunk_store::TimeInt::STATIC,
276+
// The use of temporal here should be fine even if the data is
277+
// not actually temporal. The important thing is we are converting
278+
// from an i64 input
279+
re_chunk_store::TimeInt::new_temporal,
280+
)
281+
})
282+
.collect::<BTreeSet<_>>(),
283+
);
284+
}
285+
286+
if values.len() != any.len()? {
287+
return Err(PyValueError::new_err("Index values must be unique."));
288+
}
289+
290+
Ok(values)
291+
} else {
292+
Err(PyTypeError::new_err(
293+
"IndexValuesLike must be a pyarrow.Array, pyarrow.ChunkedArray, or numpy.ndarray",
294+
))
295+
}
296+
}
297+
}
298+
}
299+
}
300+
198301
struct ComponentLike(re_sdk::ComponentName);
199302

200303
impl FromPyObject<'_> for ComponentLike {
@@ -438,6 +541,18 @@ impl PyRecordingView {
438541
query_expression,
439542
})
440543
}
544+
545+
fn filter_index_values(&self, values: IndexValuesLike<'_>) -> PyResult<Self> {
546+
let values = values.to_index_values()?;
547+
548+
let mut query_expression = self.query_expression.clone();
549+
query_expression.filtered_index_values = Some(values);
550+
551+
Ok(Self {
552+
recording: self.recording.clone(),
553+
query_expression,
554+
})
555+
}
441556
}
442557

443558
impl PyRecording {

0 commit comments

Comments
 (0)