Skip to content

Commit 0c7e5a4

Browse files
committed
add pagination to examples
1 parent e830013 commit 0c7e5a4

File tree

8 files changed

+635
-46
lines changed

8 files changed

+635
-46
lines changed

src/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ docs = [
3333
]
3434
test = ["pytest", "coverage", "pre-commit"]
3535
benchmark = ["requests"]
36-
dev = ["valor-lite[nlp, openai, mistral, benchmark, test, docs]"]
36+
dev = ["valor-lite[nlp, openai, mistral, benchmark, test, docs]", "pyarrow-stubs"]
3737

3838
[tool.black]
3939
line-length = 79

src/valor_lite/cache/compute.py

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
import heapq
22
import tempfile
33
from pathlib import Path
4-
from typing import Callable
4+
from typing import Callable, Generator
55

66
import pyarrow as pa
7+
import pyarrow.compute as pc
78

89
from valor_lite.cache.ephemeral import MemoryCacheReader, MemoryCacheWriter
910
from valor_lite.cache.persistent import FileCacheReader, FileCacheWriter
@@ -152,3 +153,56 @@ def sort(
152153
columns=columns,
153154
table_sort_override=table_sort_override,
154155
)
156+
157+
158+
def paginate_index(
    source: MemoryCacheReader | FileCacheReader,
    column_key: str,
    modifier: pc.Expression | None = None,
    limit: int | None = None,
    offset: int = 0,
) -> Generator[pa.Table, None, None]:
    """
    Paginate over the unique values of an index column.

    Across the whole iteration, only rows whose `column_key` value falls
    within the half-open page ``[offset, offset + limit)`` of the sorted
    unique-key index are yielded.

    Note this function expects unique keys to be fragment-aligned and in
    ascending order.

    Parameters
    ----------
    source : MemoryCacheReader | FileCacheReader
        Cache reader whose tables are iterated.
    column_key : str
        Name of the column whose unique values form the pagination index.
    modifier : pyarrow.compute.Expression, optional
        Additional row filter applied before pagination.
    limit : int, optional
        Maximum number of unique index values to yield rows for.
        ``None`` means no limit; ``0`` yields nothing.
    offset : int, default=0
        Number of unique index values to skip before yielding.

    Yields
    ------
    pyarrow.Table
        Tables filtered down to the requested page.

    Raises
    ------
    ValueError
        If `offset` or `limit` is negative.
    """
    if offset < 0 or (limit is not None and limit < 0):
        raise ValueError("offset and limit must be non-negative")

    # NOTE: row count is an upper bound on the number of unique keys, so
    # the fast-path comparisons below remain conservative.
    total = source.count_rows()
    if limit is None:
        limit = total
    elif limit == 0:
        # an explicit zero limit selects an empty page
        # (previously `limit if limit else total` treated 0 as "no limit")
        return

    # pagination broader than data scope
    if offset == 0 and limit >= total:
        yield from source.iterate_tables(filter=modifier)
        return
    elif offset >= total:
        return

    curr_idx = 0
    for tbl in source.iterate_tables(filter=modifier):
        if tbl.num_rows == 0:
            continue

        unique_values = pc.unique(tbl[column_key]).sort()  # type: ignore[reportAttributeAccessIssue]
        n_unique = len(unique_values)
        prev_idx = curr_idx
        curr_idx += n_unique

        # check for page overlap
        if curr_idx <= offset:
            # fragment lies entirely before the page
            continue
        elif prev_idx >= (offset + limit):
            # fragment lies entirely after the page; keys ascend, so stop
            return

        # apply any pagination conditions
        condition = pc.scalar(True)
        if prev_idx < offset and curr_idx > offset:
            # the page starts inside this fragment
            condition &= (
                pc.field(column_key) >= unique_values[offset - prev_idx]
            )
        if prev_idx < (offset + limit) and curr_idx > (offset + limit):
            # the page ends inside this fragment
            condition &= (
                pc.field(column_key) < unique_values[offset + limit - prev_idx]
            )

        yield tbl.filter(condition)

src/valor_lite/classification/evaluator.py

Lines changed: 42 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -528,23 +528,6 @@ def iterate_values(self, datums: pc.Expression | None = None):
528528
matches = tbl["match"].to_numpy()
529529
yield ids, scores, winners, matches
530530

531-
def iterate_values_with_tables(self, datums: pc.Expression | None = None):
532-
for tbl in self._reader.iterate_tables(filter=datums):
533-
ids = np.column_stack(
534-
[
535-
tbl[col].to_numpy()
536-
for col in [
537-
"datum_id",
538-
"gt_label_id",
539-
"pd_label_id",
540-
]
541-
]
542-
)
543-
scores = tbl["pd_score"].to_numpy()
544-
winners = tbl["pd_winner"].to_numpy()
545-
matches = tbl["match"].to_numpy()
546-
yield ids, scores, winners, matches, tbl
547-
548531
def compute_rocauc(self) -> dict[MetricType, list[Metric]]:
549532
"""
550533
Compute ROCAUC.
@@ -723,6 +706,8 @@ def compute_examples(
723706
score_thresholds: list[float] = [0.0],
724707
hardmax: bool = True,
725708
datums: pc.Expression | None = None,
709+
limit: int | None = None,
710+
offset: int = 0,
726711
) -> list[Metric]:
727712
"""
728713
Compute examples per datum.
@@ -737,6 +722,10 @@ def compute_examples(
737722
Toggles whether a hardmax is applied to predictions.
738723
datums : pyarrow.compute.Expression, optional
739724
Option to filter datums by an expression.
725+
limit : int, optional
726+
Option to set a limit to the number of returned datum examples.
727+
offset : int, default=0
728+
Option to offset where examples are being created in the datum index.
740729
741730
Returns
742731
-------
@@ -747,16 +736,29 @@ def compute_examples(
747736
raise ValueError("At least one score threshold must be passed.")
748737

749738
metrics = []
750-
for (
751-
ids,
752-
scores,
753-
winners,
754-
_,
755-
tbl,
756-
) in self.iterate_values_with_tables(datums=datums):
757-
if ids.size == 0:
739+
for tbl in compute.paginate_index(
740+
source=self._reader,
741+
column_key="datum_id",
742+
modifier=datums,
743+
limit=limit,
744+
offset=offset,
745+
):
746+
if tbl.num_rows == 0:
758747
continue
759748

749+
ids = np.column_stack(
750+
[
751+
tbl[col].to_numpy()
752+
for col in [
753+
"datum_id",
754+
"gt_label_id",
755+
"pd_label_id",
756+
]
757+
]
758+
)
759+
scores = tbl["pd_score"].to_numpy()
760+
winners = tbl["pd_winner"].to_numpy()
761+
760762
# extract external identifiers
761763
index_to_datum_id = create_mapping(
762764
tbl, ids, 0, "datum_id", "datum_uid"
@@ -829,16 +831,23 @@ def compute_confusion_matrix_with_examples(
829831
)
830832
for score_idx, score_thresh in enumerate(score_thresholds)
831833
}
832-
for (
833-
ids,
834-
scores,
835-
winners,
836-
_,
837-
tbl,
838-
) in self.iterate_values_with_tables(datums=datums):
839-
if ids.size == 0:
834+
for tbl in self._reader.iterate_tables(filter=datums):
835+
if tbl.num_rows == 0:
840836
continue
841837

838+
ids = np.column_stack(
839+
[
840+
tbl[col].to_numpy()
841+
for col in [
842+
"datum_id",
843+
"gt_label_id",
844+
"pd_label_id",
845+
]
846+
]
847+
)
848+
scores = tbl["pd_score"].to_numpy()
849+
winners = tbl["pd_winner"].to_numpy()
850+
842851
# extract external identifiers
843852
index_to_datum_id = create_mapping(
844853
tbl, ids, 0, "datum_id", "datum_uid"

src/valor_lite/object_detection/evaluator.py

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -600,6 +600,8 @@ def compute_examples(
600600
iou_thresholds: list[float],
601601
score_thresholds: list[float],
602602
datums: pc.Expression | None = None,
603+
limit: int | None = None,
604+
offset: int = 0,
603605
) -> list[Metric]:
604606
"""
605607
Computes examples at various thresholds.
@@ -614,6 +616,10 @@ def compute_examples(
614616
A list of score thresholds to compute metrics over.
615617
datums : pyarrow.compute.Expression, optional
616618
Option to filter datums by an expression.
619+
limit : int, optional
620+
Option to set a limit to the number of returned datum examples.
621+
offset : int, default=0
622+
Option to offset where examples are being created in the datum index.
617623
618624
Returns
619625
-------
@@ -626,11 +632,6 @@ def compute_examples(
626632
raise ValueError("At least one score threshold must be passed.")
627633

628634
metrics = []
629-
tbl_columns = [
630-
"datum_uid",
631-
"gt_uid",
632-
"pd_uid",
633-
]
634635
numeric_columns = [
635636
"datum_id",
636637
"gt_id",
@@ -640,14 +641,20 @@ def compute_examples(
640641
"iou",
641642
"pd_score",
642643
]
643-
for tbl, pairs in self._detailed_reader.iterate_tables_with_arrays(
644-
columns=tbl_columns + numeric_columns,
645-
numeric_columns=numeric_columns,
646-
filter=datums,
644+
for tbl in compute.paginate_index(
645+
source=self._detailed_reader,
646+
column_key="datum_id",
647+
modifier=datums,
648+
limit=limit,
649+
offset=offset,
647650
):
648-
if pairs.size == 0:
651+
if tbl.num_rows == 0:
649652
continue
650653

654+
pairs = np.column_stack(
655+
[tbl[col].to_numpy() for col in numeric_columns]
656+
)
657+
651658
index_to_datum_id = {}
652659
index_to_groundtruth_id = {}
653660
index_to_prediction_id = {}

tests/classification/test_examples.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -681,3 +681,65 @@ def test_examples_without_hardmax_animal_example(
681681
assert m in expected_metrics
682682
for m in expected_metrics:
683683
assert m in actual_metrics
684+
685+
686+
def test_examples_with_color_example_paginated(
    loader: Loader,
    classifications_color_example: list[Classification],
):
    """Check datum examples when paginated with limit=3, offset=1."""
    loader.add_data(classifications_color_example)
    evaluator = loader.finalize()

    metrics = evaluator.compute_examples(
        score_thresholds=[0.5],
        limit=3,
        offset=1,
    )
    actual_metrics = [metric.to_dict() for metric in metrics]

    # the page (offset=1, limit=3) covers datums uid1 through uid3
    expected_metrics = [
        {
            "type": "Examples",
            "value": {
                "datum_id": datum_uid,
                "true_positives": [],
                "false_positives": false_positives,
                "false_negatives": false_negatives,
            },
            "parameters": {
                "score_threshold": 0.5,
                "hardmax": True,
            },
        }
        for datum_uid, false_positives, false_negatives in [
            ("uid1", ["blue"], ["white"]),
            ("uid2", [], ["red"]),
            ("uid3", ["white"], ["blue"]),
        ]
    ]
    for metric in actual_metrics:
        assert metric in expected_metrics
    for metric in expected_metrics:
        assert metric in actual_metrics

tests/common/conftest.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
from pathlib import Path
2+
from typing import Callable
23

4+
import pyarrow as pa
35
import pytest
46

57
from valor_lite.cache.ephemeral import MemoryCacheWriter
@@ -20,7 +22,9 @@
2022
"in-memory_small_chunks",
2123
],
2224
)
23-
def create_writer(request, tmp_path: Path):
25+
def create_writer(
26+
request, tmp_path: Path
27+
) -> Callable[[pa.Schema], MemoryCacheWriter | FileCacheWriter]:
2428
file_type, batch_size, rows_per_file = request.param
2529
match file_type:
2630
case "memory":
@@ -35,3 +39,5 @@ def create_writer(request, tmp_path: Path):
3539
batch_size=batch_size,
3640
rows_per_file=rows_per_file,
3741
)
42+
case unknown:
43+
raise RuntimeError(unknown)

0 commit comments

Comments
 (0)