Skip to content

Commit 8a8ab86

Browse files
authored
Incremental indexing: unification, performance, maintenance APIs (#11178)
Just the public-facing API changes. All the important stuff is implemented by the sibling PR: rerun-io/dataplatform#1708. Breaking Protobuf changes only apply to the JSON representation, which isn't used anywhere.
1 parent 3c5569b commit 8a8ab86

File tree

8 files changed

+50
-132
lines changed

8 files changed

+50
-132
lines changed

crates/store/re_protos/proto/rerun/v1alpha1/cloud.proto

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -84,9 +84,6 @@ service RerunCloudService {
8484
// Creates a custom index for a specific column (vector search, full-text search, etc).
8585
rpc CreateIndex(CreateIndexRequest) returns (CreateIndexResponse) {}
8686

87-
// Recreate an index with the same configuration but (potentially) new data.
88-
rpc ReIndex(ReIndexRequest) returns (ReIndexResponse) {}
89-
9087
/* Queries */
9188

9289
// Search a previously created index.
@@ -291,15 +288,6 @@ message CreateIndexResponse {
291288
rerun.common.v1alpha1.DataframePart data = 1;
292289
}
293290

294-
message ReIndexRequest {
295-
reserved 1;
296-
reserved "dataset_id";
297-
}
298-
299-
message ReIndexResponse {
300-
rerun.common.v1alpha1.DataframePart data = 1;
301-
}
302-
303291
message IndexConfig {
304292
// what kind of index do we want to create and what are its index specific properties.
305293
IndexProperties properties = 1;
@@ -623,28 +611,38 @@ message ScanTableResponse {
623611
message DoMaintenanceRequest {
624612
rerun.common.v1alpha1.EntryId dataset_id = 1;
625613

626-
// Create the acceleration structures for temporal queries.
614+
// Optimize all builtin and user-defined indexes on this dataset.
615+
//
616+
// This merges all individual index deltas back into the main index, improving runtime performance
617+
// of all indexes.
618+
bool optimize_indexes = 2;
619+
620+
// Retrain all user-defined indexes on this dataset from scratch.
627621
//
628-
// This will recreate all scalar indexes from scratch everytime.
622+
// This retrains all user-defined indexes from scratch for optimal runtime performance.
623+
// This is faster than re-creating the indexes, and automatically keeps track of their configurations.
629624
//
630-
// TODO(cmc): support incremental scalar indexing & index compaction
631-
bool build_scalar_indexes = 2;
625+
// This implies `optimize_indexes`.
626+
bool retrain_indexes = 6;
632627

633628
// Compact the underlying Lance fragments, for all Rerun Manifests.
634629
//
635630
// Hardcoded to the default (optimal) settings.
636631
bool compact_fragments = 3;
637632

638633
// If set, all Lance fragments older than this date will be removed, for all Rerun Manifests.
634+
//
639635
// If the requested date is more recent than 1 hour ago, it will be ignored and a
640636
timestamp of 1 hour ago will be used instead. This is to prevent still-used files (such as
641637
recent transaction files) from being removed, which would cause Lance Dataset update issues.
638+
//
642639
// See https://docs.rs/lance/latest/lance/dataset/cleanup/index.html
643640
// and https://docs.rs/lance/latest/lance/dataset/cleanup/fn.cleanup_old_versions.html
644641
google.protobuf.Timestamp cleanup_before = 4;
645642

646643
// Override default platform behavior and allow cleanup of recent files. This will respect
647644
// the value of `cleanup_before` timestamp even if it's more recent than 1 hour.
645+
//
648646
// ⚠️ Do not ever use this unless you know exactly what you're doing. Improper use will lead to data loss.
649647
bool unsafe_allow_recent_cleanup = 5;
650648
}

crates/store/re_protos/src/v1alpha1/rerun.cloud.v1alpha1.ext.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,8 @@ impl TryFrom<crate::cloud::v1alpha1::GetChunksRequest> for GetChunksRequest {
113113
#[derive(Debug, Clone)]
114114
pub struct DoMaintenanceRequest {
115115
pub dataset_id: Option<crate::common::v1alpha1::EntryId>,
116-
pub build_scalar_indexes: bool,
116+
pub optimize_indexes: bool,
117+
pub retrain_indexes: bool,
117118
pub compact_fragments: bool,
118119
pub cleanup_before: Option<jiff::Timestamp>,
119120
pub unsafe_allow_recent_cleanup: bool,
@@ -123,7 +124,8 @@ impl From<DoMaintenanceRequest> for crate::cloud::v1alpha1::DoMaintenanceRequest
123124
fn from(value: DoMaintenanceRequest) -> Self {
124125
Self {
125126
dataset_id: value.dataset_id,
126-
build_scalar_indexes: value.build_scalar_indexes,
127+
optimize_indexes: value.optimize_indexes,
128+
retrain_indexes: value.retrain_indexes,
127129
compact_fragments: value.compact_fragments,
128130
cleanup_before: value.cleanup_before.map(|ts| prost_types::Timestamp {
129131
seconds: ts.as_second(),

crates/store/re_protos/src/v1alpha1/rerun.cloud.v1alpha1.rs

Lines changed: 15 additions & 95 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/store/re_redap_client/src/connection_client.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -357,7 +357,8 @@ where
357357
pub async fn do_maintenance(
358358
&mut self,
359359
dataset_id: EntryId,
360-
build_scalar_indexes: bool,
360+
optimize_indexes: bool,
361+
retrain_indexes: bool,
361362
compact_fragments: bool,
362363
cleanup_before: Option<jiff::Timestamp>,
363364
unsafe_allow_recent_cleanup: bool,
@@ -366,7 +367,8 @@ where
366367
.do_maintenance(tonic::Request::new(
367368
re_protos::cloud::v1alpha1::ext::DoMaintenanceRequest {
368369
dataset_id: Some(dataset_id.into()),
369-
build_scalar_indexes,
370+
optimize_indexes,
371+
retrain_indexes,
370372
compact_fragments,
371373
cleanup_before,
372374
unsafe_allow_recent_cleanup,

crates/store/re_server/src/rerun_cloud.rs

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -594,16 +594,6 @@ impl RerunCloudService for RerunCloudHandler {
594594
Err(tonic::Status::unimplemented("create_index not implemented"))
595595
}
596596

597-
async fn re_index(
598-
&self,
599-
_request: tonic::Request<re_protos::cloud::v1alpha1::ReIndexRequest>,
600-
) -> std::result::Result<
601-
tonic::Response<re_protos::cloud::v1alpha1::ReIndexResponse>,
602-
tonic::Status,
603-
> {
604-
Err(tonic::Status::unimplemented("re_index not implemented"))
605-
}
606-
607597
/* Queries */
608598

609599
type SearchDatasetStream = SearchDatasetResponseStream;

rerun_py/rerun_bindings/rerun_bindings.pyi

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1517,7 +1517,8 @@ class DatasetEntry(Entry):
15171517

15181518
def do_maintenance(
15191519
self,
1520-
build_scalar_index: bool = False,
1520+
optimize_indexes: bool = False,
1521+
retrain_indexes: bool = False,
15211522
compact_fragments: bool = False,
15221523
cleanup_before: datetime | None = None,
15231524
unsafe_allow_recent_cleanup: bool = False,

rerun_py/src/catalog/connection_handle.rs

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -300,12 +300,13 @@ impl ConnectionHandle {
300300
}
301301

302302
#[tracing::instrument(level = "info", skip_all)]
303-
#[allow(clippy::fn_params_excessive_bools)]
303+
#[allow(clippy::fn_params_excessive_bools, clippy::too_many_arguments)]
304304
pub fn do_maintenance(
305305
&self,
306306
py: Python<'_>,
307307
dataset_id: EntryId,
308-
build_scalar_indexes: bool,
308+
optimize_indexes: bool,
309+
retrain_indexes: bool,
309310
compact_fragments: bool,
310311
cleanup_before: Option<jiff::Timestamp>,
311312
unsafe_allow_recent_cleanup: bool,
@@ -317,7 +318,8 @@ impl ConnectionHandle {
317318
.await?
318319
.do_maintenance(
319320
dataset_id,
320-
build_scalar_indexes,
321+
optimize_indexes,
322+
retrain_indexes,
321323
compact_fragments,
322324
cleanup_before,
323325
unsafe_allow_recent_cleanup,

rerun_py/src/catalog/dataset_entry.rs

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -729,7 +729,8 @@ impl PyDatasetEntry {
729729

730730
/// Perform maintenance tasks on the datasets.
731731
#[pyo3(signature = (
732-
build_scalar_index = false,
732+
optimize_indexes = false,
733+
retrain_indexes = false,
733734
compact_fragments = false,
734735
cleanup_before = None,
735736
unsafe_allow_recent_cleanup = false,
@@ -739,7 +740,8 @@ impl PyDatasetEntry {
739740
fn do_maintenance(
740741
self_: PyRef<'_, Self>,
741742
py: Python<'_>,
742-
build_scalar_index: bool,
743+
optimize_indexes: bool,
744+
retrain_indexes: bool,
743745
compact_fragments: bool,
744746
cleanup_before: Option<Bound<'_, PyAny>>,
745747
unsafe_allow_recent_cleanup: bool,
@@ -766,7 +768,8 @@ impl PyDatasetEntry {
766768
connection.do_maintenance(
767769
py,
768770
dataset_id,
769-
build_scalar_index,
771+
optimize_indexes,
772+
retrain_indexes,
770773
compact_fragments,
771774
cleanup_before,
772775
unsafe_allow_recent_cleanup,

0 commit comments

Comments
 (0)