Skip to content

Commit 9908271

Browse files
committed
[#28911] DocDB: Remove DummyANN from DocDB
Summary: DummyANN was created to make it possible to build the Postgres-side part of the vector index while DocDB still lacked vector index support. DummyANN is no longer used, and it only costs extra time whenever the surrounding code is refactored or its APIs change. This diff removes DummyANN from DocDB. **Upgrade/rollback safety:** Marked the test-only enum entry as deprecated. Jira: DB-18634 Test Plan: Jenkins Reviewers: arybochkin Reviewed By: arybochkin Subscribers: ybase, yql Tags: #jenkins-ready Differential Revision: https://phorge.dev.yugabyte.com/D47396
1 parent 1380d80 commit 9908271

File tree

10 files changed

+13
-419
lines changed

10 files changed

+13
-419
lines changed

src/postgres/third-party-extensions/pgvector/test/expected/yb.orig.index.out

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,7 @@ CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3));
99
ERROR: all nodes in the cluster need to upgrade before creating a vector table
1010
SET yb_enable_docdb_vector_type = true;
1111
CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3)) SPLIT INTO 1 TABLETS;
12-
CREATE INDEX ON items USING ybdummyann (embedding vector_l2_ops);
13-
WARNING: ybdummyann is meant for internal-testing only and does not yield ordered results
12+
CREATE INDEX ON items USING ybhnsw (embedding vector_l2_ops);
1413
INSERT INTO items VALUES (1, '[1.0, 0.4, 0.3]');
1514
INSERT INTO items VALUES (2, '[0.001, 0.432, 0.32]');
1615
\d items
@@ -21,7 +20,7 @@ INSERT INTO items VALUES (2, '[0.001, 0.432, 0.32]');
2120
embedding | vector(3) | | |
2221
Indexes:
2322
"items_pkey" PRIMARY KEY, lsm (id HASH)
24-
"items_embedding_idx" ybdummyann (embedding)
23+
"items_embedding_idx" ybhnsw (embedding)
2524

2625
EXPLAIN (COSTS OFF) SELECT * FROM items ORDER BY embedding <-> '[1.0, 0.4, 0.3]' LIMIT 5;
2726
QUERY PLAN
@@ -39,10 +38,10 @@ SELECT * FROM items ORDER BY embedding <-> '[1.0, 0.4, 0.3]' LIMIT 5;
3938
(2 rows)
4039

4140
EXPLAIN (COSTS OFF) SELECT embedding FROM items ORDER BY embedding <-> '[1.0, 0.4, 0.3]' LIMIT 5;
42-
QUERY PLAN
43-
----------------------------------------------------------
41+
QUERY PLAN
42+
---------------------------------------------------------
4443
Limit
45-
-> Index Only Scan using items_embedding_idx on items
44+
-> Index Scan using items_embedding_idx on items
4645
Order By: (embedding <-> '[1,0.4,0.3]'::vector)
4746
(3 rows)
4847

@@ -289,8 +288,7 @@ ERROR: different vector dimensions 10 and 3
289288

290289
DROP INDEX items_embedding_idx;
291290
-- Dummy implementation, should only provide Exact ANN within a tablet.
292-
CREATE INDEX ON items USING ybdummyann (embedding vector_l2_ops);
293-
WARNING: ybdummyann is meant for internal-testing only and does not yield ordered results
291+
CREATE INDEX ON items USING ybhnsw (embedding vector_l2_ops);
294292
EXPLAIN (COSTS OFF) SELECT * FROM items ORDER BY embedding <-> '[1,1,1,1,1,1,1,1,1,1]';
295293
QUERY PLAN
296294
-------------------------------------------------------------

src/postgres/third-party-extensions/pgvector/test/sql/yb.orig.index.sql

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ SET yb_enable_docdb_vector_type = false;
33
CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3));
44
SET yb_enable_docdb_vector_type = true;
55
CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3)) SPLIT INTO 1 TABLETS;
6-
CREATE INDEX ON items USING ybdummyann (embedding vector_l2_ops);
6+
CREATE INDEX ON items USING ybhnsw (embedding vector_l2_ops);
77
INSERT INTO items VALUES (1, '[1.0, 0.4, 0.3]');
88
INSERT INTO items VALUES (2, '[0.001, 0.432, 0.32]');
99
\d items
@@ -92,7 +92,7 @@ SELECT * FROM items ORDER BY embedding <-> '[1.0, 0.4, 0.3]' LIMIT 5;
9292
DROP INDEX items_embedding_idx;
9393

9494
-- Dummy implementation, should only provide Exact ANN within a tablet.
95-
CREATE INDEX ON items USING ybdummyann (embedding vector_l2_ops);
95+
CREATE INDEX ON items USING ybhnsw (embedding vector_l2_ops);
9696
EXPLAIN (COSTS OFF) SELECT * FROM items ORDER BY embedding <-> '[1,1,1,1,1,1,1,1,1,1]';
9797
SELECT * FROM items ORDER BY embedding <-> '[1,1,1,1,1,1,1,1,1,1]';
9898

src/yb/common/common.proto

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ enum PgVectorDistanceType {
129129

130130
enum PgVectorIndexType {
131131
UNKNOWN_IDX = 0;
132-
DUMMY = 1;
132+
DEPRECATED_DUMMY = 1;
133133
IVFFLAT = 2;
134134
HNSW = 3;
135135
}

src/yb/docdb/doc_vector_index.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ auto GetVectorLSMFactory(
109109
switch (options.idx_type()) {
110110
case PgVectorIndexType::HNSW:
111111
return VectorLSMFactory<LSM>(block_cache, options, mem_tracker);
112-
case PgVectorIndexType::DUMMY: [[fallthrough]];
112+
case PgVectorIndexType::DEPRECATED_DUMMY: [[fallthrough]];
113113
case PgVectorIndexType::IVFFLAT: [[fallthrough]];
114114
case PgVectorIndexType::UNKNOWN_IDX:
115115
break;
@@ -123,7 +123,7 @@ std::string GetFileExtension(const PgVectorIdxOptionsPB& options) {
123123
switch (options.idx_type()) {
124124
case PgVectorIndexType::HNSW:
125125
return "." + boost::to_lower_copy(HnswBackend_Name(options.hnsw().backend()));
126-
case PgVectorIndexType::DUMMY: [[fallthrough]];
126+
case PgVectorIndexType::DEPRECATED_DUMMY: [[fallthrough]];
127127
case PgVectorIndexType::IVFFLAT: [[fallthrough]];
128128
case PgVectorIndexType::UNKNOWN_IDX:
129129
break;

src/yb/docdb/pgsql_operation.cc

Lines changed: 1 addition & 148 deletions
Original file line numberDiff line numberDiff line change
@@ -78,19 +78,13 @@
7878
#include "yb/util/trace.h"
7979
#include "yb/util/yb_pg_errcodes.h"
8080

81-
#include "yb/vector_index/vectorann.h"
81+
#include "yb/vector_index/vector_index_if.h"
8282

8383
#include "yb/yql/pggate/util/pg_doc_data.h"
8484
#include "yb/yql/pgwrapper/pg_wrapper.h"
8585

8686
using namespace std::literals;
8787

88-
using yb::vector_index::ANNPagingState;
89-
using yb::vector_index::DocKeyWithDistance;
90-
using yb::vector_index::IndexableVectorType;
91-
using yb::vector_index::DummyANNFactory;
92-
using yb::vector_index::VectorANN;
93-
9488
DECLARE_bool(ysql_disable_index_backfill);
9589

9690
DEPRECATE_FLAG(double, ysql_scan_timeout_multiplier, "10_2022");
@@ -2115,59 +2109,6 @@ class PgsqlReadRequestYbctidProvider {
21152109
std::optional<int64> current_order_;
21162110
};
21172111

2118-
template<IndexableVectorType Vector>
2119-
class ANNKeyProvider {
2120-
public:
2121-
explicit ANNKeyProvider(
2122-
const DocReadContext& doc_read_context, PgsqlResponsePB& response, size_t prefetch_size,
2123-
const Vector& query_vec, VectorANN<Vector>* ann, const ANNPagingState& paging_state)
2124-
: response_(response), prefetch_size_(prefetch_size), query_vec_(query_vec), ann_(ann) {
2125-
next_batch_paging_state_ = paging_state;
2126-
CHECK_GT(prefetch_size_, 0);
2127-
}
2128-
2129-
bool RefillBatch() {
2130-
CHECK(key_batch_.empty());
2131-
auto batch = ann_->GetTopKVectors(
2132-
query_vec_, prefetch_size_, next_batch_paging_state_.distance(),
2133-
next_batch_paging_state_.main_key(), false);
2134-
2135-
std::sort(batch.begin(), batch.end(), std::less<DocKeyWithDistance>());
2136-
key_batch_.insert(key_batch_.end(), batch.begin(), batch.end());
2137-
if (key_batch_.empty()) {
2138-
return false;
2139-
}
2140-
next_batch_paging_state_ =
2141-
ANNPagingState(key_batch_.back().distance_, key_batch_.back().dockey_);
2142-
return true;
2143-
}
2144-
2145-
Slice FetchKey() {
2146-
if (key_batch_.empty() && !RefillBatch()) {
2147-
return Slice();
2148-
}
2149-
2150-
auto ret = key_batch_.front();
2151-
current_entry_ = ret;
2152-
key_batch_.pop_front();
2153-
return ret.dockey_;
2154-
}
2155-
2156-
ANNPagingState GetNextBatchPagingState() const { return next_batch_paging_state_; }
2157-
2158-
void AddedKeyToResultSet() { response_.add_vector_index_distances(current_entry_->distance_); }
2159-
2160-
private:
2161-
PgsqlResponsePB& response_;
2162-
size_t prefetch_size_;
2163-
const FloatVector& query_vec_;
2164-
VectorANN<Vector>* ann_;
2165-
std::deque<DocKeyWithDistance> key_batch_;
2166-
ANNPagingState next_batch_paging_state_;
2167-
2168-
std::optional<DocKeyWithDistance> current_entry_;
2169-
};
2170-
21712112
Result<size_t> PgsqlReadOperation::Execute() {
21722113
// Verify that this request references no columns marked for deletion.
21732114
RETURN_NOT_OK(VerifyNoRefColsMarkedForDeletion(data_.doc_read_context.schema(), request_));
@@ -2205,9 +2146,6 @@ Result<size_t> PgsqlReadOperation::Execute() {
22052146
} else {
22062147
std::tie(fetched_rows, has_paging_state) = VERIFY_RESULT(ExecuteSample());
22072148
}
2208-
} else if (request_.has_vector_idx_options()) {
2209-
std::tie(fetched_rows, has_paging_state) = VERIFY_RESULT(ExecuteVectorSearch(
2210-
data_.doc_read_context, request_.vector_idx_options()));
22112149
} else if (request_.index_request().has_vector_idx_options()) {
22122150
fetched_rows = VERIFY_RESULT(ExecuteVectorLSMSearch(
22132151
request_.index_request().vector_idx_options()));
@@ -2680,91 +2618,6 @@ Result<std::tuple<size_t, bool>> PgsqlReadOperation::ExecuteSampleBlockBased() {
26802618
return std::tuple<size_t, bool>{fetched_items, false};
26812619
}
26822620

2683-
Result<std::tuple<size_t, bool>> PgsqlReadOperation::ExecuteVectorSearch(
2684-
const DocReadContext& doc_read_context, const PgVectorReadOptionsPB& options) {
2685-
// Build the vectorann and then make an index_doc_read_context on the vectorann
2686-
// to get the index iterator. Then do ExecuteBatchKeys on the index iterator.
2687-
RSTATUS_DCHECK(options.has_vector(), InvalidArgument, "Query vector not provided");
2688-
2689-
auto query_vec = options.vector().binary_value();
2690-
2691-
auto ysql_query_vec = pointer_cast<const vector_index::YSQLVector*>(query_vec.data());
2692-
2693-
auto query_vec_ref = VERIFY_RESULT(
2694-
VectorANN<FloatVector>::GetVectorFromYSQLWire(*ysql_query_vec, query_vec.size()));
2695-
2696-
DummyANNFactory<FloatVector> ann_factory;
2697-
auto ann_store = ann_factory.Create(ysql_query_vec->dim);
2698-
2699-
dockv::ReaderProjection index_doc_projection;
2700-
2701-
auto key_col_id = doc_read_context.schema().column_id(0);
2702-
2703-
// Building the schema to extract the vector and key from the main DocDB store.
2704-
// Vector should be the first value after the key.
2705-
auto vector_col_id =
2706-
doc_read_context.schema().column_id(doc_read_context.schema().num_key_columns());
2707-
index_doc_projection.Init(doc_read_context.schema(), {key_col_id, vector_col_id});
2708-
2709-
FilteringIterator table_iter(&table_iter_);
2710-
RETURN_NOT_OK(table_iter.Init(data_, request_, index_doc_projection, doc_read_context));
2711-
dockv::PgTableRow row(index_doc_projection);
2712-
const auto& table_id = request_.table_id();
2713-
2714-
// Build the VectorANN.
2715-
for (;;) {
2716-
const auto fetch_result =
2717-
VERIFY_RESULT(FetchTableRow(table_id, &table_iter, nullptr /* index */, &row));
2718-
// If changing this code, see also PgsqlReadOperation::ExecuteBatchKeys.
2719-
if (fetch_result == FetchResult::NotFound) {
2720-
break;
2721-
}
2722-
++scanned_table_rows_;
2723-
if (fetch_result == FetchResult::Found) {
2724-
auto vec_value = row.GetValueByColumnId(vector_col_id);
2725-
if (!vec_value.has_value()) {
2726-
continue;
2727-
}
2728-
2729-
// Add the vector to the ANN store
2730-
auto encoded = dockv::EncodedDocVectorValue::FromSlice(vec_value->binary_value());
2731-
auto vec = VERIFY_RESULT(VectorANN<FloatVector>::GetVectorFromYSQLWire(encoded.data));
2732-
auto doc_iter = down_cast<DocRowwiseIterator*>(table_iter_.get());
2733-
ann_store->Add(VERIFY_RESULT(encoded.DecodeId()), std::move(vec), doc_iter->GetTupleId());
2734-
}
2735-
}
2736-
2737-
// Check for paging state.
2738-
ANNPagingState ann_paging_state;
2739-
if (request_.has_paging_state()) {
2740-
ann_paging_state =
2741-
ANNPagingState{request_.paging_state().distance(), request_.paging_state().main_key()};
2742-
}
2743-
2744-
// All rows have been added to the ANN store, now we can create the iterator.
2745-
auto initial_prefetch_size = request_.vector_idx_options().prefetch_size();
2746-
initial_prefetch_size = std::max(initial_prefetch_size, 25);
2747-
2748-
ANNKeyProvider key_provider(
2749-
doc_read_context, response_, initial_prefetch_size, query_vec_ref, ann_store.get(),
2750-
ann_paging_state);
2751-
auto fetched_rows = VERIFY_RESULT(ExecuteBatchKeys(key_provider));
2752-
2753-
auto next_paging_state = key_provider.GetNextBatchPagingState();
2754-
2755-
// Set paging state.
2756-
bool has_paging_state = !next_paging_state.valid();
2757-
if (has_paging_state) {
2758-
auto* paging_state = response_.mutable_paging_state();
2759-
paging_state->set_distance(next_paging_state.distance());
2760-
paging_state->set_main_key(next_paging_state.main_key().ToBuffer());
2761-
2762-
BindReadTimeToPagingState(data_.read_operation_data.read_time);
2763-
}
2764-
2765-
return std::tuple<size_t, bool>{fetched_rows, has_paging_state};
2766-
}
2767-
27682621
Result<size_t> PgsqlReadOperation::ExecuteVectorLSMSearch(const PgVectorReadOptionsPB& options) {
27692622
RSTATUS_DCHECK(
27702623
data_.vector_index, IllegalState, "Search vector when vector index is null: $0", request_);

src/yb/docdb/pgsql_operation.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -212,10 +212,6 @@ class PgsqlReadOperation : public DocExprExecutor {
212212
// Execute a READ operator for a given scalar argument.
213213
Result<std::tuple<size_t, bool>> ExecuteScalar();
214214

215-
// Execute a READ operator for a given vector search.
216-
Result<std::tuple<size_t, bool>> ExecuteVectorSearch(
217-
const DocReadContext& doc_read_context, const PgVectorReadOptionsPB& options);
218-
219215
// Execute a READ operator for a given batch of keys.
220216
template <class KeyProvider>
221217
Result<size_t> ExecuteBatchKeys(KeyProvider& key_provider);

src/yb/tablet/tablet_metadata.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -463,7 +463,7 @@ TableInfoPtr TableInfo::TEST_CreateWithLogPrefix(
463463

464464
bool TableInfo::NeedVectorIndex() const {
465465
return index_info && index_info->is_vector_index() &&
466-
index_info->vector_idx_options().idx_type() != PgVectorIndexType::DUMMY;
466+
index_info->vector_idx_options().idx_type() != PgVectorIndexType::DEPRECATED_DUMMY;
467467
}
468468

469469
Status KvStoreInfo::LoadTablesFromPB(

src/yb/vector_index/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@ set(VECTOR_INDEX_SRCS
3434
vector_id.cc
3535
vector_lsm.cc
3636
vector_lsm_metadata.cc
37-
vectorann.cc
3837
)
3938

4039
set(VECTOR_INDEX_LIBS

0 commit comments

Comments
 (0)