Skip to content

vector index: poor recall after many deletions #67

@wey-gu

Description

@wey-gu

Ladybug version

v0.12.0

What operating system are you using?

macOS 15

What happened?

Issue: if all records covered by a vector index are deleted, the index is left broken and every query against it returns an empty result from then on.

It seems @1amageek (THANKS!!) attempted a fix in kuzudb/kuzu#6046. I cherry-picked it onto 0.11.1 (I am using 0.11.1, and I confirmed that Ladybug 0.12.0 has this bug too), but the issue is still not fixed.

Are there known steps to reproduce?

Reproduction:

# pip install sentence-transformers
import real_ladybug as lb
import os

# Cypher statements used more than once. The text (including leading
# whitespace) is kept exactly as it is sent to the database.
INSERT_BOOK_STATEMENT = """
        CREATE (b:Book {
            title: $title,
            title_embedding: $embeddings,
            published_year: $year
        });"""

VECTOR_SEARCH_STATEMENT = """
    CALL QUERY_VECTOR_INDEX(
        'Book',
        'book_title_index',
        $query_vector,
        $limit,
        efs := 500
    )
    RETURN node.title
    ORDER BY distance;
    """


def insert_books(connection, encoder, book_titles, years):
    """Create one Book node per (title, year) pair and report each insert.

    The title is embedded with ``encoder.encode`` and stored alongside it.
    ``zip`` stops at the shorter input, so extra years are ignored.
    """
    for book_title, year in zip(book_titles, years):
        vector = encoder.encode(book_title).tolist()
        params = {"title": book_title, "year": year, "embeddings": vector}
        connection.execute(INSERT_BOOK_STATEMENT, params)

        print(f"Inserted book: {book_title}")


def run_vector_search(connection, vector, limit):
    """Query 'book_title_index' with ``vector`` and print the result frame."""
    print("query QUERY_VECTOR_INDEX")
    params = {"query_vector": vector, "limit": limit}
    outcome = connection.execute(VECTOR_SEARCH_STATEMENT, params)
    print("query result:\n")
    print(outcome.get_as_df())


db = lb.Database("example.lbug")
conn = lb.Connection(db)

from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")

conn.execute("INSTALL vector; LOAD vector;")

# Schema: Book nodes carry a 384-float embedding column that gets indexed.
schema_statements = (
    "CREATE NODE TABLE Book(id SERIAL PRIMARY KEY, title STRING, title_embedding FLOAT[384], published_year INT64);",
    "CREATE NODE TABLE Publisher(name STRING PRIMARY KEY);",
    "CREATE REL TABLE PublishedBy(FROM Book TO Publisher);",
)
for statement in schema_statements:
    conn.execute(statement)

titles = [
    "The Quantum World",
    "Chronicles of the Universe",
]
# NOTE: `publishers` is not used anywhere below in this script.
publishers = ["Harvard University Press", "Independent Publisher", "Pearson", "McGraw-Hill Ryerson", "O'Reilly"]
# Only the first two years are consumed, because `titles` has two entries.
published_years = [2004, 2022, 2019, 2010, 2015]

# Step 1: populate the table, then build the vector index on top of it.
insert_books(conn, model, titles, published_years)

conn.execute("""
    CALL CREATE_VECTOR_INDEX(
        'Book',
        'book_title_index',
        'title_embedding',
        metric := 'l2'
    );
    """)

query_vector = model.encode("quantum machine learning").tolist()

# Step 2: query the index while the original rows are still present.
run_vector_search(conn, query_vector, 2)

# Step 3: delete every Book node.
conn.execute("MATCH (b:Book) DETACH DELETE b;")
print("Deleted all Book nodes and their relationships\n\n")

# Step 4: insert the same books again and repeat the identical query.
insert_books(conn, model, titles, published_years)

# Reported bug: the reporter observes that this result is empty from now on.
run_vector_search(conn, query_vector, 2)

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug: Something isn't working

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions