Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions llama-index-core/llama_index/core/vector_stores/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@ class VectorStoreQueryMode(str, Enum):
TEXT_SEARCH = "text_search"
SEMANTIC_HYBRID = "semantic_hybrid"

# NOTE: currently only used by postgres filters search
FILTERS = "filters"

# fit learners
SVM = "svm"
LOGISTIC_REGRESSION = "logistic_regression"
Expand Down Expand Up @@ -257,6 +260,8 @@ class VectorStoreQuery:
sparse_top_k: Optional[int] = None
# NOTE: return top k results from hybrid search. similarity_top_k is used for dense search top k
hybrid_top_k: Optional[int] = None
# NOTE: currently only used by postgres filters search
filters_top_k: Optional[int] = None


@runtime_checkable
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -712,6 +712,28 @@ async def _aquery_with_score(
for item in res.all()
]

def _build_filters_query(
    self,
    limit: int,
    metadata_filters: Optional[MetadataFilters] = None,
) -> Any:
    """Build the SELECT statement for a metadata-filters-only search.

    Args:
        limit: Maximum number of rows, forwarded to
            ``_apply_filters_and_limit``.
        metadata_filters: Optional metadata filters, forwarded to
            ``_apply_filters_and_limit``.

    Returns:
        The statement returned by ``_apply_filters_and_limit``. It selects
        ``id``, ``node_id``, ``text``, ``metadata_`` and a constant ``rank``
        column.
    """
    from sqlalchemy import Float, literal, select

    # A filters-only search has no query to score against, so every match
    # gets the same numeric rank. Previously the raw ``text_search_tsv``
    # column was labelled "rank", which put a tsvector (not a number) into
    # the value consumers read as the similarity score, and made the
    # "rank desc" ordering meaningless.
    constant_rank = literal(1.0, type_=Float).label("rank")

    stmt = select(  # type: ignore
        self._table_class.id,
        self._table_class.node_id,
        self._table_class.text,
        self._table_class.metadata_,
        constant_rank,
    ).order_by(
        # Order by primary key so the limited result set is deterministic.
        self._table_class.id
    )

    return self._apply_filters_and_limit(  # type: ignore
        stmt, limit, metadata_filters
    )

def _build_sparse_query(
self,
query_str: Optional[str],
Expand Down Expand Up @@ -799,6 +821,24 @@ def _sparse_query_with_rank(
for item in res.all()
]

def _query_filters_only(
    self,
    limit: int = 10,
    metadata_filters: Optional[MetadataFilters] = None,
) -> List[DBEmbeddingRow]:
    """Run the filters-only statement and wrap each row as a DBEmbeddingRow.

    Args:
        limit: Maximum number of rows to request (defaults to 10).
        metadata_filters: Optional metadata filters passed through to
            ``_build_filters_query``.

    Returns:
        One ``DBEmbeddingRow`` per returned row. ``similarity`` is taken from
        the statement's ``rank`` column.
    """
    filters_stmt = self._build_filters_query(limit, metadata_filters)

    matched_rows: List[DBEmbeddingRow] = []
    with self._session() as session, session.begin():
        # Materialise all rows while the session/transaction is still open.
        for record in session.execute(filters_stmt).all():
            matched_rows.append(
                DBEmbeddingRow(
                    node_id=record.node_id,
                    text=record.text,
                    metadata=record.metadata_,
                    similarity=record.rank,
                )
            )
    return matched_rows

async def _async_hybrid_query(
self, query: VectorStoreQuery, **kwargs: Any
) -> List[DBEmbeddingRow]:
Expand Down Expand Up @@ -880,6 +920,9 @@ async def aquery(
self._initialize()
if query.mode == VectorStoreQueryMode.HYBRID:
results = await self._async_hybrid_query(query, **kwargs)
elif query.mode == VectorStoreQueryMode.FILTERS:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I kind of disagree with this change. We are adding a new enum value that is only used by a single vector store. The user experience with these modes is already not great across vector stores; this makes it even worse.

I would prefer that we implement the get_nodes() / aget_nodes() functions, which allow you to pass in either node ids or filters to get a list of nodes. This method is on the base class, but has only been implemented in a handful of vector stores so far (there is also delete_nodes() / adelete_nodes() and clear() / aclear()).

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh nice, get nodes is already implemented, just checked

Copy link
Collaborator

@logan-markewich logan-markewich Jan 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you want to handle empty queries, that doesn't need to be its own mode. We can adjust the default query mode to work with or without a query string

Or, you can just use the get_nodes method described above

Copy link
Author

@vlreinier vlreinier Jan 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Understandable. The get_nodes solution seems to be sufficient. However, I still think the new mode adds some value because of the way llama-index builds the table columns: for each column a query mode is available, except for metadata.

filters_top_k = query.filters_top_k or query.similarity_top_k
results = self._query_filters_only(filters_top_k, query.filters)
elif query.mode in [
VectorStoreQueryMode.SPARSE,
VectorStoreQueryMode.TEXT_SEARCH,
Expand All @@ -904,6 +947,9 @@ def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResul
self._initialize()
if query.mode == VectorStoreQueryMode.HYBRID:
results = self._hybrid_query(query, **kwargs)
elif query.mode == VectorStoreQueryMode.FILTERS:
filters_top_k = query.filters_top_k or query.similarity_top_k
results = self._query_filters_only(filters_top_k, query.filters)
elif query.mode in [
VectorStoreQueryMode.SPARSE,
VectorStoreQueryMode.TEXT_SEARCH,
Expand Down