Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
MetadataFilter,
MetadataFilters,
VectorStoreQuery,
VectorStoreQueryMode,
VectorStoreQueryResult,
)
from llama_index.core.vector_stores.utils import (
Expand Down Expand Up @@ -284,6 +285,13 @@ def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResul
filter_expression=metadata_filters_to_sql(query.filters),
**kwargs,
)
if query.mode == VectorStoreQueryMode.HYBRID:
text_results = self._full_text_search(
query_str=query.query_str,
**kwargs,
)
results = self._dedup_results(results + text_results)

nodes = []
similarities = []
ids = []
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -714,3 +714,105 @@ def _get_by_ids(self, ids: Sequence[str], /) -> list[dict[str, Any]]:
for result in resultset
]
return documents

def _full_text_search(
    self,
    query_str: str,
    k: int = 4,
    language: str = "english",
    **kwargs: Any,
) -> list[tuple[dict, float, None]]:
    """Run a Postgres full-text search using plainto_tsquery and return ranked results.

    Args:
        query_str: The free-text query string to search for.
        k: Maximum number of results to return.
        language: The text search configuration/language to use (e.g. 'english').
        **kwargs: Reserved for future options; currently ignored.

    Returns:
        List of tuples (document_dict, rank, None). Document dict contains id, content, and metadata.
    """
    with (
        self.connection_pool.connection() as conn,
        conn.cursor(row_factory=dict_row) as cursor,
    ):
        # Normalize metadata column(s): self.metadata_column may be a str,
        # a list of str, or a list whose entries are (name, ...) tuples.
        metadata_columns: list[str]
        if isinstance(self.metadata_column, list):
            metadata_columns = [
                col if isinstance(col, str) else col[0]
                for col in self.metadata_column
            ]
        elif isinstance(self.metadata_column, str):
            metadata_columns = [self.metadata_column]
        else:
            metadata_columns = []

        # BUG FIX: the metadata column(s) must be part of the SELECT list.
        # Previously only the id and content columns were selected, so the
        # row-dict lookups below (row[metadata_col]) raised KeyError whenever
        # a metadata column was configured.
        select_cols = sql.SQL(", ").join(
            sql.Identifier(col)
            for col in [self.id_column, self.content_column, *metadata_columns]
        )

        sql_query = sql.SQL(
            """
            SELECT {select_cols},
                   rank() OVER (
                       ORDER BY ts_rank_cd(
                           to_tsvector({lang}, {content_col}),
                           plainto_tsquery({lang}, %(q)s)
                       ) DESC
                   ) AS rank
            FROM {table}
            WHERE plainto_tsquery({lang}, %(q)s) @@ to_tsvector({lang}, {content_col})
            ORDER BY rank
            LIMIT %(top_k)s
            """
        ).format(
            select_cols=select_cols,
            content_col=sql.Identifier(self.content_column),
            lang=sql.Literal(language),
            table=sql.Identifier(self.schema_name, self.table_name),
        )

        cursor.execute(sql_query, {"q": query_str, "top_k": k})
        rows = cursor.fetchall()

    results: list[tuple[dict, float, None]] = []
    for row in rows:
        doc = {
            "id": row[self.id_column],
            "content": row[self.content_column],
            "metadata": (
                row[metadata_columns[0]]
                if isinstance(self.metadata_column, str)
                else {col: row[col] for col in metadata_columns}
            ),
        }
        # `rank` is the 1-based position in the ts_rank_cd ordering; guard
        # against a missing/NULL value just in case.
        rank_val = float(row["rank"]) if row.get("rank") is not None else 0.0
        results.append((doc, rank_val, None))

    return results

def _dedup_results(
self, results: list[tuple[dict, float, Any]]
) -> list[tuple[dict, float, Any]]:
"""Deduplicate search results by document id, preserving order.

Accepts a list of tuples (document_dict, score, optional_embedding) where
document_dict contains at least the id column (self.id_column) or 'id'.
Returns a filtered list keeping the first occurrence of each id.
"""
seen_ids: set = set()
deduped: list[tuple[dict, float, Any]] = []
for doc, score, emb in results:
# robustly get id value using configured id_column or fallback to 'id'
doc_id = doc.get(self.id_column) if isinstance(doc, dict) else None
if doc_id is None:
doc_id = doc.get("id") if isinstance(doc, dict) else None

# If there's no id, treat the row as unique and keep it
if doc_id is None:
deduped.append((doc, score, emb))
continue

if doc_id not in seen_ids:
deduped.append((doc, score, emb))
seen_ids.add(doc_id)

return deduped
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from llama_index.core.vector_stores.types import (
MetadataFilters,
VectorStoreQuery,
VectorStoreQueryMode,
)
from llama_index.vector_stores.azure_postgres import AzurePGVectorStore
from llama_index.vector_stores.azure_postgres.common import DiskANN
Expand Down Expand Up @@ -316,16 +317,18 @@ def test_clear(
assert not remaining_set, "All document IDs should have been deleted"

@pytest.mark.parametrize(
["query", "embedding", "k", "filters"],
["query", "embedding", "k", "filters", "mode"],
[
("query about cats", [0.99] * 1536, 2, None),
("query about animals", [0.5] * 1536, 3, None),
("query about cats", [0.99] * 1536, 2, "filter1"),
("query about cats", [0.99] * 1536, 2, "filter2"),
("query about cats", [0.99] * 1536, 2, None, None),
("query about cats", [0.99] * 1536, 2, None, "hybrid"),
("query about animals", [0.5] * 1536, 3, None, None),
("query about cats", [0.99] * 1536, 2, "filter1", None),
("query about cats", [0.99] * 1536, 2, "filter2", None),
],
indirect=["filters"],
ids=[
"search-cats",
"search-cats-hybrid",
"search-animals",
"search-cats-filtered",
"search-cats-multifiltered",
Expand All @@ -338,6 +341,7 @@ def test_query(
embedding: list[float],
k: int,
filters: MetadataFilters | None,
mode: str | None,
):
"""Run a similarity query and assert returned documents match expectations.

Expand All @@ -350,6 +354,11 @@ def test_query(
query_embedding=embedding,
similarity_top_k=k,
filters=filters,
mode=(
VectorStoreQueryMode.HYBRID
if mode == "hybrid"
else VectorStoreQueryMode.DEFAULT
),
)
results = vectorstore.query(query=vsquery)

Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading