Add query validation to indexes and validate EF_RUNTIME

abrookins · abrookins · commit 493172fce7d5 · 2025-04-15T13:36:25.000-07:00
diff --git a/redisvl/exceptions.py b/redisvl/exceptions.py
@@ -30,3 +30,9 @@ def __init__(self, message, index=None):
         if index is not None:
             message = f"Validation failed for object at index {index}: {message}"
         super().__init__(message)
+
+
+class QueryValidationError(RedisVLError):
+    """Error raised when a query fails validation against an index."""
+
+    pass
diff --git a/redisvl/index/index.py b/redisvl/index/index.py
@@ -18,6 +18,7 @@
     Union,
 )
 
+from redisvl.query.query import VectorQuery
 from redisvl.redis.utils import convert_bytes, make_dict
 from redisvl.utils.utils import deprecated_argument, deprecated_function, sync_wrapper
 
@@ -34,6 +35,7 @@
 from redis.commands.search.indexDefinition import IndexDefinition
 
 from redisvl.exceptions import (
+    QueryValidationError,
     RedisModuleVersionError,
     RedisSearchError,
     RedisVLError,
@@ -835,8 +837,21 @@ def batch_query(
             all_parsed.append(parsed)
         return all_parsed
 
+    def _validate_query(self, query: BaseQuery) -> None:
+        """Raise QueryValidationError if EF_RUNTIME is set for a non-HNSW vector field."""
+        if isinstance(query, VectorQuery):
+            algo = self.schema.fields[query._vector_field_name].attrs.algorithm  # type: ignore
+            # Compare case-insensitively; the algorithm may be a plain str or an enum.
+            if query.ef_runtime and str(getattr(algo, "value", algo)).lower() != "hnsw":
+                msg = "EF_RUNTIME is only supported for HNSW vector fields."
+                raise QueryValidationError(msg)
+
     def _query(self, query: BaseQuery) -> List[Dict[str, Any]]:
         """Execute a query and process results."""
+        try:  # validate before self.search() is called
+            self._validate_query(query)
+        except QueryValidationError as e:  # prefix the message, keep the cause
+            raise QueryValidationError(f"Invalid query: {str(e)}") from e
         results = self.search(query.query, query_params=query.params)
         return process_results(results, query=query, schema=self.schema)
 
@@ -1527,8 +1542,21 @@ async def batch_query(
 
         return all_parsed
 
+    def _validate_query(self, query: BaseQuery) -> None:
+        """Raise QueryValidationError if EF_RUNTIME is set for a non-HNSW vector field."""
+        if isinstance(query, VectorQuery):
+            algo = self.schema.fields[query._vector_field_name].attrs.algorithm  # type: ignore
+            # Compare case-insensitively; the algorithm may be a plain str or an enum.
+            if query.ef_runtime and str(getattr(algo, "value", algo)).lower() != "hnsw":
+                msg = "EF_RUNTIME is only supported for HNSW vector fields."
+                raise QueryValidationError(msg)
+
     async def _query(self, query: BaseQuery) -> List[Dict[str, Any]]:
         """Asynchronously execute a query and process results."""
+        try:  # validate before self.search() is awaited
+            self._validate_query(query)
+        except QueryValidationError as e:  # prefix the message, keep the cause
+            raise QueryValidationError(f"Invalid query: {str(e)}") from e
         results = await self.search(query.query, query_params=query.params)
         return process_results(results, query=query, schema=self.schema)
 
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -4,7 +4,9 @@
 import pytest
 from testcontainers.compose import DockerCompose
 
+from redisvl.index.index import AsyncSearchIndex, SearchIndex
 from redisvl.redis.connection import RedisConnectionFactory
+from redisvl.redis.utils import array_to_buffer
 from redisvl.utils.vectorize import HFTextVectorizer
 
 
@@ -191,3 +193,111 @@ def pytest_collection_modifyitems(
     for item in items:
         if item.get_closest_marker("requires_api_keys"):
             item.add_marker(skip_api)
+
+
+@pytest.fixture
+def flat_index(sample_data, redis_url):
+    """
+    A fixture that uses the "flat" algorithm for its vector field.
+    """
+    # construct a search index from the schema
+    index = SearchIndex.from_dict(
+        {
+            "index": {
+                "name": "user_index",
+                "prefix": "v1",
+                "storage_type": "hash",
+            },
+            "fields": [
+                {"name": "description", "type": "text"},
+                {"name": "credit_score", "type": "tag"},
+                {"name": "job", "type": "text"},
+                {"name": "age", "type": "numeric"},
+                {"name": "last_updated", "type": "numeric"},
+                {"name": "location", "type": "geo"},
+                {
+                    "name": "user_embedding",
+                    "type": "vector",
+                    "attrs": {
+                        "dims": 3,
+                        "distance_metric": "cosine",
+                        "algorithm": "flat",
+                        "datatype": "float32",
+                    },
+                },
+            ],
+        },
+        redis_url=redis_url,
+    )
+
+    # create the index (no data yet)
+    index.create(overwrite=True)
+
+    # Prepare and load the data
+    def hash_preprocess(item: dict) -> dict:
+        return {
+            **item,
+            "user_embedding": array_to_buffer(item["user_embedding"], "float32"),
+        }
+
+    index.load(sample_data, preprocess=hash_preprocess)
+
+    # run the test
+    yield index
+
+    # clean up
+    index.delete(drop=True)
+
+
+@pytest.fixture
+async def async_flat_index(sample_data, redis_url):
+    """
+    An async fixture that uses the "flat" algorithm for its vector field.
+    """
+    # construct a search index from the schema
+    index = AsyncSearchIndex.from_dict(
+        {
+            "index": {
+                "name": "user_index",
+                "prefix": "v1",
+                "storage_type": "hash",
+            },
+            "fields": [
+                {"name": "description", "type": "text"},
+                {"name": "credit_score", "type": "tag"},
+                {"name": "job", "type": "text"},
+                {"name": "age", "type": "numeric"},
+                {"name": "last_updated", "type": "numeric"},
+                {"name": "location", "type": "geo"},
+                {
+                    "name": "user_embedding",
+                    "type": "vector",
+                    "attrs": {
+                        "dims": 3,
+                        "distance_metric": "cosine",
+                        "algorithm": "flat",
+                        "datatype": "float32",
+                    },
+                },
+            ],
+        },
+        redis_url=redis_url,
+    )
+
+    # create the index (no data yet)
+    await index.create(overwrite=True)
+
+    # Prepare and load the data
+    def hash_preprocess(item: dict) -> dict:
+        return {
+            **item,
+            "user_embedding": array_to_buffer(item["user_embedding"], "float32"),
+        }
+
+    await index.load(sample_data, preprocess=hash_preprocess)
+
+    # run the test
+    yield index
+
+    # clean up
+    await index.delete(drop=True)
diff --git a/tests/integration/test_async_search_index.py b/tests/integration/test_async_search_index.py
@@ -5,7 +5,12 @@
 from redis import Redis as SyncRedis
 from redis.asyncio import Redis as AsyncRedis
 
-from redisvl.exceptions import RedisModuleVersionError, RedisSearchError, RedisVLError
+from redisvl.exceptions import (
+    QueryValidationError,
+    RedisModuleVersionError,
+    RedisSearchError,
+    RedisVLError,
+)
 from redisvl.index import AsyncSearchIndex
 from redisvl.query import VectorQuery
 from redisvl.query.query import FilterQuery
@@ -614,3 +619,16 @@ async def test_async_search_index_expire_keys(async_index):
         ttl = await client.ttl(key)
         assert ttl > 0
         assert ttl <= 30
+
+
+@pytest.mark.asyncio
+async def test_search_index_validates_query(async_flat_index, sample_data):
+    query = VectorQuery(
+        [0.1, 0.1, 0.5],
+        "user_embedding",
+        return_fields=["user", "credit_score", "age", "job", "location"],
+        num_results=7,
+        ef_runtime=100,  # set on a query against the fixture's "flat" vector field
+    )
+    with pytest.raises(QueryValidationError):
+        await async_flat_index.query(query)
diff --git a/tests/integration/test_query.py b/tests/integration/test_query.py
@@ -3,6 +3,7 @@
 import pytest
 from redis.commands.search.result import Result
 
+from redisvl.exceptions import QueryValidationError
 from redisvl.index import SearchIndex
 from redisvl.query import (
     CountQuery,
@@ -898,3 +899,23 @@ def test_vector_query_with_ef_runtime(index, vector_query, sample_data):
     assert len(results) > 0
     for result in results:
         assert "vector_distance" in result
+
+
+def test_vector_query_with_ef_runtime_flat_index(flat_index, vector_query, sample_data):
+    """
+    Integration test: Verify that the index rejects EF_RUNTIME on a query if
+    the algo is "flat." EF_RUNTIME is only valid with the "hnsw" algorithm.
+    """
+    vector_query.set_ef_runtime(100)
+
+    # The vector query does not know if the index field supports EF_RUNTIME,
+    # so it should include this param in the query string if asked.
+    query_string = str(vector_query)
+    assert (
+        f"{vector_query.__class__.EF_RUNTIME} ${vector_query.__class__.EF_RUNTIME_PARAM}"
+        in query_string
+    ), "EF_RUNTIME should be in query string"
+
+    # However, the index should raise an error if EF_RUNTIME is set on a flat index.
+    with pytest.raises(QueryValidationError):
+        flat_index.query(vector_query)
diff --git a/tests/integration/test_search_index.py b/tests/integration/test_search_index.py
@@ -4,7 +4,12 @@
 import pytest
 from redis import Redis
 
-from redisvl.exceptions import RedisModuleVersionError, RedisSearchError, RedisVLError
+from redisvl.exceptions import (
+    QueryValidationError,
+    RedisModuleVersionError,
+    RedisSearchError,
+    RedisVLError,
+)
 from redisvl.index import SearchIndex
 from redisvl.query import VectorQuery
 from redisvl.query.query import FilterQuery
@@ -556,3 +561,15 @@ def test_search_index_expire_keys(index):
         ttl = index.client.ttl(key)
         assert ttl > 0
         assert ttl <= 30
+
+
+def test_search_index_validates_query(flat_index, sample_data):
+    query = VectorQuery(
+        [0.1, 0.1, 0.5],
+        "user_embedding",
+        return_fields=["user", "credit_score", "age", "job", "location"],
+        num_results=7,
+        ef_runtime=100,  # set on a query against the fixture's "flat" vector field
+    )
+    with pytest.raises(QueryValidationError):
+        flat_index.query(query)