adds word weights to HybridQuery class

justin-cechmanek · justin-cechmanek · commit 299566dec05f · 2025-10-21T15:24:41.000-07:00
diff --git a/redisvl/query/aggregate.py b/redisvl/query/aggregate.py
@@ -94,6 +94,7 @@ def __init__(
         return_fields: Optional[List[str]] = None,
         stopwords: Optional[Union[str, Set[str]]] = "english",
         dialect: int = 2,
+        text_weights: Optional[Dict[str, float]] = None,
     ):
         """
         Instantiates a HybridQuery object.
@@ -119,7 +120,9 @@ def __init__(
                 set, or tuple of strings is provided then those will be used as stopwords.
                 Defaults to "english". if set to "None" then no stopwords will be removed.
             dialect (int, optional): The Redis dialect version. Defaults to 2.
-
+            text_weights (Optional[Dict[str, float]]): The importance weighting of individual words
+                within the query text. Defaults to None, in which case no modifications are made
+                to the text_scorer score.
         Raises:
             ValueError: If the text string is empty, or if the text string becomes empty after
                 stopwords are removed.
@@ -138,6 +141,7 @@ def __init__(
         self._dtype = dtype
         self._num_results = num_results
         self._set_stopwords(stopwords)
+        self._text_weights = self._parse_text_weights(text_weights)
 
         query_string = self._build_query_string()
         super().__init__(query_string)
@@ -225,13 +229,60 @@ def _tokenize_and_escape_query(self, user_query: str) -> str:
             )
             for token in user_query.split()
         ]
-        tokenized = " | ".join(
-            [token for token in tokens if token and token not in self._stopwords]
-        )
+        ##tokenized = " | ".join(
+        ##    [token for token in tokens if token and token not in self._stopwords]
+        ##)
 
-        if not tokenized:
+        token_list = [
+            token for token in tokens if token and token not in self._stopwords
+        ]
+        for i, token in enumerate(token_list):
+            if token in self._text_weights:
+                token_list[i] = f"{token}=>{{weight:{self._text_weights[token]}}}"
+
+        if not token_list:
             raise ValueError("text string cannot be empty after removing stopwords")
-        return tokenized
+        return " | ".join(token_list)
+
+    def _parse_text_weights(
+        self, weights: Optional[Dict[str, float]]
+    ) -> Dict[str, float]:
+        parsed_weights: Dict[str, float] = {}
+        if not weights:
+            return parsed_weights
+        for word, weight in weights.items():
+            word = word.strip().lower()
+            if not word or " " in word:
+                raise ValueError(
+                    f"Only individual words may be weighted. Got {{ {word}:{weight} }}"
+                )
+            if (
+                not (isinstance(weight, float) or isinstance(weight, int))
+                or weight < 0.0
+            ):
+                raise ValueError(
+                    f"Weights must be positive number. Got {{ {word}:{weight} }}"
+                )
+            parsed_weights[word] = weight
+        return parsed_weights
+
+    def set_text_weights(self, weights: Dict[str, float]):
+        """Set or update the text weights for the query.
+
+        Args:
+            weights: Dictionary of word:weight mappings
+        """
+        self._text_weights = self._parse_text_weights(weights)
+        self._built_query_string = None
+
+    @property
+    def text_weights(self) -> Dict[str, float]:
+        """Get the text weights.
+
+        Returns:
+            Dictionary of word:weight mappings.
+        """
+        return self._text_weights
 
     def _build_query_string(self) -> str:
         """Build the full query string for text search with optional filtering."""
diff --git a/tests/unit/test_aggregation_types.py b/tests/unit/test_aggregation_types.py
@@ -196,6 +196,83 @@ def test_hybrid_query_with_string_filter():
     assert "AND" not in query_string_wildcard
 
 
+def test_hybrid_query_text_weights():
+    # verify word weights get added into the raw Redis query syntax
+    vector = [0.1, 0.1, 0.5]
+    vector_field = "user_embedding"
+
+    query = HybridQuery(
+        text="query string alpha bravo delta tango alpha",
+        text_field_name="description",
+        vector=vector,
+        vector_field_name=vector_field,
+        text_weights={"alpha": 2, "delta": 0.555, "gamma": 0.95},
+    )
+
+    assert (
+        str(query)
+        == "(~@description:(query | string | alpha=>{weight:2} | bravo | delta=>{weight:0.555} | tango | alpha=>{weight:2}))=>[KNN 10 @user_embedding $vector AS vector_distance] SCORER BM25STD ADDSCORES DIALECT 2 APPLY (2 - @vector_distance)/2 AS vector_similarity APPLY @__score AS text_score APPLY 0.30000000000000004*@text_score + 0.7*@vector_similarity AS hybrid_score SORTBY 2 @hybrid_score DESC MAX 10"
+    )
+
+    # raise an error if weights are negative or not numeric
+    with pytest.raises(ValueError):
+        _ = HybridQuery(
+            text="sample text query",
+            text_field_name="description",
+            vector=vector,
+            vector_field_name=vector_field,
+            text_weights={"first": 0.2, "second": -0.1},
+        )
+
+    with pytest.raises(ValueError):
+        _ = HybridQuery(
+            text="sample text query",
+            text_field_name="description",
+            vector=vector,
+            vector_field_name=vector_field,
+            text_weights={"first": 0.2, "second": "0.1"},
+        )
+
+    # no error if weights dictionary is empty or None
+    query = HybridQuery(
+        text="sample text query",
+        text_field_name="description",
+        vector=vector,
+        vector_field_name=vector_field,
+        text_weights={},
+    )
+    assert query
+
+    query = HybridQuery(
+        text="sample text query",
+        text_field_name="description",
+        vector=vector,
+        vector_field_name=vector_field,
+        text_weights=None,
+    )
+    assert query
+
+    # no error if the words in weights dictionary don't appear in query
+    query = HybridQuery(
+        text="sample text query",
+        text_field_name="description",
+        vector=vector,
+        vector_field_name=vector_field,
+        text_weights={"alpha": 0.2, "bravo": 0.4},
+    )
+    assert query
+
+    # we can access the word weights on a query object
+    assert query.text_weights == {"alpha": 0.2, "bravo": 0.4}
+
+    # we can change the text weights on a query object
+    query.set_text_weights(weights={"new": 0.3, "words": 0.125, "here": 99})
+    assert query.text_weights == {"new": 0.3, "words": 0.125, "here": 99}
+
+    query.set_text_weights(weights={})
+    assert query.text_weights == {}
+
+
 def test_multi_vector_query():
     # test we require Vector objects
     with pytest.raises(TypeError):