Update scorer to BM25STD and add TokenEscaper to escape query tokens for safety

rbs333 · rbs333 · commit ad9df2f423a5 · 2025-03-27T12:32:07.000-04:00
diff --git a/python-recipes/vector-search/02_hybrid_search.ipynb b/python-recipes/vector-search/02_hybrid_search.ipynb
@@ -367,7 +367,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -382,6 +382,10 @@
     }
    ],
    "source": [
+    "from redisvl.utils.token_escaper import TokenEscaper\n",
+    "\n",
+    "escaper = TokenEscaper()\n",
+    "\n",
     "# list of stopwords to filter out noise from query string\n",
     "stopwords = set([\n",
     "    \"a\", \"is\", \"the\", \"an\", \"and\", \"are\", \"as\", \"at\", \"be\", \"but\", \"by\", \"for\",\n",
@@ -391,8 +395,8 @@
     "\n",
     "def tokenize_query(user_query: str) -> str:\n",
     "    \"\"\"Convert a raw user query to a redis full text query joined by ORs\"\"\"\n",
-    "    tokens = [token.strip().strip(\",\").lower() for token in user_query.split()]\n",
-    "    return \" | \".join([token for token in tokens if token not in stopwords])\n",
+    "    tokens = [escaper.escape(token.strip().strip(\",\").lower()) for token in user_query.split()]\n",
+    "    return \" | \".join([token for token in tokens if token and token not in stopwords])\n",
     "\n",
     "# Example\n",
     "tokenize_query(user_query)"
@@ -407,7 +411,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -438,8 +442,8 @@
     "        filter_expression=f\"~({Text(text_field) % tokenize_query(user_query)})\",\n",
     "        num_results=num_results,\n",
     "        return_fields=[\"title\", \"description\"],\n",
-    "        dialect=4,\n",
-    "    ).scorer(\"BM25\").with_scores()"
+    "        dialect=2,\n",
+    "    ).scorer(\"BM25STD\").with_scores()"
    ]
   },
   {
@@ -540,7 +544,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -581,13 +585,13 @@
     "# Build the aggregation request\n",
     "req = (\n",
     "    AggregateRequest(query.query_string())\n",
-    "        .scorer(\"BM25\")\n",
+    "        .scorer(\"BM25STD\")\n",
     "        .add_scores()\n",
     "        .apply(cosine_similarity=\"(2 - @vector_distance)/2\", bm25_score=\"@__score\")\n",
     "        .apply(hybrid_score=f\"0.3*@bm25_score + 0.7*@cosine_similarity\")\n",
     "        .load(\"title\", \"description\", \"cosine_similarity\", \"bm25_score\", \"hybrid_score\")\n",
     "        .sort_by(Desc(\"@hybrid_score\"), max=3)\n",
-    "        .dialect(4)\n",
+    "        .dialect(2)\n",
     ")\n",
     "\n",
     "# Run the query\n",
@@ -620,7 +624,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -634,13 +638,13 @@
     "    # Build aggregation\n",
     "    req = (\n",
     "        AggregateRequest(query.query_string())\n",
-    "            .scorer(\"BM25\")\n",
+    "            .scorer(\"BM25STD\")\n",
     "            .add_scores()\n",
     "            .apply(cosine_similarity=\"(2 - @vector_distance)/2\", bm25_score=\"@__score\")\n",
     "            .apply(hybrid_score=f\"{1-alpha}*@bm25_score + {alpha}*@cosine_similarity\")\n",
     "            .sort_by(Desc(\"@hybrid_score\"), max=num_results)\n",
     "            .load(\"title\", \"description\", \"cosine_similarity\", \"bm25_score\", \"hybrid_score\")\n",
-    "            .dialect(4)\n",
+    "            .dialect(2)\n",
     "    )\n",
     "\n",
     "    # Run the query\n",