|
367 | 367 | }, |
368 | 368 | { |
369 | 369 | "cell_type": "code", |
370 | | - "execution_count": 10, |
| 370 | + "execution_count": null, |
371 | 371 | "metadata": {}, |
372 | 372 | "outputs": [ |
373 | 373 | { |
|
382 | 382 | } |
383 | 383 | ], |
384 | 384 | "source": [ |
| 385 | + "from redisvl.utils.token_escaper import TokenEscaper\n", |
| 386 | + "\n", |
| 387 | + "escaper = TokenEscaper()\n", |
| 388 | + "\n", |
385 | 389 | "# list of stopwords to filter out noise from query string\n", |
386 | 390 | "stopwords = set([\n", |
387 | 391 | " \"a\", \"is\", \"the\", \"an\", \"and\", \"are\", \"as\", \"at\", \"be\", \"but\", \"by\", \"for\",\n", |
|
391 | 395 | "\n", |
392 | 396 | "def tokenize_query(user_query: str) -> str:\n", |
393 | 397 | " \"\"\"Convert a raw user query to a redis full text query joined by ORs\"\"\"\n", |
394 | | - " tokens = [token.strip().strip(\",\").lower() for token in user_query.split()]\n", |
395 | | - " return \" | \".join([token for token in tokens if token not in stopwords])\n", |
| 398 | + " tokens = [escaper.escape(token.strip().strip(\",\").lower()) for token in user_query.split()]\n", |
| 399 | + " return \" | \".join([token for token in tokens if token and token not in stopwords])\n", |
396 | 400 | "\n", |
397 | 401 | "# Example\n", |
398 | 402 | "tokenize_query(user_query)" |
|
407 | 411 | }, |
408 | 412 | { |
409 | 413 | "cell_type": "code", |
410 | | - "execution_count": 11, |
| 414 | + "execution_count": null, |
411 | 415 | "metadata": {}, |
412 | 416 | "outputs": [], |
413 | 417 | "source": [ |
|
438 | 442 | " filter_expression=f\"~({Text(text_field) % tokenize_query(user_query)})\",\n", |
439 | 443 | " num_results=num_results,\n", |
440 | 444 | " return_fields=[\"title\", \"description\"],\n", |
441 | | - " dialect=4,\n", |
442 | | - " ).scorer(\"BM25\").with_scores()" |
| 445 | + " dialect=2,\n", |
| 446 | + " ).scorer(\"BM25STD\").with_scores()" |
443 | 447 | ] |
444 | 448 | }, |
445 | 449 | { |
|
540 | 544 | }, |
541 | 545 | { |
542 | 546 | "cell_type": "code", |
543 | | - "execution_count": 14, |
| 547 | + "execution_count": null, |
544 | 548 | "metadata": {}, |
545 | 549 | "outputs": [ |
546 | 550 | { |
|
581 | 585 | "# Build the aggregation request\n", |
582 | 586 | "req = (\n", |
583 | 587 | " AggregateRequest(query.query_string())\n", |
584 | | - " .scorer(\"BM25\")\n", |
| 588 | + " .scorer(\"BM25STD\")\n", |
585 | 589 | " .add_scores()\n", |
586 | 590 | " .apply(cosine_similarity=\"(2 - @vector_distance)/2\", bm25_score=\"@__score\")\n", |
587 | 591 | " .apply(hybrid_score=f\"0.3*@bm25_score + 0.7*@cosine_similarity\")\n", |
588 | 592 | " .load(\"title\", \"description\", \"cosine_similarity\", \"bm25_score\", \"hybrid_score\")\n", |
589 | 593 | " .sort_by(Desc(\"@hybrid_score\"), max=3)\n", |
590 | | - " .dialect(4)\n", |
| 594 | + " .dialect(2)\n", |
591 | 595 | ")\n", |
592 | 596 | "\n", |
593 | 597 | "# Run the query\n", |
|
620 | 624 | }, |
621 | 625 | { |
622 | 626 | "cell_type": "code", |
623 | | - "execution_count": 15, |
| 627 | + "execution_count": null, |
624 | 628 | "metadata": {}, |
625 | 629 | "outputs": [], |
626 | 630 | "source": [ |
|
634 | 638 | " # Build aggregation\n", |
635 | 639 | " req = (\n", |
636 | 640 | " AggregateRequest(query.query_string())\n", |
637 | | - " .scorer(\"BM25\")\n", |
| 641 | + " .scorer(\"BM25STD\")\n", |
638 | 642 | " .add_scores()\n", |
639 | 643 | " .apply(cosine_similarity=\"(2 - @vector_distance)/2\", bm25_score=\"@__score\")\n", |
640 | 644 | " .apply(hybrid_score=f\"{1-alpha}*@bm25_score + {alpha}*@cosine_similarity\")\n", |
641 | 645 | " .sort_by(Desc(\"@hybrid_score\"), max=num_results)\n", |
642 | 646 | " .load(\"title\", \"description\", \"cosine_similarity\", \"bm25_score\", \"hybrid_score\")\n", |
643 | | - " .dialect(4)\n", |
| 647 | + " .dialect(2)\n", |
644 | 648 | " )\n", |
645 | 649 | "\n", |
646 | 650 | " # Run the query\n", |
|
0 commit comments