diff --git a/python-recipes/vector-search/02_hybrid_search.ipynb b/python-recipes/vector-search/02_hybrid_search.ipynb index f36b055c..577d4e52 100644 --- a/python-recipes/vector-search/02_hybrid_search.ipynb +++ b/python-recipes/vector-search/02_hybrid_search.ipynb @@ -9,11 +9,11 @@ "\n", "Hybrid search is all about combining lexical search with semantic vector search to improve result relevancy. This notebook will cover 3 different hybrid search strategies with Redis:\n", "\n", - "1. Linear combination of scores from lexical search (BM25) and vector search (Cosine Distance) with the aggregation API\n", + "1. Linear combination of scores from lexical search (BM25) and vector search (Cosine Distance) with the HybridQuery class\n", "2. Client-Side Reciprocal Rank Fusion (RRF)\n", "3. Client-Side Reranking with a cross encoder model\n", "\n", - ">Note: Additional work is planed within the Redis core and ecosystem to add more flexible hybrid search capabilities in the future.\n", + ">Note: Additional work is planned within the Redis Query Engine core to add more flexible hybrid search capabilities in the future.\n", "\n", "## Let's Begin!\n", "\"Open\n" @@ -32,7 +32,7 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install -q \"redisvl>=0.4.1\" sentence-transformers pandas \"redis>=5.2.0\"" + "%pip install sentence-transformers pandas nltk \"redisvl==0.5.1\"" ] }, { @@ -125,7 +125,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -152,7 +152,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -163,7 +163,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -175,22 +175,16 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from redisvl.utils.vectorize import HFTextVectorizer\n", 
"\n", "# load model for embedding our movie descriptions\n", - "model = HFTextVectorizer('sentence-transformers/all-MiniLM-L6-v2')" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "metadata": {}, - "outputs": [], - "source": [ + "model = HFTextVectorizer('sentence-transformers/all-MiniLM-L6-v2')\n", + "\n", + "# embed movie descriptions\n", "movie_data = [\n", " {\n", " **movie,\n", @@ -201,7 +195,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -211,26 +205,16 @@ " 'genre': 'action',\n", " 'rating': 7,\n", " 'description': 'A daring cop chases a notorious criminal across the city in a high-stakes game of cat and mouse.',\n", - " 'description_vector': b'\\x8bf|=\\xc3`\\n;\\xf2\\x91\\xb7;?\\xcb~\\xbd\\xdfd\\xce\\xbb\\xc7\\x16J=H\\xa7?=\\xdfv\\x95\\x17\\xbeA\\x1e\\x05\\xb9Hu\\xbfg3\\xbd$\\xcd\\xbd\\xbd\\xa1$\\xf7;\\x04\\xf5z=\\xfc\\xb4\\x8c=\\x89\\x0e\\xc6\\xbdhI\\x90\\xbd^\\x16\\xbd;z\\xe7\\x0c\\xbd\\x1b3\\xc9\\xbc\\x89\\xf8\\xbb\\xbc\\x18\\'u\\xbb>\\x8f\\xca<\\x02\\x80J=\\x0e\\xaf*=\\x8dOU\\xbd\\xcf\\xf0\\x95\\xbc \\x02\\x19=\\x19\\xf4K<\\xc5\\xc2\\t=J\\x83\\xac=\\x95\\xd7\\xb8\\xbd\\xf2\\xb5\\x9c\\xbd=\\x85\\x18=\\x94d&=03\\xf8<\\xee\\xf7\\x88<\\x80v\\xf2\\xbb9=[\\xbdG\\xac\\xee\\xbb<:A\\xbd\\xe1d\\x19\\xbd!d\\xf2\\xbb\\x1d\\xbax;\\xec;O<\\xd21,\\xbc\\xec\\xae\\xae=r\\x00-\\xbc\"\\x06\\xae\\xbdl\\xd6\\x1a=\\xc4\\xbf\\xcd=\\x19\\x150=\\xe3\\xf1\\x9d\\xbc\\xa6GK=\\xb2\\xb8 
=\\xb2\\xf1I\\xbd-e\\x9e\\xbb\\xe9\\x8a\\xf7:\\x88\\xf8\\x1c=\\x7f\\xba\\xde<\\xd2n\\x16\\xbb\\xb4\\\\p\\xbb\\xd4\\xd5<<\\x89\\xa5\\xa3\\xb8\\xc79s<=4&<\\x84\\x1c\\x18<\\x18\\xd9-\\xbd\\xdf\\xe6\\x98<\\x15\\xa1N=\\xa2/\\xa5=\\x1d\\xf3\\xdd<\\x17L\\x13<\\x10\\x10\\xce\\xbac\\x9e\\xdc\\xbc\\xa68\\x05=+\\xa1\\xf5\\xbd\\x84\\x1bF\\xbd\\xa0?\\x14\\xbe\\xc4\\x8f(\\xbd\\xe6O\\x89\\xbd\\xf7\\xad\\xd4<\\xa7\\x12\\xc3=\\xaf\\x05O\\xbd\\x99\\x8ep\\xbc\\x18\\xb5\\xac\\xbc\\xc9\\x9ee\\xbdH\\x8es;$a\\xc1;\\xd9\\xfaB\\xbd\\xa8#\\xfe:\\x92\\xe6\\xf4=\\xcd\\x15*<\\x86\\xf8\\x1b=\\x01\\xfcV\\xbd\\xd3\\xd1\\r=9\\xee\\x06=\\x13u\\xba\\xbd\\xf7\\xa3\\xd6<\\x1a\\xec\\xd9;\\xb79/=\\xa4\\xc2\\x85=p\\x0b\"=\\xe1i\\xef<:\\xe8c=\\xfb2\\x08\\xbe\\xce\\x12;=OVW;V\\xa4b<\\xd0\\x9d\\xb7<\\x87r;\\xbdqz\\x91\\xbcV\\x00<\\xbd\\xfe\\x19\\xa3<\\xeaJ%\\xbc!\\xe7\\xbf\\xbb\\x7f\\x87\\x12=\\x94\\x1d\\x95=b|\\xfd\\xbc\\xf3\\xf1\\xd1\\xbd\\xf5y\\x84;\\xc9\\tu=]\\x8ai<3\\x91R\\xbd\\xec\\xf3m\\xbd\\x93\\xb83=V\\xedF=\\x1f\\xf3\\xd1\\x08yA\\xba<#\\xacO\\xbd\\x01\\x0f\\xc7;\\x7f\\xf4\\x04\\xbdP\\x82\\x92\\xbd\\x9b\\xddD=p\\xd8;\\xbc\\xd3;\\xf4\\xbc\\xb3\\x8f\\x97\\xbd1\\\\\\r\\xbd\\xea\\x8c\\xf5\\xbd\\x8c\\x13(=\\x9e\\xc8\\xc6=\\xa3\\xed\\x1a=\\x98\\xa8\\xf8=\\x84\\xc1\\xee\\xbc\\xcd-\\x18\\xbb\\xf5~;<\\xd6F\\t\\xbd\\x14\\x08\\x17=\\xa5\\xa5\\x1e=\\x14K\\xcb\\xbd.\\xf7\\x8c\\xbdyb\\xed\\xbb\\x86[\\x19\\xbc]\\x0c\\x13\\xbcgq\\x83=\\xf0wd\\xbd\\xe3\\xc7\\xd1\\xbb8lY\\xbc\\xa7|a=3\\xcf\\xfd\\xbc\\x1f\\xa5\\x83\\xbb\\x99O\\x19\\xbd6\\x02]\\xbd\\xbb\\xeaz=\\x036\\x9c=:^\\xa9\\xbd)^9\\xbcg\\xe4N\\xbcs\\x07x\\xbd\\x18{\\xa0=:\\x9f\\x96<\\xecq8\\xba\\x9e\\xbb=\\xbd\\xe4|(<\\x96\\xdf\\xb4\\xbbl\\xc9\\x0b\\xbd\\xc4\\x01\\x95\\xbd\\xf7\\xc6T=\\tp\\xd1\\x85<\\xa69,=\\x17\\xf6\\xdf\\xba\\xac\\x14:\\xbdM9\\x08\\xbd\\xf9\\x0b\\xbe\\xbcV\\xb9\\x9a<\\xb3_6=!Ub\\xbd\\xb7\\xb7#=\\xcf\\xed\\xa2\\xbaW\\x95\\xe1\\xbc\\xec\\xefX=\\x9cu\\x11=?\\xd86=\\xb9\\x06\\x9f\\xbc,\\xe5\\xf0<\\'\\x15t=;\\xaf\\xd0\\xbbwK-=\\xceH\\x11\\xbd\\xc7\\
x036=\\xf6\\x15\\xd8<8x\\xfd\\xbcM\\x10\\x9b=\\xdb\\xdf_\\xbc\\xad\\xff\\x03\\xbd\\xfcD\\xaa=\\xbf\\xab\\x0b\\xbd\\x08$\\xe6\\xbcG\\x0cr=m\\xbc\\x99=\\xab\\xae\\xa6<)\\x9b$\\xbd\\x99y\\x06\\xbd\\xe3\\xcf\\xde=\\xecX\\x8f=s%\\r\\xbd\\x1dz\\x0e\\xbc\\xeb\\xdf\\t<$\\tI=\\x01x\\x10\\xbd\\xfb\\xd4;\\xbdY\\x0f\\xd9<\\xe8\\xe8\\x93\\xbbb\\xdf\\xba\\xbde c=>\\x9b\\x97;\\x18u\\xe0\\xbc\\x8e\\x10\\x9e\\xbdx\\xf4~=a\\x9eh<\\x91\\x070;#\\x9br=iD\\xe8:?\\xd8\\xa6\\xbb\\xcaa2\\xbd\\x9b\\xcbg\\xbb\\xf4\\xe9\\x00=\\x1b\\xc4\\x85;\\xf6\\xf7g\\xbd\\x12\\xb2\\xdc=O\\xca\\x83\\xbd\\xa5R\\x8b\\xbc\\x97F2=\\x11\\xe4\\x9b\\xbd%\\xbb\\x91=\\xfe=^<\\xcc\\x92\\x06=-@\\x90\\xbdg<\\xf5;\\xb0\\xc6\\xdc\\xbcX\\xd8\\x19=#b\\x0b\\xbd\\xccb\\xfa\\x88}x\\xef\\xbbU\\xe0\\xc6;\\\\\\x08~=\\xde/&\\xbdN\\x87\\x93\\xbd\\xab(\\xe7\\xbc\\xb8&\\x08\\xbd\\xdb\\x86\\xb1=:\\xd8p\\xbc^\\xb4\\xa2<\\xb8\\xdd|\\xbd\\xb3\\xe5\\x83\\xbc\\xfd\\x94\\x94\\xaf\\xf9\\xc2\\xbd\\xf2f\\x11=\\xd8\\xdc\\xd7\\xbd\\xcb\\xfax<#\\xc1\\xa9;\\x9a\\xd6\\xc8\\xbb=1I\\xbd\\x0b\\x7f\\x0c\\xbd\\xb2R\\xb8\\xba\\x11\\x14\\xf1\\xbc\\xa3\\xf2\\xca\\xbd\\xe6uA\\xbc\\xe2\\xf8<;\\xe2\\r\\x9d\\xbbp\\xd1r\\xbd\\xe4\\xc3}\\xbc\\xc2\\xc0\\xe5\\xbd\\x0f\\x18\\xf4=\\xb5Tp\\xbd1gC<\\xdd:\\x16\\xbd\\xf3{\\x19\\xbb\\xfe.\\xbf<\\xe3$5=AGl=-\\xbd\\\\=hGE\\xbc\\xab\\xb8\\x85\\xbd\\xd6\\xd8Y\\xbd>\\xfb\\xff;2\\r\\x88=\\x96\\xe1\\xab=y{@\\xbd\\x16O\\xc6\\xbb\\xa5$o=\\x0b#\\xf4\\xbdj\\x98\\xde=\\x96~0>\\x81 \\x98\\xbc{\\xd9\\x03\\xbe\\xb1k\\x8a\\xbd\\x9bl/=\\xddul\\xbd\\xdf\\xfa\\xd5\\x07\\xd7\\xe9\\xcd\\xbc\\xf1\\x17>\\xbdF\\xc0\\x83\\xbc1\\x1bY\\xbdF\\xd8\\x94\\xbd\\xc8/\\x1d\\xbc5M\\x07\\xbeJ\\xdd\\x8f=-\\x08\\xc1\\xbcx\\xe6N>\\x8f\\x7f<\\xd1E\\xb5\\xbd\\x0fF\\x05=b@/=\\x86\\xad1\\xbd\\x1f\\xb1\\x8a=\\x01u\\x04\\xbc\\x96I 
\\xbd2\\x8b\\x9b\\xbd\\x95F\\xc4<\\x85\\x0b\\xae<\\xea\\x9eA>\\xc8\\xf7\\xf7;M\\xa6\\x05\\xbd\\x85u\\xe8<\\xb5\\x88N=\\xa7\\x13\\x07\\xbd\\xe9_`\\xbdV\\xc7\\x99\\xbd\\xe7\\x92\\xb9\\x19H\\xbc@\\xc6t\\xbd\\xac\\xa2C=<\\x0f\\x18\\xbc\\xbeKz=\\xe4\\x13\\xa0=\\xea\\xe1\\x8c\\xbd\\xb6\\x84&=qZ\\x07=\\xb0\\xa8M\\xbc\\xb4\\xfaq=^\\x8b\\xe3\\xbc\\xdf\\xa3A\\xbd*\\'\\x13\\xbd\\x03\\x84\\x8a=\\x9a\\x9e\\xdd;A&s=u0l<\\xccS\\x03\\xbc\\t\\xf1:\\xbc\\xe9\\x07\\x14=e\\r\\x03\\xbd\\xad\\x18\\xb6\\xbd\\xc2\\xf0\\xbf=_(\\xae=>t\\x91\\xbd\\xe7\\x96n\\xbc\\xe0>\\xbb\\xbc\\xb4\\x87\\t=b\\xc0\\xda\\xbc\\x97\\xf6@\\xbcf\\xcd\\'\\xbcj\\x9a\\x10\\xbe\\x01\\x98\\xaf=\\xa1\\x8f\\xd1\\xbc\\x12\\xa4C=6\\xee)\\xbcvg\\x9d\\xbci6\\x98\\xbd\\x05\\x01\\x8a\\xbd\\x8el\\x15<\\x17m\\x15;\\x8d\\x97 =\\xcb\\r\\x98\\xbd\\xb7L\\x89\\xbc\\x13;\\xc3\\xbdC\\xaf\\x9b=\\xa2\\x04\\xd0\\xbb\\xe4S\\x03\\xbd_\\x99\\xbe\\xbd\\x8c\\x02[=B/D\\xbc\\xad\\xab>=5\\x19\\x03\\xbd\\x13\\xbb\\xd5<\\r\\x8b\\xa2\\xbc|K\\x8a=\\xf7\\'h<\\x87\\xe2\\n\\xbdB\\xd4\\xcd<\\xe2>9\\xbcT\\x1eh=\\xdd\\xa8a=\\x87\\xcd\\xbf\\xbb-\\x00;=fK\\x9e=\\x99\\x84\\x97\\xbdt\\x82\\xb3=}\\xd8\\xb8;\\xcf\\xa6j<\\x84\\xdd\\x9b\\xbc;\\x03}\\xbd\\xae\\xa3\\xdc<\\x80-\\xd8<\\x18\\xaa\\xf6;\\x93\\x1e\\xfc\\xba.\\x83\\xf5=\\x1en;\\xbd\\x01\\xd4G=\\xa7L\\x10\\xbdQ\\\\\\xe6\\xbd\\xe7\\x9e|\\xbc\\x11Y\\xb9\\xba\\x8e\\xceo=u\\x0f\\x19\\xbd\\xf7\\xcd%\\xbd\\xea\\xd5+=1P\\xce<\\xc7:\\xa0\\xbdm\\xb9\\x85;\\x07&\\xa2=\\xbf\\xb6\\x96=\\x80r\\xa4<\\xa8qg=\\xdb\\x0f1=%\\x8b\\xe0<\\x84\\xaev\\xbd\\xe6\\x12\\x19=\\x83\"\\x9f=Q\\x8e\\x8c<\\x0b&\\xe6\\xbd\\x03\\xb9\\x91\\xbc\\x1f\\xe8,\\xbc6\\x87\\xb7\\xbc@PF=\\xbc\\xe11< 
<\\xf4\\xbb*\\\\\\x8b\\xbd\\x08\\x04\\xb2\\xd8\\xd1Q\\xbdA\\x15\\xcf\\xbc\\xfb\\x0b\\xb0\\xbc+\\xc8\\xfc<\\x02\\x8d\\x98=h\\x0e9=I3K=5\\xf2\\xcd\\xbc\\xf5\\x04E\\xbd\\xab\\x997;\\xad\\x9ct\\xbd\\xcfy|=\\x04\\xd3\\x80<\\xbd\\xa3\\x0c<\\x01\\x0e\\x18>\\x0f\\x14q\\xbdi\\xe6Q=yR:\\xbd\\xbb\\xd4k\\xbd\\xb8X\\x81=\\x13|\\x98\\xbc\\x0b\\xbe\\xaf\\xbd\\xc5\\xe4\\xc6=\\xed\\xc7\\x8e\\xbdI\\xd9\\xff\\xbc\\xca\\xe50\\xbds-\\xaa\\xbcQ\\xdf\\x92;{\\x9e\\xc2\\xbbXB=\\r\\xb5\\x99\\xbb\\x046\\x90\\xbc\\xaf\\x99\\x98=y\\xb16\\xbc\\xc4E\\xba\\xbd\\x88\\x93W=\\xdc\\r\\xe9<\\xbc\\xb7\\x8e=\\xf0X\\xa9=\\x1a<\\x18\\xba\\x87U\\x15\\xbd\\x02I\\x00\\xbdBg\\xa2;~\\xb0\\xb3\\xbd\\x8c\\x8c =\\xd1\\x7fv\\xbb\\x16y\\x84\\xbc\\xa5\\'\\xd8\\xba\\x19\\x92\\xa5\\xbc\\r\\x1b)\\xbc(3\\xae\\xbb\\x16O\\x95<\\xfe\\x82\\x9d\\xbc\\x8bO\\x08<\\x0fk\\x93\\xbb\\x01!\\xe7\\xbaA\\x94\\x82=\\xf59\\n='}]" + " 'description_vector': b'\\x91f|=\\xb6`\\n;g\\x92\\xb7;3\\xcb~\\xbd\\x16e\\xce\\xbb\\xd7\\x16J=P\\xa7?=\\xc8v\\x95\\x17\\xbe\\x02\\x1a\\x05\\xb9@u\\xbf<\\xd6\\xe2b\\xba\\xd0\\xa6\\xa8\\xbdo\\xdc\\xec\\xbcQc%=N\\xe7r\\xbb\\x1dOG==(\\x85=y@\\xa2\\xbc7Z\\xd0\\xbdB%K\\xbd\\xba\\xed\\x94\\xbcU\\xddH=\\xbe&F<\\xbc*\\xec<\\x8c\\xd8\\x8d\\xbd\\xf3Z\\x98<\\x15\\xa3\\xa3=3g3\\xbd$\\xcd\\xbd\\xbd\\xf7$\\xf7;\\xf6\\xf4z=\\x02\\xb5\\x8c=\\x8d\\x0e\\xc6\\xbdhI\\x90\\xbdq\\x16\\xbd;u\\xe7\\x0c\\xbd&3\\xc9\\xbc\\x82\\xf8\\xbb\\xbc\\xa7&u\\xbb-\\x8f\\xca<\\xf2\\x7fJ=\\x14\\xaf*=\\x87OU\\xbd\\xde\\xf0\\x95\\xbc 
\\x02\\x19=\\x1b\\xf4K<\\xd0\\xc2\\t=F\\x83\\xac=\\x9e\\xd7\\xb8\\xbd\\xf3\\xb5\\x9c\\xbdB\\x85\\x18=\\xa4d&=\\'3\\xf8<\\xd3\\xf7\\x88\\xbd\\x7f\\x1bF\\xbd\\x9f?\\x14\\xbe\\xc9\\x8f(\\xbd\\xe4O\\x89\\xbd\\x18\\xae\\xd4<\\xb2\\x12\\xc3=\\xb0\\x05O\\xbd\\x8f\\x8ep\\xbc\\x1a\\xb5\\xac\\xbc\\xcc\\x9ee\\xbdv\\x8es;\\x0ca\\xc1;\\xd5\\xfaB\\xbde%\\xfe:\\x99\\xe6\\xf4=\\xa7\\x15*<\\x8c\\xf8\\x1b=\\x08\\xfcV\\xbd\\xce\\xd1\\r=<\\xee\\x06=\\x17u\\xba\\xbd\\r\\xa4\\xd6<\\x12\\xec\\xd9;\\xc89/=\\xa6\\xc2\\x85=x\\x0b\"=\\xe3i\\xef<4\\xe8c=\\xfc2\\x08\\xbe\\xd2\\x12;=\\x98VW;N\\xa4b<\\xe8\\x9d\\xb7<\\x90r;\\xbd\\\\z\\x91\\xbcO\\x00<\\xbd\\x13\\x1a\\xa3<\\x05K%\\xbcc\\xe7\\xbf\\xbb\\x89\\x87\\x12=\\x95\\x1d\\x95=||\\xfd\\xbc\\xf2\\xf1\\xd1\\xbdKz\\x84;\\xc7\\tu=.\\x8ai<=\\x91R\\xbd\\xdd\\xf3m\\xbd\\x8c\\xb83=_\\xedF=\\x1a\\xf3\\xd1\\x08oA\\xba<(\\xacO\\xbd\\xfc\\x0e\\xc7;\\x87\\xf4\\x04\\xbdN\\x82\\x92\\xbd\\x92\\xddD=a\\xd8;\\xbc\\xd1;\\xf4\\xbc\\xb2\\x8f\\x97\\xbd<\\\\\\r\\xbd\\xe1\\x8c\\xf5\\xbd\\x95\\x13(=\\xa2\\xc8\\xc6=\\xa9\\xed\\x1a=\\x98\\xa8\\xf8=\\x96\\xc1\\xee\\xbc\\xff.\\x18\\xbb\\xbf~;<\\xd9F\\t\\xbd\\x13\\x08\\x17=\\xa8\\xa5\\x1e=\\x17K\\xcb\\xbd0\\xf7\\x8c\\xbdXb\\xed\\xbb\\xc9[\\x19\\xbcU\\x0c\\x13\\xbcdq\\x83=\\xe9wd\\xbd.\\xc7\\xd1\\xbb~lY\\xbc\\xa2|a=a\\xcf\\xfd\\xbcB\\xa5\\x83\\xbb\\x9fO\\x19\\xbd&\\x02]\\xbd\\xb6\\xeaz=\\xff5\\x9c=1^\\xa9\\xbdi^9\\xbcT\\xe4N\\xbc}\\x07x\\xbd\\x17{\\xa0=@\\x9f\\x96<\\x81s8\\xba\\xa6\\xbb=\\xbd\\xb2|(<\\xc1\\xdf\\xb4\\xbbr\\xc9\\x0b\\xbd\\xc2\\x01\\x95\\xbd\\x02\\xc7T=\\x11p\\xd1\\x0c\\xb1\\xbch\\x01J\\xbda\\xf4~==\\xe3Z\\xbd?/\\xf1\\xbbJ98=\\xd92T\\xbc.\\'\\x81<\\xbd\\xa0M=\\xa0\\xde\\x05<\\x1bI|\\xbd\\xc4\\x98w<\\xdf\\xd3\\xa7\\xbd\\xdbS \\xbdl\\x13\\x07=\\x18&\\x14\\xbc\\xbev\\xe9<\\xfa,\\x97='}]" ] }, - "execution_count": 63, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "movie_data[:3]" + "movie_data[:1]" ] }, { @@ -244,17 +228,9 @@ }, { "cell_type": "code", - 
"execution_count": 64, + "execution_count": 6, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "12:41:14 redisvl.index.index INFO Index already exists, overwriting.\n" - ] - } - ], + "outputs": [], "source": [ "from redisvl.schema import IndexSchema\n", "from redisvl.index import SearchIndex\n", @@ -285,7 +261,7 @@ "})\n", "\n", "\n", - "index = SearchIndex(schema, client)\n", + "index = SearchIndex(schema, client, validate_on_load=True)\n", "index.create(overwrite=True, drop=True)" ] }, @@ -300,35 +276,35 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['movie:01JQC7NC1JXCDCBZY83G914DZD',\n", - " 'movie:01JQC7NC1JQQ63GN6QZN2TSYAF',\n", - " 'movie:01JQC7NC1JJPZYMZ74B6ATQ37V',\n", - " 'movie:01JQC7NC1JY715NWCSJX0VCM0Q',\n", - " 'movie:01JQC7NC1JHRSWD1DFGC0W3P61',\n", - " 'movie:01JQC7NC1KEEH4ZSA5R1PQAPZM',\n", - " 'movie:01JQC7NC1KCT1GPTQHBM5Y37XG',\n", - " 'movie:01JQC7NC1KRJSTFT32BSE8SK1V',\n", - " 'movie:01JQC7NC1KA3AFNM187PHBZZ0T',\n", - " 'movie:01JQC7NC1K33J6DW2MK88HWM3F',\n", - " 'movie:01JQC7NC1KVYFHXVEM30E41HN9',\n", - " 'movie:01JQC7NC1KYN9WDY2MDNB6JGB6',\n", - " 'movie:01JQC7NC1K9YSWW7FT0379TJX9',\n", - " 'movie:01JQC7NC1K27YQT742HRFTFQVM',\n", - " 'movie:01JQC7NC1KWC3Y75RP97X8M6VD',\n", - " 'movie:01JQC7NC1KPM2X2EJAQ8DFP1W7',\n", - " 'movie:01JQC7NC1KMH8VWWG65Y3YV5CJ',\n", - " 'movie:01JQC7NC1KB30XEN6P1MQ7BJQW',\n", - " 'movie:01JQC7NC1KTPAB7KX5H6AASPB9',\n", - " 'movie:01JQC7NC1KX18AVM8MC3S7D1FQ']" + "['movie:01JR6XHSRXW6DF4TSF7N2KJD21',\n", + " 'movie:01JR6XHSS0HK3E95HDVE1D5Y4W',\n", + " 'movie:01JR6XHSS0EE89YNT702SB1ZZN',\n", + " 'movie:01JR6XHSS09BNDBRZHA6F5RT0H',\n", + " 'movie:01JR6XHSS0N8W3M4YDR45HHCPY',\n", + " 'movie:01JR6XHSS1XDQFJXHHJD9Z0AW3',\n", + " 'movie:01JR6XHSS17RNERVEQ9YAHA81D',\n", + " 'movie:01JR6XHSS182SYE6E630Y6D6B1',\n", + " 'movie:01JR6XHSS1899GWQDBWHP0Q57A',\n", + " 
'movie:01JR6XHSS1WRCF39MX0XQ60PEJ',\n", + " 'movie:01JR6XHSS1K9RYSRJTTNRQQP5C',\n", + " 'movie:01JR6XHSS148EGMV6JTECWRC5J',\n", + " 'movie:01JR6XHSS1199WGN103VB7GGHE',\n", + " 'movie:01JR6XHSS12RTJR8B953SDWC1E',\n", + " 'movie:01JR6XHSS1WR63V3MZNVZ5X82R',\n", + " 'movie:01JR6XHSS2Z1F2C0ZG60FCJ36F',\n", + " 'movie:01JR6XHSS250RZBG2M3C3VKWE2',\n", + " 'movie:01JR6XHSS2ECNNK19FRAZ00RE6',\n", + " 'movie:01JR6XHSS2W46EMH9EVC42TBVV',\n", + " 'movie:01JR6XHSS2K8KF6DK6MZN65MT8']" ] }, - "execution_count": 65, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -345,343 +321,258 @@ "\n", "Now that our search index is populated and ready, we will build out a few different hybrid search techniques in Redis.\n", "\n", - "To start, we will build a few helper methods that we can reuse for each technique." - ] - }, - { - "cell_type": "code", - "execution_count": 87, - "metadata": {}, - "outputs": [], - "source": [ - "# Sample user query (can be changed for comparisons)\n", - "user_query = \"action adventure movie with great fighting scenes, crime busting, superheroes, and magic\"" + "To start, we will use our `HybridQuery` class that accepts a text string and vector to automatically combine text similarity and vector similarity scores." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "First, we need a method to tokenize a user query into a full-text search string:" - ] - }, - { - "cell_type": "code", - "execution_count": 89, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'action | adventure | movie | great | fighting | scenes | crime | busting | superheroes | magic'" - ] - }, - "execution_count": 89, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from redisvl.utils.token_escaper import TokenEscaper\n", - "\n", - "escaper = TokenEscaper()\n", - "\n", - "# list of stopwords to filter out noise from query string\n", - "stopwords = set([\n", - " \"a\", \"is\", \"the\", \"an\", \"and\", \"are\", \"as\", \"at\", \"be\", \"but\", \"by\", \"for\",\n", - " \"if\", \"in\", \"into\", \"it\", \"no\", \"not\", \"of\", \"on\", \"or\", \"such\", \"that\", \"their\",\n", - " \"then\", \"there\", \"these\", \"they\", \"this\", \"to\", \"was\", \"will\", \"with\"\n", - "])\n", + "## 1. Linear Combination using HybridQuery\n", "\n", - "def tokenize_query(user_query: str) -> str:\n", - " \"\"\"Convert a raw user query to a redis full text query joined by ORs\"\"\"\n", - " tokens = [escaper.escape(token.strip().strip(\",\").lower()) for token in user_query.split()]\n", - " return \" | \".join([token for token in tokens if token and token not in stopwords])\n", + "The goal of this technique is to calculate a weighted sum of the text similarity score for our provided text search and the cosine distance between vectors calculated via a KNN vector query. 
Under the hood this is possible in Redis using the [aggregations API](https://redis.io/docs/latest/develop/interact/search-and-query/advanced-concepts/aggregations/), as of `Redis 7.4.x` (search version `2.10.5`), within a single database call.\n", "\n", - "# Example\n", - "tokenize_query(user_query)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, we need methods to create vector search and full-text search queries:" + "As of RedisVL 0.5.0, all of this is nicely encapsulated in the `HybridQuery` class, which behaves much like our other query classes." ] }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ - "# Function to create a vector query using RedisVL helpers for ease of use\n", - "from redisvl.query import VectorQuery, FilterQuery\n", - "from redisvl.query.filter import Text\n", - "from redisvl.redis.utils import convert_bytes, make_dict\n", - "\n", - "\n", - "def make_vector_query(user_query: str, num_results: int, filters = None) -> VectorQuery:\n", - " \"\"\"Generate a Redis vector query given user query string.\"\"\"\n", - " vector = model.embed(user_query, as_buffer=True, dtype=\"float32\")\n", - " query = VectorQuery(\n", - " vector=vector,\n", - " vector_field_name=\"description_vector\",\n", - " num_results=num_results,\n", - " return_fields=[\"title\", \"description\"]\n", - " )\n", - " if filters:\n", - " query.set_filter(filters)\n", - " \n", - " return query\n", - "\n", - "\n", - "def make_ft_query(text_field: str, user_query: str, num_results: int) -> FilterQuery:\n", - " \"\"\"Generate a Redis full-text query given a user query string.\"\"\"\n", - " return FilterQuery(\n", - " filter_expression=f\"~({Text(text_field) % tokenize_query(user_query)})\",\n", - " num_results=num_results,\n", - " return_fields=[\"title\", \"description\"],\n", - " dialect=2,\n", - " ).scorer(\"BM25STD\").with_scores()" - ] - }, - { - "cell_type": "markdown", - 
"metadata": {}, - "source": [ - "## 1. Linear Combination using Aggregation API\n", - "\n", - "The goal of this technique is to calculate a weighted sum of the BM25 score for our provided text search and the cosine distance between vectors calculated via a KNN vector query. This is possible in Redis using the [aggregations API](https://redis.io/docs/latest/develop/interact/search-and-query/advanced-concepts/aggregations/), as of `Redis 7.4.x` (search version `2.10.5`), within a single database call.\n", - "\n", - "In Redis, the aggregations api allow you the ability to group, sort, and transform your result data in the ways you might expect to be able to do with groupby and sums in other database paradigms. \n" + "# Sample user query (can be changed for comparisons)\n", + "user_query = \"action adventure movie with great fighting scenes against a dangerous criminal, crime busting, superheroes, and magic\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "First, we build a base `VectorQuery` that runs a KNN-style vector search and test it below:" + "First, we will import our `HybridQuery` and understand its parameters.\n", + "At a minimum, the `HybridQuery` needs 4 arguments:\n", + "```python\n", + "query = HybridQuery(\n", + " text = \"your query string here\",\n", + " text_field_name = \"\",\n", + " vector = ,\n", + " vector_field_name = \"\",\n", + ")\n", + "```" ] }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[{'id': 'movie:01JQC7NC1KPM2X2EJAQ8DFP1W7',\n", - " 'vector_distance': '0.643690168858',\n", + "[{'vector_distance': '0.645975470543',\n", " 'title': 'The Incredibles',\n", - " 'description': \"A family of undercover superheroes, while trying to live the quiet suburban life, are forced into action to save the world. Bob Parr (Mr. 
Incredible) and his wife Helen (Elastigirl) were among the world's greatest crime fighters, but now they must assume civilian identities and retreat to the suburbs to live a 'normal' life with their three children. However, the family's desire to help the world pulls them back into action when they face a new and dangerous enemy.\"},\n", - " {'id': 'movie:01JQC7NC1JXCDCBZY83G914DZD',\n", - " 'vector_distance': '0.66843944788',\n", + " 'vector_similarity': '0.677012264729',\n", + " 'text_score': '10.5386477145',\n", + " 'hybrid_score': '3.63550289966'},\n", + " {'vector_distance': '0.797545194626',\n", + " 'title': 'Skyfall',\n", + " 'vector_similarity': '0.601227402687',\n", + " 'text_score': '4.73920856087',\n", + " 'hybrid_score': '1.84262175014'},\n", + " {'vector_distance': '0.608649492264',\n", " 'title': 'Explosive Pursuit',\n", - " 'description': 'A daring cop chases a notorious criminal across the city in a high-stakes game of cat and mouse.'},\n", - " {'id': 'movie:01JQC7NC1KEEH4ZSA5R1PQAPZM',\n", - " 'vector_distance': '0.698122441769',\n", - " 'title': 'Mad Max: Fury Road',\n", - " 'description': \"In a post-apocalyptic wasteland, Max teams up with Furiosa to escape a tyrant's clutches and find freedom.\"}]" + " 'vector_similarity': '0.695675253868',\n", + " 'text_score': '3.93239518818',\n", + " 'hybrid_score': '1.66669123416'}]" ] }, - "execution_count": 69, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "query = make_vector_query(user_query, num_results=3)\n", + "from redisvl.query import HybridQuery\n", + "\n", + "vector = model.embed(user_query, as_buffer=True, dtype=\"float32\")\n", + "\n", + "query = HybridQuery(\n", + " text=user_query,\n", + " text_field_name=\"description\",\n", + " vector=vector,\n", + " vector_field_name=\"description_vector\",\n", + " return_fields=[\"title\"],\n", + ")\n", + "\n", + "results = index.query(query)\n", "\n", - "# Check standard vector search results\n", - 
"index.query(query)" + "results[:3]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Next, we add a full-text search predicate using RedisVL helpers and our user-query tokenizer:" + "That's it! That is all it takes to perform a hybrid text matching and vector query with RedisVL.\n", + "Of course there are many more configurations and things we can do with the `HybridQuery` class. Let's investigate.\n", + "\n", + "First, let's look at just the text query part that is being run:" ] }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'(~@description:(action | adventure | movie | great | fighting | scenes | crime | busting | superheroes | magic))=>[KNN 3 @description_vector $vector AS vector_distance]'" + "'(~@description:(action | adventure | movie | great | fighting | scenes | dangerous | criminal | crime | busting | superheroes | magic))=>[KNN 10 @description_vector $vector AS vector_distance]'" ] }, - "execution_count": 70, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "base_full_text_query = str(Text(\"description\") % tokenize_query(user_query))\n", - "\n", - "# Add the optional flag, \"~\", so that this doesn't also act as a strict text filter\n", - "full_text_query = f\"(~{base_full_text_query})\"\n", - "\n", - "\n", - "# Add full-text predicate to the vector query \n", - "query.set_filter(full_text_query)\n", - "query.query_string()" + "query._build_query_string()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "**The query string above combines both full-text search and a vector search.** This will be passed to the aggregation API to combine using a simple weighted sum of scores before a final sort and truncation.\n", - "\n", - "Note: for the following query to work `redis-py >= 5.2.0`" + "### Choosing your stopwords for better queries\n", + "You can see that the user query string has been tokenized 
and certain stopwords like 'and', 'for', 'with', and 'but' have been removed; otherwise you would get matches on irrelevant words.\n", + "RedisVL uses [NLTK](https://www.nltk.org/index.html) English stopwords as the default. You can change which default language stopwords to use with the `stopwords` argument.\n", + "You can specify a language, like 'german', 'arabic', 'greek' and many others, provide your own list of stopwords, or set it to `None` to not remove any." ] }, { "cell_type": "code", - "execution_count": 92, + "execution_count": 14, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "[{'vector_distance': '0.643690168858',\n", - " '__score': '5.82636454242',\n", - " 'title': 'The Incredibles',\n", - " 'description': \"A family of undercover superheroes, while trying to live the quiet suburban life, are forced into action to save the world. Bob Parr (Mr. Incredible) and his wife Helen (Elastigirl) were among the world's greatest crime fighters, but now they must assume civilian identities and retreat to the suburbs to live a 'normal' life with their three children. 
However, the family's desire to help the world pulls them back into action when they face a new and dangerous enemy.\",\n", - " 'cosine_similarity': '0.678154915571',\n", - " 'bm25_score': '5.82636454242',\n", - " 'hybrid_score': '2.22261780363'},\n", - " {'vector_distance': '0.66843944788',\n", - " '__score': '0',\n", - " 'title': 'Explosive Pursuit',\n", - " 'description': 'A daring cop chases a notorious criminal across the city in a high-stakes game of cat and mouse.',\n", - " 'cosine_similarity': '0.66578027606',\n", - " 'bm25_score': '0',\n", - " 'hybrid_score': '0.466046193242'},\n", - " {'vector_distance': '0.698122441769',\n", - " '__score': '0',\n", - " 'title': 'Mad Max: Fury Road',\n", - " 'description': \"In a post-apocalyptic wasteland, Max teams up with Furiosa to escape a tyrant's clutches and find freedom.\",\n", - " 'cosine_similarity': '0.650938779116',\n", - " 'bm25_score': '0',\n", - " 'hybrid_score': '0.455657145381'}]" - ] - }, - "execution_count": 92, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "(~@description:(film | d\\'action | d\\'aventure | superbes | scènes | combat | enquêtes | criminelles | super\\-héros | magie))=>[KNN 10 @description_vector $vector AS vector_distance]\n", + "(~@description:(action | adventure | movie | great | fighting | scenes | against | dangerous | criminal | crime | busting | superheroes | magic))=>[KNN 10 @description_vector $vector AS vector_distance]\n", + "(~@description:(action | adventure | movie | with | great | fighting | scenes | against | a | dangerous | criminal | crime | busting | superheroes | and | magic))=>[KNN 10 @description_vector $vector AS vector_distance]\n" + ] } ], "source": [ - "from typing import Any, Dict, List\n", - "from redis.commands.search.aggregation import AggregateRequest, Desc\n", - "\n", - "# Build the aggregation request\n", - "req = (\n", - " AggregateRequest(query.query_string())\n", - " 
.scorer(\"BM25STD\")\n", - " .add_scores()\n", - " .apply(cosine_similarity=\"(2 - @vector_distance)/2\", bm25_score=\"@__score\")\n", - " .apply(hybrid_score=f\"0.3*@bm25_score + 0.7*@cosine_similarity\")\n", - " .load(\"title\", \"description\", \"cosine_similarity\", \"bm25_score\", \"hybrid_score\")\n", - " .sort_by(Desc(\"@hybrid_score\"), max=3)\n", - " .dialect(2)\n", + "# translate our user query to French and use nltk french stopwords\n", + "french_query_text = \"Film d'action et d'aventure avec de superbes scènes de combat, des enquêtes criminelles, des super-héros et de la magie\"\n", + "\n", + "french_film_query = HybridQuery(\n", + " text=french_query_text,\n", + " text_field_name=\"description\",\n", + " vector=model.embed(french_query_text, as_buffer=True, dtype=\"float32\"),\n", + " vector_field_name=\"description_vector\",\n", + " stopwords=\"french\",\n", ")\n", "\n", - "# Run the query\n", - "res = index.aggregate(req, query_params={'vector': query._vector})\n", + "print(french_film_query._build_query_string())\n", "\n", - "# Perform output parsing\n", - "[make_dict(row) for row in convert_bytes(res.rows)]\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Notes on aggregate query syntax \n", - "- `.scorer`: specifies the scoring function to use BM25 in this case\n", - " - [see docs](https://redis.io/docs/latest/develop/interact/search-and-query/advanced-concepts/scoring/) for all available scorers\n", - "- `.add_scores`: adds the scores to the result\n", - "- `.apply`: algebraic operations that can be customized for your use case\n", - "- `.load`: specifies fields to return - all in this case.\n", - "- `.sort_by`: sort the output based on the hybrid score and yield top 5 results\n", - "- `.dialect`: specifies the query dialect to use." 
+ "# specify your own stopwords\n", + "custom_stopwords = set([\n", + " \"a\", \"is\", \"the\", \"an\", \"and\", \"are\", \"as\", \"at\", \"be\", \"but\", \"by\", \"for\",\n", + " \"if\", \"in\", \"into\", \"it\", \"no\", \"not\", \"of\", \"on\", \"or\", \"such\", \"that\", \"their\",\n", + " \"then\", \"there\", \"these\", \"they\", \"this\", \"to\", \"was\", \"will\", \"with\"\n", + "])\n", + "\n", + "stopwords_query = HybridQuery(\n", + " text=user_query,\n", + " text_field_name=\"description\",\n", + " vector=vector,\n", + " vector_field_name=\"description_vector\",\n", + " stopwords=custom_stopwords,\n", + ")\n", + "\n", + "print(stopwords_query._build_query_string())\n", + "\n", + "# don't use any stopwords\n", + "no_stopwords_query = HybridQuery(\n", + " text=user_query,\n", + " text_field_name=\"description\",\n", + " vector=vector,\n", + " vector_field_name=\"description_vector\",\n", + " stopwords=None,\n", + ")\n", + "\n", + "print(no_stopwords_query._build_query_string())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Now we will define a function to do the entire operation start to finish for simplicity." - ] - }, - { - "cell_type": "code", - "execution_count": 90, - "metadata": {}, - "outputs": [], - "source": [ - "def linear_combo(user_query: str, alpha: float, num_results: int = 3) -> List[Dict[str, Any]]:\n", - " # Add the optional flag, \"~\", so that this doesn't also act as a strict text filter\n", - " text = f\"(~{Text('description') % tokenize_query(user_query)})\"\n", + "### Choosing your text scoring function and weights\n", + "There are different ways to calculate the similarity between sets of text. Redis supports several, such as `BM25`, `TFIDF`, `DISMAX`, and others. The default is `BM25STD` and is easy to configure with the `text_scorer` parameter. 
Just like changing your embedding model can change your vector similarity scores, changing your text similarity measure can change your text scores.\n", "\n", - " # Build vector query\n", - " query = make_vector_query(user_query, num_results=num_results, filters=text)\n", - " \n", - " # Build aggregation\n", - " req = (\n", - " AggregateRequest(query.query_string())\n", - " .scorer(\"BM25STD\")\n", - " .add_scores()\n", - " .apply(cosine_similarity=\"(2 - @vector_distance)/2\", bm25_score=\"@__score\")\n", - " .apply(hybrid_score=f\"{1-alpha}*@bm25_score + {alpha}*@cosine_similarity\")\n", - " .sort_by(Desc(\"@hybrid_score\"), max=num_results)\n", - " .load(\"title\", \"description\", \"cosine_similarity\", \"bm25_score\", \"hybrid_score\")\n", - " .dialect(2)\n", - " )\n", + "Because hybrid queries are performing a weighted average of text similarity and vector similarity, you also control the relative balance of these scores with the `alpha` parameter.\n", + "\n", + "The documents are ranked based on the hybrid score, which is computed as:\n", "\n", - " # Run the query\n", - " res = index.aggregate(req, query_params={'vector': query._vector})\n", + "```python\n", + "hybrid_score = (1 - alpha) * text_score + alpha * vector_similarity\n", + "```\n", "\n", - " # Perform output parsing\n", - " if res:\n", - " movies = [make_dict(row) for row in convert_bytes(res.rows)]\n", - " return [(movie[\"title\"], movie[\"hybrid_score\"]) for movie in movies]" + "Try changing the `text_scorer` and `alpha` parameters in the query below to see how results may change.\n" ] }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[('The Incredibles', '2.22261780363'),\n", - " ('Explosive Pursuit', '0.466046193242'),\n", - " ('Mad Max: Fury Road', '0.455657145381'),\n", - " ('The Dark Knight', '0.452280691266'),\n", - " ('Despicable Me', '0.448826736212'),\n", - " ('Inception', '0.434456560016')]" + 
# Hybrid query that scores the lexical side with TF-IDF and leans on it
# more heavily than on the vector side.
tfidf_query = HybridQuery(
    # lexical side
    text=user_query,
    text_field_name="description",
    text_scorer="TFIDF",  # alternatives: TFIDF.DOCNORM, BM25, DISMAX, DOCSCORE, BM25STD
    stopwords=None,  # do not strip any stopwords from the query text
    # semantic side
    vector=vector,
    vector_field_name="description_vector",
    # alpha is the weight on the vector similarity; 0.25 leaves 0.75 for the text score
    alpha=0.25,
    return_fields=["title", "description"],
)

results = index.query(tfidf_query)
# Helpers that wrap the RedisVL query classes so the fusion code below stays short
from redisvl.query import TextQuery, VectorQuery


def make_vector_query(user_query: str, num_results: int, filters=None) -> VectorQuery:
    """Build a vector query over ``description_vector`` for a user query string.

    Args:
        user_query: Query text; embedded with the notebook-level ``model``
            as a float32 byte buffer.
        num_results: Forwarded to ``VectorQuery`` as ``num_results``.
        filters: Optional filter; applied with ``set_filter`` only when truthy.

    Returns:
        A ``VectorQuery`` that returns the ``title`` and ``description`` fields.
    """
    vq = VectorQuery(
        vector=model.embed(user_query, as_buffer=True, dtype="float32"),
        vector_field_name="description_vector",
        num_results=num_results,
        return_fields=["title", "description"],
    )
    # Nothing to attach: hand back the unfiltered query.
    if not filters:
        return vq
    vq.set_filter(filters)
    return vq


def make_ft_query(text_field: str, user_query: str, num_results: int) -> TextQuery:
    """Build a BM25-scored full-text query for a user query string.

    Args:
        text_field: Name of the text field to search.
        user_query: Query text passed to ``TextQuery`` as ``text``.
        num_results: Forwarded to ``TextQuery`` as ``num_results``.

    Returns:
        A ``TextQuery`` that returns the ``title`` and ``description`` fields.
    """
    ft_query = TextQuery(
        text=user_query,
        text_field_name=text_field,
        text_scorer="BM25",
        num_results=num_results,
        return_fields=["title", "description"],
    )
    return ft_query
"execution_count": 22, "metadata": {}, "outputs": [], "source": [ + "from typing import List, Dict, Any\n", + "\n", + "\n", "def weighted_rrf(\n", " user_query: str,\n", " alpha: float = 0.5,\n", @@ -784,21 +720,21 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[('The Incredibles', 0.01639344262295082),\n", - " ('Explosive Pursuit', 0.01575682382133995),\n", - " ('Mad Max: Fury Road', 0.015079365079365078),\n", - " ('Fast & Furious 9', 0.014925373134328358),\n", - " ('Finding Nemo', 0.01488095238095238),\n", - " ('The Dark Knight', 0.014854753521126762)]" + "[('Explosive Pursuit', 0.01639344262295082),\n", + " ('The Dark Knight', 0.015873015873015872),\n", + " ('Despicable Me', 0.015625),\n", + " ('The Incredibles', 0.015417457305502846),\n", + " ('Skyfall', 0.0152073732718894),\n", + " ('Finding Nemo', 0.014242424242424244)]" ] }, - "execution_count": 77, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -817,21 +753,21 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[('The Incredibles', 0.01639344262295082),\n", - " ('Explosive Pursuit', 0.015905707196029777),\n", - " ('Mad Max: Fury Road', 0.015396825396825395),\n", - " ('The Dark Knight', 0.015162852112676057),\n", - " ('Fast & Furious 9', 0.014925373134328356),\n", - " ('Inception', 0.014715649647156496)]" + "[('Explosive Pursuit', 0.01639344262295082),\n", + " ('The Dark Knight', 0.015873015873015872),\n", + " ('The Incredibles', 0.015702087286527514),\n", + " ('Despicable Me', 0.015625),\n", + " ('Skyfall', 0.014838709677419354),\n", + " ('Finding Nemo', 0.01387878787878788)]" ] }, - "execution_count": 78, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -851,7 +787,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 25, "metadata": {}, "outputs": 
def hybrid_query(text: str, alpha: float, num_results: int) -> List[tuple]:
    """Run a server-side hybrid (BM25 text + vector) search and return title/score pairs.

    Args:
        text: User query. Used as the full-text query and embedded with the
            notebook-level ``model`` for the vector side.
        alpha: Weight forwarded to ``HybridQuery``. The notebook computes the
            hybrid score as ``(1 - alpha) * text_score + alpha * vector_similarity``.
        num_results: Maximum number of results to request from the index.

    Returns:
        A list of ``(title, hybrid_score)`` tuples in the order returned by
        ``index.query``. Scores are passed through untouched (the recorded
        outputs show them as strings).
    """
    query = HybridQuery(
        text=text,
        text_field_name="description",
        vector=model.embed(text, as_buffer=True, dtype="float32"),
        vector_field_name="description_vector",
        text_scorer="BM25",
        stopwords="english",
        alpha=alpha,
        # Forward the caller's limit; otherwise HybridQuery falls back to its
        # own default result count and the `num_results` argument is ignored.
        num_results=num_results,
        return_fields=["title", "hybrid_score"],
    )

    results = index.query(query)

    return [(movie["title"], movie["hybrid_score"]) for movie in results]
-993,12 +959,12 @@ "for i, user_query in enumerate(movie_user_queries):\n", " rankings.at[i, \"hf-cross-encoder\"] = rerank(user_query, num_results=4)\n", " rankings.at[i, \"rrf\"] = weighted_rrf(user_query, alpha=0.7, num_results=4)\n", - " rankings.at[i, \"linear-combo-bm25-cosine\"] = linear_combo(user_query, alpha=0.7, num_results=4)" + " rankings.at[i, \"linear-combo-bm25-cosine\"] = hybrid_query(user_query, alpha=0.7, num_results=4)" ] }, { "cell_type": "code", - "execution_count": 84, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -1032,37 +998,37 @@ " \n", " 0\n", " I'm in the mood for a high-rated action movie ...\n", - " [(Explosive Pursuit, -11.244140625), (Mad Max:...\n", - " [(The Incredibles, 0.016029143897996357), (Mad...\n", - " [(The Incredibles, 1.09860771359), (Despicable...\n", + " [(Mad Max: Fury Road, -11.244140625), (Toy Sto...\n", + " [(The Incredibles, 0.016029143897996357), (Toy...\n", + " [(The Incredibles, 0.55239223002), (Toy Story,...\n", " \n", " \n", " 1\n", " What's a funny animated film about unlikely fr...\n", " [(Despicable Me, -10.441909790039062), (The In...\n", - " [(Black Widow, 0.015625), (The Incredibles, 0....\n", - " [(The Incredibles, 0.454752063751), (Despicabl...\n", + " [(Monsters, Inc., 0.015524093392945852), (Mada...\n", + " [(The Incredibles, 0.45475204289), (Despicable...\n", " \n", " \n", " 2\n", " Any movies featuring superheroes or extraordin...\n", - " [(The Incredibles, -3.6648082733154297), (The ...\n", - " [(The Incredibles, 0.01639344262295082), (Mad ...\n", - " [(The Incredibles, 1.05887192239), (The Avenge...\n", + " [(The Incredibles, -3.6648073196411133), (The ...\n", + " [(The Incredibles, 0.01639344262295082), (The ...\n", + " [(The Incredibles, 0.603234915587), (The Aveng...\n", " \n", " \n", " 3\n", " I want to watch a thrilling movie with spies o...\n", - " [(The Incredibles, -10.843631744384766), (Expl...\n", - " [(Skyfall, 0.01631411951348493), (Explosive Pu...\n", + " [(Inception, 
-10.843631744384766), (The Incred...\n", + " [(Inception, 0.015524093392945852), (Skyfall, ...\n", " [(Skyfall, 0.443840536475), (Despicable Me, 0....\n", " \n", " \n", " 4\n", " Are there any comedies set in unusual location...\n", - " [(The Incredibles, -11.45376968383789), (Explo...\n", - " [(Madagascar, 0.015272878190495952), (Explosiv...\n", - " [(Madagascar, 0.442132198811), (Despicable Me,...\n", + " [(The Incredibles, -11.45376968383789), (Findi...\n", + " [(Finding Nemo, 0.015524093392945852), (Madaga...\n", + " [(Madagascar, 0.442132219672), (Despicable Me,...\n", " \n", " \n", "\n", @@ -1077,28 +1043,28 @@ "4 Are there any comedies set in unusual location... \n", "\n", " hf-cross-encoder \\\n", - "0 [(Explosive Pursuit, -11.244140625), (Mad Max:... \n", + "0 [(Mad Max: Fury Road, -11.244140625), (Toy Sto... \n", "1 [(Despicable Me, -10.441909790039062), (The In... \n", - "2 [(The Incredibles, -3.6648082733154297), (The ... \n", - "3 [(The Incredibles, -10.843631744384766), (Expl... \n", - "4 [(The Incredibles, -11.45376968383789), (Explo... \n", + "2 [(The Incredibles, -3.6648073196411133), (The ... \n", + "3 [(Inception, -10.843631744384766), (The Incred... \n", + "4 [(The Incredibles, -11.45376968383789), (Findi... \n", "\n", " rrf \\\n", - "0 [(The Incredibles, 0.016029143897996357), (Mad... \n", - "1 [(Black Widow, 0.015625), (The Incredibles, 0.... \n", - "2 [(The Incredibles, 0.01639344262295082), (Mad ... \n", - "3 [(Skyfall, 0.01631411951348493), (Explosive Pu... \n", - "4 [(Madagascar, 0.015272878190495952), (Explosiv... \n", + "0 [(The Incredibles, 0.016029143897996357), (Toy... \n", + "1 [(Monsters, Inc., 0.015524093392945852), (Mada... \n", + "2 [(The Incredibles, 0.01639344262295082), (The ... \n", + "3 [(Inception, 0.015524093392945852), (Skyfall, ... \n", + "4 [(Finding Nemo, 0.015524093392945852), (Madaga... \n", "\n", " linear-combo-bm25-cosine \n", - "0 [(The Incredibles, 1.09860771359), (Despicable... 
\n", - "1 [(The Incredibles, 0.454752063751), (Despicabl... \n", - "2 [(The Incredibles, 1.05887192239), (The Avenge... \n", + "0 [(The Incredibles, 0.55239223002), (Toy Story,... \n", + "1 [(The Incredibles, 0.45475204289), (Despicable... \n", + "2 [(The Incredibles, 0.603234915587), (The Aveng... \n", "3 [(Skyfall, 0.443840536475), (Despicable Me, 0.... \n", - "4 [(Madagascar, 0.442132198811), (Despicable Me,... " + "4 [(Madagascar, 0.442132219672), (Despicable Me,... " ] }, - "execution_count": 84, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -1109,20 +1075,20 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['Show me movies set in dystopian or post-apocalyptic worlds',\n", - " list([('Mad Max: Fury Road', -3.4906280040740967), ('Despicable Me', -11.051526069641113), ('The Incredibles', -11.315656661987305), ('Black Widow', -10.880638122558594)]),\n", - " list([('Mad Max: Fury Road', 0.01602086438152012), ('Skyfall', 0.015607940446650124), ('The Incredibles', 0.015237691001697792), ('Black Widow', 0.01513526119402985)]),\n", - " list([('Mad Max: Fury Road', '0.452238592505'), ('The Incredibles', '0.445061504841'), ('Madagascar', '0.419015598297'), ('Despicable Me', '0.416218388081')])],\n", + " list([('Mad Max: Fury Road', -3.4906270503997803), ('Despicable Me', -11.051526069641113), ('The Incredibles', -11.315656661987305), ('Finding Nemo', -10.880638122558594)]),\n", + " list([('The Incredibles', 0.01620835536753041), ('Finding Nemo', 0.013813068651778329), ('Mad Max: Fury Road', 0.011475409836065573), ('Madagascar', 0.01111111111111111)]),\n", + " list([('Mad Max: Fury Road', '0.452238571644'), ('The Incredibles', '0.445061463118'), ('Madagascar', '0.41901564002'), ('Despicable Me', '0.416218408942'), ('Skyfall', '0.411504244804'), ('The Avengers', '0.41121032536'), ('Black Widow', '0.410578364134'), ('The Lego Movie', 
'0.408463662863'), ('Monsters, Inc.', '0.392220926285'), ('Shrek', '0.390464815497')])],\n", " dtype=object)" ] }, - "execution_count": 85, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -1142,11 +1108,16 @@ "- How to implement hybrid search queries using the Redis aggregation API\n", "- How to perform client-side fusion and reranking techniques" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "redis-ai-res", "language": "python", "name": "python3" }, @@ -1160,7 +1131,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.13.2" } }, "nbformat": 4,