|
70 | 70 | }, |
71 | 71 | { |
72 | 72 | "cell_type": "code", |
73 | | - "execution_count": null, |
| 73 | + "execution_count": 2, |
74 | 74 | "id": "dfa92a08", |
75 | 75 | "metadata": {}, |
76 | 76 | "outputs": [], |
|
91 | 91 | }, |
92 | 92 | { |
93 | 93 | "cell_type": "code", |
94 | | - "execution_count": null, |
| 94 | + "execution_count": 2, |
95 | 95 | "id": "c795913e", |
96 | 96 | "metadata": {}, |
97 | 97 | "outputs": [], |
98 | 98 | "source": [ |
99 | | - "embedding_model = VertexAIEmbeddings(model_name=\"textembedding-gecko@003\")" |
| 99 | + "embedding_model = VertexAIEmbeddings(model_name=\"text-embedding-005\")" |
100 | 100 | ] |
101 | 101 | }, |
102 | 102 | { |
|
722 | 722 | "cell_type": "markdown", |
723 | 723 | "id": "31222b03", |
724 | 724 | "metadata": {}, |
725 | | - "source": [] |
| 725 | + "source": [ |
| 726 | + "## Hybrid Search" |
| 727 | + ] |
| 728 | + }, |
| 729 | + { |
| 730 | + "cell_type": "markdown", |
| 731 | + "id": "b8a308f2", |
| 732 | + "metadata": {}, |
| 733 | + "source": [ |
| 734 | + "Vector Search supports hybrid search, a popular architecture pattern in information retrieval (IR) that combines both semantic search and keyword search (also called token-based search). With hybrid search, developers can take advantage of the best of the two approaches, effectively providing higher search quality.\n", |
| 735 | + "Click [here](https://cloud.google.com/vertex-ai/docs/vector-search/about-hybrid-search) to learn more.\n", |
| 736 | + "\n", |
| 737 | + "To use hybrid search, we need to fit a sparse embedding vectorizer and compute both the dense and the sparse embeddings ourselves, outside of the Vector Search integration, before passing them to the vector store.\n", |
| 738 | + "One example of a sparse embedding vectorizer is scikit-learn's `TfidfVectorizer`, but other techniques, such as BM25, can be used as well." |
| 739 | + ] |
| 740 | + }, |
| 741 | + { |
| 742 | + "cell_type": "code", |
| 743 | + "execution_count": 16, |
| 744 | + "id": "e319402d", |
| 745 | + "metadata": {}, |
| 746 | + "outputs": [], |
| 747 | + "source": [ |
| 748 | + "# Define some sample data\n", |
| 749 | + "texts = [\n", |
| 750 | + " \"The cat sat on\",\n", |
| 751 | + " \"the mat.\",\n", |
| 752 | + " \"I like to\",\n", |
| 753 | + " \"eat pizza for\",\n", |
| 754 | + " \"dinner.\",\n", |
| 755 | + " \"The sun sets\",\n", |
| 756 | + " \"in the west.\",\n", |
| 757 | + "]\n", |
| 758 | + "\n", |
| 759 | + "# optional IDs\n", |
| 760 | + "ids = [\"i_\" + str(i + 1) for i in range(len(texts))]\n", |
| 761 | + "\n", |
| 762 | + "# optional metadata\n", |
| 763 | + "metadatas = [{\"my_metadata\": i} for i in range(len(texts))]" |
| 764 | + ] |
| 765 | + }, |
| 766 | + { |
| 767 | + "cell_type": "code", |
| 768 | + "execution_count": null, |
| 769 | + "id": "14efefc1", |
| 770 | + "metadata": {}, |
| 771 | + "outputs": [], |
| 772 | + "source": [ |
| 773 | + "from sklearn.feature_extraction.text import TfidfVectorizer\n", |
| 774 | + "\n", |
| 775 | + "# Fit the TF-IDF vectorizer (this is usually done on a very large corpus so that the word statistics generalize well to new data)\n", |
| 776 | + "vectorizer = TfidfVectorizer()\n", |
| 777 | + "vectorizer.fit(texts)" |
| 778 | + ] |
| 779 | + }, |
| 780 | + { |
| 781 | + "cell_type": "code", |
| 782 | + "execution_count": null, |
| 783 | + "id": "2c7206c2", |
| 784 | + "metadata": {}, |
| 785 | + "outputs": [], |
| 786 | + "source": [ |
| 787 | + "# Utility function to transform text into a TF-IDF Sparse Vector\n", |
| 788 | + "def get_sparse_embedding(tfidf_vectorizer, text):\n", |
| 789 | + " tfidf_vector = tfidf_vectorizer.transform([text])\n", |
| 790 | + " values = []\n", |
| 791 | + " dims = []\n", |
| 792 | + " for i, tfidf_value in enumerate(tfidf_vector.data):\n", |
| 793 | + " values.append(float(tfidf_value))\n", |
| 794 | + " dims.append(int(tfidf_vector.indices[i]))\n", |
| 795 | + " return {\"values\": values, \"dimensions\": dims}" |
| 796 | + ] |
| 797 | + }, |
| 798 | + { |
| 799 | + "cell_type": "code", |
| 800 | + "execution_count": 19, |
| 801 | + "id": "0dc5b782", |
| 802 | + "metadata": {}, |
| 803 | + "outputs": [], |
| 804 | + "source": [ |
| 805 | + "# semantic (dense) embeddings\n", |
| 806 | + "embeddings = embedding_model.embed_documents(texts)\n", |
| 807 | + "# TF-IDF (sparse) embeddings\n", |
| 808 | + "sparse_embeddings = [get_sparse_embedding(vectorizer, x) for x in texts]" |
| 809 | + ] |
| 810 | + }, |
| 811 | + { |
| 812 | + "cell_type": "code", |
| 813 | + "execution_count": null, |
| 814 | + "id": "3a353679", |
| 815 | + "metadata": {}, |
| 816 | + "outputs": [], |
| 817 | + "source": [ |
| 818 | + "sparse_embeddings[0]" |
| 819 | + ] |
| 820 | + }, |
| 821 | + { |
| 822 | + "cell_type": "code", |
| 823 | + "execution_count": null, |
| 824 | + "id": "2623cad9", |
| 825 | + "metadata": {}, |
| 826 | + "outputs": [], |
| 827 | + "source": [ |
| 828 | + "# Add the dense and sparse embeddings to Vector Search\n", |
| 829 | + "\n", |
| 830 | + "vector_store.add_texts_with_embeddings(\n", |
| 831 | + " texts=texts,\n", |
| 832 | + " embeddings=embeddings,\n", |
| 833 | + " sparse_embeddings=sparse_embeddings,\n", |
| 834 | + " ids=ids,\n", |
| 835 | + " metadatas=metadatas,\n", |
| 836 | + ")" |
| 837 | + ] |
| 838 | + }, |
| 839 | + { |
| 840 | + "cell_type": "code", |
| 841 | + "execution_count": null, |
| 842 | + "id": "29885e38", |
| 843 | + "metadata": {}, |
| 844 | + "outputs": [], |
| 845 | + "source": [ |
| 846 | + "# Run hybrid search\n", |
| 847 | + "query = \"the cat\"\n", |
| 848 | + "embedding = embedding_model.embed_query(query)\n", |
| 849 | + "sparse_embedding = get_sparse_embedding(vectorizer, query)\n", |
| 850 | + "\n", |
| 851 | + "vector_store.similarity_search_by_vector_with_score(\n", |
| 852 | + " embedding=embedding,\n", |
| 853 | + " sparse_embedding=sparse_embedding,\n", |
| 854 | + " k=5,\n", |
| 855 | + " rrf_ranking_alpha=0.7, # 0.7 weight to dense and 0.3 weight to sparse\n", |
| 856 | + ")" |
| 857 | + ] |
726 | 858 | } |
727 | 859 | ], |
728 | 860 | "metadata": { |
|
733 | 865 | "uri": "gcr.io/deeplearning-platform-release/base-cpu:m107" |
734 | 866 | }, |
735 | 867 | "kernelspec": { |
736 | | - "display_name": "Python 3 (ipykernel)", |
| 868 | + "display_name": "langchain-google-community-3Os9yvMd-py3.10", |
737 | 869 | "language": "python", |
738 | 870 | "name": "python3" |
739 | 871 | }, |
|
747 | 879 | "name": "python", |
748 | 880 | "nbconvert_exporter": "python", |
749 | 881 | "pygments_lexer": "ipython3", |
750 | | - "version": "3.11.6" |
| 882 | + "version": "3.10.14" |
751 | 883 | } |
752 | 884 | }, |
753 | 885 | "nbformat": 4, |
|
0 commit comments