|  | 
| 23 | 23 | ) | 
| 24 | 24 | 
 | 
| 25 | 25 | from .blobmanager import BlobManager | 
| 26 |  | -from .embeddings import OpenAIEmbeddings | 
|  | 26 | +from .embeddings import AzureOpenAIEmbeddingService, OpenAIEmbeddings | 
| 27 | 27 | from .listfilestrategy import File | 
| 28 | 28 | from .strategy import SearchInfo | 
| 29 | 29 | from .textsplitter import SplitPage | 
| @@ -67,149 +67,190 @@ def __init__( | 
| 67 | 67 |         self.search_images = search_images | 
| 68 | 68 | 
 | 
| 69 | 69 |     async def create_index(self, vectorizers: Optional[List[VectorSearchVectorizer]] = None): | 
| 70 |  | -        logger.info("Ensuring search index %s exists", self.search_info.index_name) | 
|  | 70 | +        logger.info("Checking whether search index %s exists...", self.search_info.index_name) | 
| 71 | 71 | 
 | 
| 72 | 72 |         async with self.search_info.create_search_index_client() as search_index_client: | 
| 73 |  | -            fields = [ | 
| 74 |  | -                ( | 
| 75 |  | -                    SimpleField(name="id", type="Edm.String", key=True) | 
| 76 |  | -                    if not self.use_int_vectorization | 
| 77 |  | -                    else SearchField( | 
| 78 |  | -                        name="id", | 
|  | 73 | + | 
|  | 74 | +            if self.search_info.index_name not in [name async for name in search_index_client.list_index_names()]: | 
|  | 75 | +                logger.info("Creating new search index %s", self.search_info.index_name) | 
|  | 76 | +                fields = [ | 
|  | 77 | +                    ( | 
|  | 78 | +                        SimpleField(name="id", type="Edm.String", key=True) | 
|  | 79 | +                        if not self.use_int_vectorization | 
|  | 80 | +                        else SearchField( | 
|  | 81 | +                            name="id", | 
|  | 82 | +                            type="Edm.String", | 
|  | 83 | +                            key=True, | 
|  | 84 | +                            sortable=True, | 
|  | 85 | +                            filterable=True, | 
|  | 86 | +                            facetable=True, | 
|  | 87 | +                            analyzer_name="keyword", | 
|  | 88 | +                        ) | 
|  | 89 | +                    ), | 
|  | 90 | +                    SearchableField( | 
|  | 91 | +                        name="content", | 
| 79 | 92 |                         type="Edm.String", | 
| 80 |  | -                        key=True, | 
| 81 |  | -                        sortable=True, | 
| 82 |  | -                        filterable=True, | 
| 83 |  | -                        facetable=True, | 
| 84 |  | -                        analyzer_name="keyword", | 
| 85 |  | -                    ) | 
| 86 |  | -                ), | 
| 87 |  | -                SearchableField( | 
| 88 |  | -                    name="content", | 
| 89 |  | -                    type="Edm.String", | 
| 90 |  | -                    analyzer_name=self.search_analyzer_name, | 
| 91 |  | -                ), | 
| 92 |  | -                SearchField( | 
| 93 |  | -                    name="embedding", | 
| 94 |  | -                    type=SearchFieldDataType.Collection(SearchFieldDataType.Single), | 
| 95 |  | -                    hidden=False, | 
| 96 |  | -                    searchable=True, | 
| 97 |  | -                    filterable=False, | 
| 98 |  | -                    sortable=False, | 
| 99 |  | -                    facetable=False, | 
| 100 |  | -                    vector_search_dimensions=self.embedding_dimensions, | 
| 101 |  | -                    vector_search_profile_name="embedding_config", | 
| 102 |  | -                ), | 
| 103 |  | -                SimpleField(name="category", type="Edm.String", filterable=True, facetable=True), | 
| 104 |  | -                SimpleField( | 
| 105 |  | -                    name="sourcepage", | 
| 106 |  | -                    type="Edm.String", | 
| 107 |  | -                    filterable=True, | 
| 108 |  | -                    facetable=True, | 
| 109 |  | -                ), | 
| 110 |  | -                SimpleField( | 
| 111 |  | -                    name="sourcefile", | 
| 112 |  | -                    type="Edm.String", | 
| 113 |  | -                    filterable=True, | 
| 114 |  | -                    facetable=True, | 
| 115 |  | -                ), | 
| 116 |  | -                SimpleField( | 
| 117 |  | -                    name="storageUrl", | 
| 118 |  | -                    type="Edm.String", | 
| 119 |  | -                    filterable=True, | 
| 120 |  | -                    facetable=False, | 
| 121 |  | -                ), | 
| 122 |  | -            ] | 
| 123 |  | -            if self.use_acls: | 
| 124 |  | -                fields.append( | 
| 125 |  | -                    SimpleField( | 
| 126 |  | -                        name="oids", | 
| 127 |  | -                        type=SearchFieldDataType.Collection(SearchFieldDataType.String), | 
| 128 |  | -                        filterable=True, | 
| 129 |  | -                    ) | 
| 130 |  | -                ) | 
| 131 |  | -                fields.append( | 
| 132 |  | -                    SimpleField( | 
| 133 |  | -                        name="groups", | 
| 134 |  | -                        type=SearchFieldDataType.Collection(SearchFieldDataType.String), | 
| 135 |  | -                        filterable=True, | 
| 136 |  | -                    ) | 
| 137 |  | -                ) | 
| 138 |  | -            if self.use_int_vectorization: | 
| 139 |  | -                fields.append(SearchableField(name="parent_id", type="Edm.String", filterable=True)) | 
| 140 |  | -            if self.search_images: | 
| 141 |  | -                fields.append( | 
|  | 93 | +                        analyzer_name=self.search_analyzer_name, | 
|  | 94 | +                    ), | 
| 142 | 95 |                     SearchField( | 
| 143 |  | -                        name="imageEmbedding", | 
|  | 96 | +                        name="embedding", | 
| 144 | 97 |                         type=SearchFieldDataType.Collection(SearchFieldDataType.Single), | 
| 145 | 98 |                         hidden=False, | 
| 146 | 99 |                         searchable=True, | 
| 147 | 100 |                         filterable=False, | 
| 148 | 101 |                         sortable=False, | 
| 149 | 102 |                         facetable=False, | 
| 150 |  | -                        vector_search_dimensions=1024, | 
|  | 103 | +                        vector_search_dimensions=self.embedding_dimensions, | 
| 151 | 104 |                         vector_search_profile_name="embedding_config", | 
| 152 | 105 |                     ), | 
| 153 |  | -                ) | 
| 154 |  | - | 
| 155 |  | -            index = SearchIndex( | 
| 156 |  | -                name=self.search_info.index_name, | 
| 157 |  | -                fields=fields, | 
| 158 |  | -                semantic_search=SemanticSearch( | 
| 159 |  | -                    configurations=[ | 
| 160 |  | -                        SemanticConfiguration( | 
| 161 |  | -                            name="default", | 
| 162 |  | -                            prioritized_fields=SemanticPrioritizedFields( | 
| 163 |  | -                                title_field=None, content_fields=[SemanticField(field_name="content")] | 
| 164 |  | -                            ), | 
|  | 106 | +                    SimpleField(name="category", type="Edm.String", filterable=True, facetable=True), | 
|  | 107 | +                    SimpleField( | 
|  | 108 | +                        name="sourcepage", | 
|  | 109 | +                        type="Edm.String", | 
|  | 110 | +                        filterable=True, | 
|  | 111 | +                        facetable=True, | 
|  | 112 | +                    ), | 
|  | 113 | +                    SimpleField( | 
|  | 114 | +                        name="sourcefile", | 
|  | 115 | +                        type="Edm.String", | 
|  | 116 | +                        filterable=True, | 
|  | 117 | +                        facetable=True, | 
|  | 118 | +                    ), | 
|  | 119 | +                    SimpleField( | 
|  | 120 | +                        name="storageUrl", | 
|  | 121 | +                        type="Edm.String", | 
|  | 122 | +                        filterable=True, | 
|  | 123 | +                        facetable=False, | 
|  | 124 | +                    ), | 
|  | 125 | +                ] | 
|  | 126 | +                if self.use_acls: | 
|  | 127 | +                    fields.append( | 
|  | 128 | +                        SimpleField( | 
|  | 129 | +                            name="oids", | 
|  | 130 | +                            type=SearchFieldDataType.Collection(SearchFieldDataType.String), | 
|  | 131 | +                            filterable=True, | 
| 165 | 132 |                         ) | 
| 166 |  | -                    ] | 
| 167 |  | -                ), | 
| 168 |  | -                vector_search=VectorSearch( | 
| 169 |  | -                    algorithms=[ | 
| 170 |  | -                        HnswAlgorithmConfiguration( | 
| 171 |  | -                            name="hnsw_config", | 
| 172 |  | -                            parameters=HnswParameters(metric="cosine"), | 
|  | 133 | +                    ) | 
|  | 134 | +                    fields.append( | 
|  | 135 | +                        SimpleField( | 
|  | 136 | +                            name="groups", | 
|  | 137 | +                            type=SearchFieldDataType.Collection(SearchFieldDataType.String), | 
|  | 138 | +                            filterable=True, | 
| 173 | 139 |                         ) | 
| 174 |  | -                    ], | 
| 175 |  | -                    profiles=[ | 
| 176 |  | -                        VectorSearchProfile( | 
| 177 |  | -                            name="embedding_config", | 
| 178 |  | -                            algorithm_configuration_name="hnsw_config", | 
| 179 |  | -                            vectorizer_name=( | 
| 180 |  | -                                f"{self.search_info.index_name}-vectorizer" if self.use_int_vectorization else None | 
| 181 |  | -                            ), | 
|  | 140 | +                    ) | 
|  | 141 | +                if self.use_int_vectorization: | 
|  | 142 | +                    logger.info("Including parent_id field in new index %s", self.search_info.index_name) | 
|  | 143 | +                    fields.append(SearchableField(name="parent_id", type="Edm.String", filterable=True)) | 
|  | 144 | +                if self.search_images: | 
|  | 145 | +                    logger.info("Including imageEmbedding field in new index %s", self.search_info.index_name) | 
|  | 146 | +                    fields.append( | 
|  | 147 | +                        SearchField( | 
|  | 148 | +                            name="imageEmbedding", | 
|  | 149 | +                            type=SearchFieldDataType.Collection(SearchFieldDataType.Single), | 
|  | 150 | +                            hidden=False, | 
|  | 151 | +                            searchable=True, | 
|  | 152 | +                            filterable=False, | 
|  | 153 | +                            sortable=False, | 
|  | 154 | +                            facetable=False, | 
|  | 155 | +                            vector_search_dimensions=1024, | 
|  | 156 | +                            vector_search_profile_name="embedding_config", | 
| 182 | 157 |                         ), | 
| 183 |  | -                    ], | 
| 184 |  | -                    vectorizers=[ | 
|  | 158 | +                    ) | 
|  | 159 | + | 
|  | 160 | +                vectorizers = [] | 
|  | 161 | +                if self.embeddings and isinstance(self.embeddings, AzureOpenAIEmbeddingService): | 
|  | 162 | +                    logger.info( | 
|  | 163 | +                        "Including vectorizer for search index %s, using Azure OpenAI service %s", | 
|  | 164 | +                        self.search_info.index_name, | 
|  | 165 | +                        self.embeddings.open_ai_service, | 
|  | 166 | +                    ) | 
|  | 167 | +                    vectorizers.append( | 
| 185 | 168 |                         AzureOpenAIVectorizer( | 
| 186 | 169 |                             vectorizer_name=f"{self.search_info.index_name}-vectorizer", | 
| 187 | 170 |                             parameters=AzureOpenAIVectorizerParameters( | 
| 188 |  | -                                resource_url=f"https://{self.embeddings.open_ai_service}.openai.azure.com", | 
|  | 171 | +                                resource_url=self.embeddings.open_ai_endpoint, | 
| 189 | 172 |                                 deployment_name=self.embeddings.open_ai_deployment, | 
| 190 | 173 |                                 model_name=self.embeddings.open_ai_model_name, | 
| 191 | 174 |                             ), | 
| 192 |  | -                        ), | 
| 193 |  | -                    ], | 
| 194 |  | -                ), | 
| 195 |  | -            ) | 
| 196 |  | -            if self.search_info.index_name not in [name async for name in search_index_client.list_index_names()]: | 
| 197 |  | -                logger.info("Creating %s search index", self.search_info.index_name) | 
|  | 175 | +                        ) | 
|  | 176 | +                    ) | 
|  | 177 | +                else: | 
|  | 178 | +                    logger.info( | 
|  | 179 | +                        "Not including vectorizer for search index %s, no Azure OpenAI service found", | 
|  | 180 | +                        self.search_info.index_name, | 
|  | 181 | +                    ) | 
|  | 182 | + | 
|  | 183 | +                index = SearchIndex( | 
|  | 184 | +                    name=self.search_info.index_name, | 
|  | 185 | +                    fields=fields, | 
|  | 186 | +                    semantic_search=SemanticSearch( | 
|  | 187 | +                        configurations=[ | 
|  | 188 | +                            SemanticConfiguration( | 
|  | 189 | +                                name="default", | 
|  | 190 | +                                prioritized_fields=SemanticPrioritizedFields( | 
|  | 191 | +                                    title_field=None, content_fields=[SemanticField(field_name="content")] | 
|  | 192 | +                                ), | 
|  | 193 | +                            ) | 
|  | 194 | +                        ] | 
|  | 195 | +                    ), | 
|  | 196 | +                    vector_search=VectorSearch( | 
|  | 197 | +                        algorithms=[ | 
|  | 198 | +                            HnswAlgorithmConfiguration( | 
|  | 199 | +                                name="hnsw_config", | 
|  | 200 | +                                parameters=HnswParameters(metric="cosine"), | 
|  | 201 | +                            ) | 
|  | 202 | +                        ], | 
|  | 203 | +                        profiles=[ | 
|  | 204 | +                            VectorSearchProfile( | 
|  | 205 | +                                name="embedding_config", | 
|  | 206 | +                                algorithm_configuration_name="hnsw_config", | 
|  | 207 | +                                vectorizer_name=( | 
|  | 208 | +                                    f"{self.search_info.index_name}-vectorizer" if self.use_int_vectorization else None | 
|  | 209 | +                                ), | 
|  | 210 | +                            ), | 
|  | 211 | +                        ], | 
|  | 212 | +                        vectorizers=vectorizers, | 
|  | 213 | +                    ), | 
|  | 214 | +                ) | 
|  | 215 | + | 
| 198 | 216 |                 await search_index_client.create_index(index) | 
| 199 | 217 |             else: | 
| 200 | 218 |                 logger.info("Search index %s already exists", self.search_info.index_name) | 
| 201 |  | -                index_definition = await search_index_client.get_index(self.search_info.index_name) | 
| 202 |  | -                if not any(field.name == "storageUrl" for field in index_definition.fields): | 
|  | 219 | +                existing_index = await search_index_client.get_index(self.search_info.index_name) | 
|  | 220 | +                if not any(field.name == "storageUrl" for field in existing_index.fields): | 
| 203 | 221 |                     logger.info("Adding storageUrl field to index %s", self.search_info.index_name) | 
| 204 |  | -                    index_definition.fields.append( | 
|  | 222 | +                    existing_index.fields.append( | 
| 205 | 223 |                         SimpleField( | 
| 206 | 224 |                             name="storageUrl", | 
| 207 | 225 |                             type="Edm.String", | 
| 208 | 226 |                             filterable=True, | 
| 209 | 227 |                             facetable=False, | 
| 210 | 228 |                         ), | 
| 211 | 229 |                     ) | 
| 212 |  | -                    await search_index_client.create_or_update_index(index_definition) | 
|  | 230 | +                    await search_index_client.create_or_update_index(existing_index) | 
|  | 231 | + | 
|  | 232 | +                if existing_index.vector_search is not None and ( | 
|  | 233 | +                    existing_index.vector_search.vectorizers is None | 
|  | 234 | +                    or len(existing_index.vector_search.vectorizers) == 0 | 
|  | 235 | +                ): | 
|  | 236 | +                    if self.embeddings is not None: | 
|  | 237 | +                        logger.info("Adding vectorizer to search index %s", self.search_info.index_name) | 
|  | 238 | +                        existing_index.vector_search.vectorizers = [ | 
|  | 239 | +                            AzureOpenAIVectorizer( | 
|  | 240 | +                                vectorizer_name=f"{self.search_info.index_name}-vectorizer", | 
|  | 241 | +                                parameters=AzureOpenAIVectorizerParameters( | 
|  | 242 | +                                    resource_url=self.embeddings.open_ai_endpoint, | 
|  | 243 | +                                    deployment_name=self.embeddings.open_ai_deployment, | 
|  | 244 | +                                    model_name=self.embeddings.open_ai_model_name, | 
|  | 245 | +                                ), | 
|  | 246 | +                            ) | 
|  | 247 | +                        ] | 
|  | 248 | +                        await search_index_client.create_or_update_index(existing_index) | 
|  | 249 | +                    else: | 
|  | 250 | +                        logger.info( | 
|  | 251 | +                            "Can't add vectorizer to search index %s since embeddings service isn't defined", | 
|  | 252 | +                            self.search_info, | 
|  | 253 | +                        ) | 
| 213 | 254 | 
 | 
| 214 | 255 |     async def update_content( | 
| 215 | 256 |         self, sections: List[Section], image_embeddings: Optional[List[List[float]]] = None, url: Optional[str] = None | 
|  | 
0 commit comments