|
| 1 | +// Copyright (c) Microsoft Corporation. All rights reserved. |
| 2 | +// Licensed under the MIT License. |
| 3 | +package com.azure.search.documents; |
| 4 | + |
| 5 | +import com.azure.ai.openai.OpenAIClient; |
| 6 | +import com.azure.ai.openai.OpenAIClientBuilder; |
| 7 | +import com.azure.ai.openai.models.Embeddings; |
| 8 | +import com.azure.ai.openai.models.EmbeddingsOptions; |
| 9 | +import com.azure.core.credential.AzureKeyCredential; |
| 10 | +import com.azure.core.credential.KeyCredential; |
| 11 | +import com.azure.core.util.Configuration; |
| 12 | +import com.azure.core.util.Context; |
| 13 | +import com.azure.json.JsonReader; |
| 14 | +import com.azure.json.JsonSerializable; |
| 15 | +import com.azure.json.JsonToken; |
| 16 | +import com.azure.json.JsonWriter; |
| 17 | +import com.azure.search.documents.indexes.SearchIndexClient; |
| 18 | +import com.azure.search.documents.indexes.SearchIndexClientBuilder; |
| 19 | +import com.azure.search.documents.indexes.SearchableField; |
| 20 | +import com.azure.search.documents.indexes.SimpleField; |
| 21 | +import com.azure.search.documents.indexes.models.AzureOpenAIModelName; |
| 22 | +import com.azure.search.documents.indexes.models.AzureOpenAIParameters; |
| 23 | +import com.azure.search.documents.indexes.models.AzureOpenAIVectorizer; |
| 24 | +import com.azure.search.documents.indexes.models.HnswAlgorithmConfiguration; |
| 25 | +import com.azure.search.documents.indexes.models.IndexDocumentsBatch; |
| 26 | +import com.azure.search.documents.indexes.models.SearchField; |
| 27 | +import com.azure.search.documents.indexes.models.SearchFieldDataType; |
| 28 | +import com.azure.search.documents.indexes.models.SearchIndex; |
| 29 | +import com.azure.search.documents.indexes.models.VectorSearch; |
| 30 | +import com.azure.search.documents.indexes.models.VectorSearchProfile; |
| 31 | +import com.azure.search.documents.models.SearchOptions; |
| 32 | +import com.azure.search.documents.models.SearchResult; |
| 33 | +import com.azure.search.documents.models.VectorSearchOptions; |
| 34 | +import com.azure.search.documents.models.VectorizableTextQuery; |
| 35 | +import com.azure.search.documents.util.SearchPagedIterable; |
| 36 | + |
| 37 | +import java.io.IOException; |
| 38 | +import java.util.ArrayList; |
| 39 | +import java.util.Arrays; |
| 40 | +import java.util.Collections; |
| 41 | +import java.util.List; |
| 42 | + |
| 43 | +/** |
| 44 | + * This sample demonstrates how to create a vector fields index with reduced dimensions, upload reduced embeddings into |
| 45 | + * the index, and query the documents. To accomplish this, you can utilize Azure OpenAI embedding models: a smaller and |
| 46 | + * highly efficient {@code text-embedding-3-small} model or a larger and more powerful {@code text-embedding-3-large} |
| 47 | + * model. These models are significantly more efficient and require less storage space. |
| 48 | + */ |
| 49 | +public class VectorSearchReducedEmbeddings { |
| 50 | + public static void main(String[] args) { |
| 51 | + SearchIndex vectorIndex = defineVectorIndex(); |
| 52 | + |
| 53 | + // After creating an instance of the 'SearchIndex',we need to instantiate the 'SearchIndexClient' and call the |
| 54 | + // 'createIndex' method to create the search index. |
| 55 | + createVectorIndex(vectorIndex); |
| 56 | + |
| 57 | + // Now, we can instantiate the 'SearchClient' and upload the documents to the 'Hotel' index we created earlier. |
| 58 | + SearchClient searchClient = new SearchClientBuilder() |
| 59 | + .endpoint(Configuration.getGlobalConfiguration().get("SEARCH_ENDPOINT")) |
| 60 | + .indexName("hotel") |
| 61 | + .credential(new AzureKeyCredential(Configuration.getGlobalConfiguration().get("SEARCH_API_KEY"))) |
| 62 | + .buildClient(); |
| 63 | + |
| 64 | + // Next, we will create sample hotel documents. The vector field requires submitting text input to an embedding |
| 65 | + // model that converts human-readable text into a vector representation. To convert a text query string provided |
| 66 | + // by a user into a vector representation, your application should utilize an embedding library that offers this |
| 67 | + // functionality. |
| 68 | + indexDocuments(searchClient, getHotelDocuments()); |
| 69 | + |
| 70 | + // When using 'VectorizableTextQuery', the query for a vector field should be the text that will be vectorized |
| 71 | + // based on the 'Vectorizer' configuration in order to perform a vector search. |
| 72 | + // |
| 73 | + // Let's query the index and make sure everything works as implemented. You can also refer to |
| 74 | + // https://learn.microsoft.com/azure/search/vector-search-how-to-query for more information on querying vector |
| 75 | + // data. |
| 76 | + } |
| 77 | + |
| 78 | + /** |
| 79 | + * Let's consider the example of a 'Hotel'. First, we need to create an index for storing hotel information. In this |
| 80 | + * index, we will define vector fields called 'DescriptionVector' and 'CategoryVector'. To configure the vector |
| 81 | + * field, you need to provide the model dimensions, which indicate the size of the embeddings generated for this |
| 82 | + * field. You can pass reduced dimensions and the name of the vector search profile that specifies the algorithm |
| 83 | + * configuration, along with 'Vectorizer'. |
| 84 | + * <p> |
| 85 | + * In order to get the reduced embeddings using either the {@code text-embedding-3-small} or |
| 86 | + * {@code text-embedding-3-large} models, it is necessary to include the 'Dimensions' parameter. This parameter |
| 87 | + * configures the desired number of dimensions for the output vector. Therefore, for {@link AzureOpenAIVectorizer}, |
| 88 | + * we will retrieve the 'VectorSearchDimensions' that is already specified in the corresponding index field |
| 89 | + * definition. However, to ensure that dimensions are only passed along in the vectorizer for a model that supports |
| 90 | + * it, we need to pass a required property named 'ModelName'. This property enables the service to determine which |
| 91 | + * model we are using, and dimensions will only be passed along when it is for a known supported model name. |
| 92 | + * <p> |
| 93 | + * We will create an instace of {@code SearchIndex} and define 'Hotel' fields. |
| 94 | + */ |
| 95 | + public static SearchIndex defineVectorIndex() { |
| 96 | + String vectorSearchProfileName = "my-vector-profile"; |
| 97 | + String vectorSearchHnswConfig = "my-hnsw-vector-config"; |
| 98 | + String deploymentId = "my-text-embedding-3-small"; |
| 99 | + int modelDimensions = 256; // Here's the reduced model dimensions |
| 100 | + String indexName = "hotel"; |
| 101 | + return new SearchIndex(indexName).setFields(new SearchField("HotelId", SearchFieldDataType.STRING).setKey(true) |
| 102 | + .setFilterable(true) |
| 103 | + .setSortable(true) |
| 104 | + .setFacetable(true), new SearchField("HotelName", SearchFieldDataType.STRING).setSearchable(true) |
| 105 | + .setFilterable(true) |
| 106 | + .setSortable(true), |
| 107 | + new SearchField("Description", SearchFieldDataType.STRING).setSearchable(true).setFilterable(true), |
| 108 | + new SearchField("DescriptionVector", |
| 109 | + SearchFieldDataType.collection(SearchFieldDataType.SINGLE)).setSearchable(true) |
| 110 | + .setFilterable(true) |
| 111 | + .setVectorSearchDimensions(modelDimensions) |
| 112 | + .setVectorSearchProfileName(vectorSearchProfileName), |
| 113 | + new SearchField("Category", SearchFieldDataType.STRING).setSearchable(true) |
| 114 | + .setFilterable(true) |
| 115 | + .setSortable(true) |
| 116 | + .setFacetable(true), |
| 117 | + new SearchField("CategoryVector", SearchFieldDataType.collection(SearchFieldDataType.SINGLE)).setSearchable( |
| 118 | + true) |
| 119 | + .setFilterable(true) |
| 120 | + .setVectorSearchDimensions(modelDimensions) |
| 121 | + .setVectorSearchProfileName(vectorSearchProfileName)) |
| 122 | + .setVectorSearch(new VectorSearch().setProfiles( |
| 123 | + new VectorSearchProfile(vectorSearchProfileName, vectorSearchHnswConfig).setVectorizer("openai")) |
| 124 | + .setAlgorithms(new HnswAlgorithmConfiguration(vectorSearchHnswConfig)) |
| 125 | + .setVectorizers(Collections.singletonList(new AzureOpenAIVectorizer("openai").setAzureOpenAIParameters( |
| 126 | + new AzureOpenAIParameters().setResourceUri( |
| 127 | + Configuration.getGlobalConfiguration().get("OPENAI_ENDPOINT")) |
| 128 | + .setApiKey(Configuration.getGlobalConfiguration().get("OPENAI_KEY")) |
| 129 | + .setDeploymentId(deploymentId) |
| 130 | + .setModelName(AzureOpenAIModelName.TEXT_EMBEDDING3LARGE))))); |
| 131 | + } |
| 132 | + |
| 133 | + public static void createVectorIndex(SearchIndex vectorIndex) { |
| 134 | + // Instantiate the 'SearchIndexClient' and call the 'createIndex' method to create the search index. |
| 135 | + String endpoint = Configuration.getGlobalConfiguration().get("SEARCH_ENDPOINT"); |
| 136 | + String key = Configuration.getGlobalConfiguration().get("SEARCH_API_KEY"); |
| 137 | + AzureKeyCredential credential = new AzureKeyCredential(key); |
| 138 | + |
| 139 | + SearchIndexClient indexClient = new SearchIndexClientBuilder().endpoint(endpoint) |
| 140 | + .credential(credential) |
| 141 | + .buildClient(); |
| 142 | + |
| 143 | + indexClient.createIndex(vectorIndex); |
| 144 | + } |
| 145 | + |
| 146 | + // Simple model type for Hotel |
| 147 | + |
| 148 | + /** |
| 149 | + * Hotel model with an additional field for the vector description. |
| 150 | + */ |
| 151 | + public static final class VectorHotel implements JsonSerializable<VectorHotel> { |
| 152 | + @SimpleField(isKey = true) |
| 153 | + private String hotelId; |
| 154 | + @SearchableField(isFilterable = true, isSortable = true, analyzerName = "en.lucene") |
| 155 | + private String hotelName; |
| 156 | + @SearchableField(analyzerName = "en.lucene") |
| 157 | + private String description; |
| 158 | + @SearchableField(vectorSearchDimensions = 256, vectorSearchProfileName = "my-vector-profile") |
| 159 | + private List<Float> descriptionVector; |
| 160 | + @SearchableField(isFilterable = true, isFacetable = true, isSortable = true) |
| 161 | + private String category; |
| 162 | + @SearchableField(vectorSearchDimensions = 256, vectorSearchProfileName = "my-vector-profile") |
| 163 | + private List<Float> categoryVector; |
| 164 | + |
| 165 | + public VectorHotel() { |
| 166 | + } |
| 167 | + |
| 168 | + public String getHotelId() { |
| 169 | + return hotelId; |
| 170 | + } |
| 171 | + |
| 172 | + public VectorHotel setHotelId(String hotelId) { |
| 173 | + this.hotelId = hotelId; |
| 174 | + return this; |
| 175 | + } |
| 176 | + |
| 177 | + public String getHotelName() { |
| 178 | + return hotelName; |
| 179 | + } |
| 180 | + |
| 181 | + public VectorHotel setHotelName(String hotelName) { |
| 182 | + this.hotelName = hotelName; |
| 183 | + return this; |
| 184 | + } |
| 185 | + |
| 186 | + public String getDescription() { |
| 187 | + return description; |
| 188 | + } |
| 189 | + |
| 190 | + public VectorHotel setDescription(String description) { |
| 191 | + this.description = description; |
| 192 | + return this; |
| 193 | + } |
| 194 | + |
| 195 | + public List<Float> getDescriptionVector() { |
| 196 | + return descriptionVector == null ? null : Collections.unmodifiableList(descriptionVector); |
| 197 | + } |
| 198 | + |
| 199 | + public VectorHotel setDescriptionVector(List<Float> descriptionVector) { |
| 200 | + this.descriptionVector = descriptionVector == null ? null : new ArrayList<>(descriptionVector); |
| 201 | + return this; |
| 202 | + } |
| 203 | + |
| 204 | + public String getCategory() { |
| 205 | + return category; |
| 206 | + } |
| 207 | + |
| 208 | + public VectorHotel setCategory(String category) { |
| 209 | + this.category = category; |
| 210 | + return this; |
| 211 | + } |
| 212 | + |
| 213 | + public List<Float> getCategoryVector() { |
| 214 | + return categoryVector == null ? null : Collections.unmodifiableList(categoryVector); |
| 215 | + } |
| 216 | + |
| 217 | + public VectorHotel setCategoryVector(List<Float> categoryVector) { |
| 218 | + this.categoryVector = categoryVector == null ? null : new ArrayList<>(categoryVector); |
| 219 | + return this; |
| 220 | + } |
| 221 | + |
| 222 | + @Override |
| 223 | + public JsonWriter toJson(JsonWriter jsonWriter) throws IOException { |
| 224 | + return jsonWriter.writeStartObject() |
| 225 | + .writeStringField("HotelId", hotelId) |
| 226 | + .writeStringField("HotelName", hotelName) |
| 227 | + .writeStringField("Description", description) |
| 228 | + .writeArrayField("DescriptionVector", descriptionVector, JsonWriter::writeFloat) |
| 229 | + .writeStringField("Category", category) |
| 230 | + .writeArrayField("DescriptionVector", categoryVector, JsonWriter::writeFloat) |
| 231 | + .writeEndObject(); |
| 232 | + } |
| 233 | + |
| 234 | + public static VectorHotel fromJson(JsonReader jsonReader) throws IOException { |
| 235 | + return jsonReader.readObject(reader -> { |
| 236 | + VectorHotel vectorHotel = new VectorHotel(); |
| 237 | + |
| 238 | + while (reader.nextToken() != JsonToken.END_OBJECT) { |
| 239 | + String fieldName = reader.getFieldName(); |
| 240 | + reader.nextToken(); |
| 241 | + |
| 242 | + if ("HotelId".equals(fieldName)) { |
| 243 | + vectorHotel.hotelId = reader.getString(); |
| 244 | + } else if ("HotelName".equals(fieldName)) { |
| 245 | + vectorHotel.hotelName = reader.getString(); |
| 246 | + } else if ("Description".equals(fieldName)) { |
| 247 | + vectorHotel.description = reader.getString(); |
| 248 | + } else if ("DescriptionVector".equals(fieldName)) { |
| 249 | + vectorHotel.descriptionVector = reader.readArray(JsonReader::getFloat); |
| 250 | + } else if ("Category".equals(fieldName)) { |
| 251 | + vectorHotel.category = reader.getString(); |
| 252 | + } else if ("CategoryVector".equals(fieldName)) { |
| 253 | + vectorHotel.categoryVector = reader.readArray(JsonReader::getFloat); |
| 254 | + } else { |
| 255 | + reader.skipChildren(); |
| 256 | + } |
| 257 | + } |
| 258 | + |
| 259 | + return vectorHotel; |
| 260 | + }); |
| 261 | + } |
| 262 | + } |
| 263 | + |
| 264 | + /** |
| 265 | + * Get Embeddings using {@code azure-ai-openai} library. |
| 266 | + * <p> |
| 267 | + * You can use Azure OpenAI embedding models, {@code text-embedding-3-small} or {@code text-embedding-3-large}, to |
| 268 | + * get the reduced embeddings. With these models, you can specify the desired number of dimensions for the output |
| 269 | + * vector by passing the 'Dimensions' property. This enables you to customize the output according to your needs. |
| 270 | + * <p> |
| 271 | + * For more details about how to generate embeddings, refer to the |
| 272 | + * <a href="https://learn.microsoft.com/azure/search/vector-search-how-to-generate-embeddings">documentation</a>. |
| 273 | + * Here's an example of how you can get embeddings using |
| 274 | + * <a href="https://github.com/Azure/azure-sdk-for-java/blob/main/sdk/openai/azure-ai-openai/README.md">azure-ai-openai</a> |
| 275 | + * library. |
| 276 | + */ |
| 277 | + public static List<Float> getEmbeddings(String input) { |
| 278 | + // Get embeddings using Azure OpenAI |
| 279 | + String endpoint = Configuration.getGlobalConfiguration().get("OPENAI_ENDPOINT"); |
| 280 | + String key = Configuration.getGlobalConfiguration().get("OPENAI_API_KEY"); |
| 281 | + KeyCredential credential = new KeyCredential(key); |
| 282 | + |
| 283 | + OpenAIClient openAIClient = new OpenAIClientBuilder() |
| 284 | + .endpoint(endpoint) |
| 285 | + .credential(credential) |
| 286 | + .buildClient(); |
| 287 | + EmbeddingsOptions embeddingsOptions = new EmbeddingsOptions(Collections.singletonList(input)) |
| 288 | + .setModel("my-text-embedding-3-small") |
| 289 | + .setDimensions(256); |
| 290 | + |
| 291 | + Embeddings embeddings = openAIClient.getEmbeddings("my-text-embedding-3-small", embeddingsOptions); |
| 292 | + return embeddings.getData().get(0).getEmbedding(); |
| 293 | + } |
| 294 | + |
| 295 | + public static List<VectorHotel> getHotelDocuments() { |
| 296 | + // In the sample code below, we are using 'getEmbeddings' method mentioned above to get embeddings for the |
| 297 | + // vector fields named 'DescriptionVector' and 'CategoryVector'. |
| 298 | + return Arrays.asList( |
| 299 | + new VectorHotel().setHotelId("1") |
| 300 | + .setHotelName("Fancy Stay") |
| 301 | + .setDescription("Best hotel in town if you like luxury hotels. They have an amazing infinity pool, a " |
| 302 | + + "spa, and a really helpful concierge. The location is perfect -- right downtown, close to " |
| 303 | + + "all the tourist attractions. We highly recommend this hotel.") |
| 304 | + .setDescriptionVector(getEmbeddings( |
| 305 | + "Best hotel in town if you like luxury hotels. They have an amazing infinity pool, a spa, " |
| 306 | + + "and a really helpful concierge. The location is perfect -- right downtown, close to all " |
| 307 | + + "the tourist attractions. We highly recommend this hotel.")) |
| 308 | + .setCategory("Luxury") |
| 309 | + .setCategoryVector(getEmbeddings("Luxury")), |
| 310 | + new VectorHotel().setHotelId("2") |
| 311 | + .setHotelName("Roach Motel") |
| 312 | + .setDescription("Cheapest hotel in town. Infact, a motel.") |
| 313 | + .setDescriptionVector(getEmbeddings("Cheapest hotel in town. Infact, a motel.")) |
| 314 | + .setCategory("Budget") |
| 315 | + .setCategoryVector(getEmbeddings("Budget")) |
| 316 | + // Add more hotel documents here... |
| 317 | + ); |
| 318 | + } |
| 319 | + |
| 320 | + public static void indexDocuments(SearchClient searchClient, List<VectorHotel> hotelDocuments) { |
| 321 | + searchClient.indexDocuments(new IndexDocumentsBatch<VectorHotel>().addUploadActions(hotelDocuments)); |
| 322 | + } |
| 323 | + |
| 324 | + /** |
| 325 | + * In this vector query, the 'VectorQueries' contains the vectorizable text of the query input. The 'Fields' |
| 326 | + * property specifies which vector fields are searched. The 'KNearestNeighborsCount' property specifies the number |
| 327 | + * of nearest neighbors to return as top hits. |
| 328 | + */ |
| 329 | + public static void vectorSearch(SearchClient searchClient) { |
| 330 | + SearchPagedIterable response = searchClient.search(null, new SearchOptions() |
| 331 | + .setVectorSearchOptions(new VectorSearchOptions() |
| 332 | + .setQueries(new VectorizableTextQuery("Luxury hotels in town") |
| 333 | + .setKNearestNeighborsCount(3) |
| 334 | + .setFields("DescriptionVector"))), Context.NONE); |
| 335 | + |
| 336 | + int count = 0; |
| 337 | + System.out.println("Vector Search Results:"); |
| 338 | + |
| 339 | + for (SearchResult result : response) { |
| 340 | + count++; |
| 341 | + VectorHotel doc = result.getDocument(VectorHotel.class); |
| 342 | + System.out.println(doc.getHotelId() + ": " + doc.getHotelName()); |
| 343 | + } |
| 344 | + |
| 345 | + System.out.println("Total number of search results: " + count); |
| 346 | + } |
| 347 | +} |
0 commit comments