Skip to content

Commit 9981c12

Browse files
authored
Add example for multi-analyzer (#1736)
Signed-off-by: yhmo <yihua.mo@zilliz.com>
1 parent 907e98b commit 9981c12

File tree

2 files changed

+211
-3
lines changed

2 files changed

+211
-3
lines changed

examples/src/main/java/io/milvus/v2/FullTextSearchExample.java

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,14 +47,15 @@ public class FullTextSearchExample {
4747
private static final String COLLECTION_NAME = "java_sdk_example_text_match_v2";
4848
private static final String ID_FIELD = "id";
4949
private static final String VECTOR_FIELD = "vector";
50+
private static final String TEXT_FIELD = "text";
5051

5152
private static void searchByText(MilvusClientV2 client, String text) {
5253
// The text is tokenized on the server side and turned into a sparse embedding that is compared with the vector field
5354
SearchResp searchResp = client.search(SearchReq.builder()
5455
.collectionName(COLLECTION_NAME)
5556
.data(Collections.singletonList(new EmbeddedText(text)))
5657
.limit(3)
57-
.outputFields(Collections.singletonList("text"))
58+
.outputFields(Collections.singletonList(TEXT_FIELD))
5859
.build());
5960
System.out.println("\nSearch by text: " + text);
6061
List<List<SearchResp.SearchResult>> searchResults = searchResp.getSearchResults();
@@ -87,7 +88,7 @@ public static void main(String[] args) {
8788
.autoID(false)
8889
.build());
8990
schema.addField(AddFieldReq.builder()
90-
.fieldName("text")
91+
.fieldName(TEXT_FIELD)
9192
.dataType(DataType.VarChar)
9293
.maxLength(65535)
9394
.enableAnalyzer(true) // must enable this if you use Function
@@ -103,7 +104,7 @@ public static void main(String[] args) {
103104
schema.addFunction(Function.builder()
104105
.functionType(FunctionType.BM25)
105106
.name("function_bm25")
106-
.inputFieldNames(Collections.singletonList("text"))
107+
.inputFieldNames(Collections.singletonList(TEXT_FIELD))
107108
.outputFieldNames(Collections.singletonList(VECTOR_FIELD))
108109
.build());
109110

Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
package io.milvus.v2;
2+
3+
import com.google.gson.Gson;
4+
import com.google.gson.JsonObject;
5+
import io.milvus.common.clientenum.FunctionType;
6+
import io.milvus.v2.client.ConnectConfig;
7+
import io.milvus.v2.client.MilvusClientV2;
8+
import io.milvus.v2.common.ConsistencyLevel;
9+
import io.milvus.v2.common.DataType;
10+
import io.milvus.v2.common.IndexParam;
11+
import io.milvus.v2.service.collection.request.AddFieldReq;
12+
import io.milvus.v2.service.collection.request.CreateCollectionReq;
13+
import io.milvus.v2.service.collection.request.DropCollectionReq;
14+
import io.milvus.v2.service.vector.request.InsertReq;
15+
import io.milvus.v2.service.vector.request.QueryReq;
16+
import io.milvus.v2.service.vector.request.SearchReq;
17+
import io.milvus.v2.service.vector.request.data.EmbeddedText;
18+
import io.milvus.v2.service.vector.response.QueryResp;
19+
import io.milvus.v2.service.vector.response.SearchResp;
20+
21+
import java.util.*;
22+
23+
public class MultiAnalyzerExample {
24+
private static final String COLLECTION_NAME = "java_sdk_example_multi_analyzer_v2";
25+
private static final String ID_FIELD = "id";
26+
private static final String VECTOR_FIELD = "vector";
27+
private static final String TEXT_FIELD = "text";
28+
private static final String LANGUAGE_FIELD = "language";
29+
30+
private static void buildCollection(MilvusClientV2 client) {
31+
// Drop collection if exists
32+
client.dropCollection(DropCollectionReq.builder()
33+
.collectionName(COLLECTION_NAME)
34+
.build());
35+
36+
// Create collection
37+
CreateCollectionReq.CollectionSchema schema = CreateCollectionReq.CollectionSchema.builder()
38+
.build();
39+
schema.addField(AddFieldReq.builder()
40+
.fieldName(ID_FIELD)
41+
.dataType(DataType.Int64)
42+
.isPrimaryKey(true)
43+
.autoID(true)
44+
.build());
45+
46+
// apply multiple analyzers to the text field, so that insert data can specify different tokenizers for each row.
47+
// in this example, texts are written by multiple languages, so we use multiple analyzers to handle different texts.
48+
// to use multiple analyzers, there must be a field to specify the language type, in this example, the "language"
49+
// field is used for this purpose. multiple analyzers is optional, no need to set it if the data only contains one
50+
// language, no need to add the "language" field if the data only contains one language.
51+
// tokenizer:
52+
// english: https://milvus.io/docs/english-analyzer.md
53+
// chinese: https://milvus.io/docs/chinese-analyzer.md
54+
// lindera: https://milvus.io/docs/lindera-tokenizer.md
55+
// icu: https://milvus.io/docs/icu-tokenizer.md
56+
// filter:
57+
// lowercase: https://milvus.io/docs/lowercase-filter.md
58+
// removepunct: https://milvus.io/docs/removepunct-filter.md
59+
// asciifolding: https://milvus.io/docs/ascii-folding-filter.md
60+
Map<String, Object> analyzerParams = new HashMap<>();
61+
analyzerParams.put("analyzers", new HashMap<String, Object>() {{
62+
put("english", new HashMap<String, Object>() {{
63+
put("type", "english");
64+
}});
65+
put("chinese", new HashMap<String, Object>() {{
66+
put("tokenizer", "jieba");
67+
put("filter", Arrays.asList("lowercase", "removepunct"));
68+
}});
69+
put("japanese", new HashMap<String, Object>() {{
70+
put("tokenizer", new HashMap<String, Object>() {{
71+
put("type", "lindera");
72+
put("dict_kind", "ipadic");
73+
}});
74+
}});
75+
put("default", new HashMap<String, Object>() {{
76+
put("tokenizer", "icu");
77+
put("filter", Arrays.asList("lowercase", "removepunct", "asciifolding"));
78+
}});
79+
}});
80+
analyzerParams.put("by_field", "language");
81+
analyzerParams.put("alias", new HashMap<String, Object>() {{
82+
put("cn", "chinese");
83+
put("en", "english");
84+
put("jap", "japanese");
85+
}});
86+
87+
schema.addField(AddFieldReq.builder()
88+
.fieldName(TEXT_FIELD)
89+
.dataType(DataType.VarChar)
90+
.maxLength(65535)
91+
.enableAnalyzer(true) // must enable this if you use Function
92+
.multiAnalyzerParams(analyzerParams)
93+
.build());
94+
schema.addField(AddFieldReq.builder()
95+
.fieldName(LANGUAGE_FIELD)
96+
.dataType(DataType.VarChar)
97+
.maxLength(100)
98+
.build());
99+
schema.addField(AddFieldReq.builder()
100+
.fieldName(VECTOR_FIELD)
101+
.dataType(DataType.SparseFloatVector)
102+
.build());
103+
104+
// With this function, milvus will convert the strings of "text" field to sparse vectors of "vector" field
105+
// by built-in tokenizer and analyzer
106+
// Read the link for more info: https://milvus.io/docs/full-text-search.md
107+
schema.addFunction(CreateCollectionReq.Function.builder()
108+
.functionType(FunctionType.BM25)
109+
.name("function_bm25")
110+
.inputFieldNames(Collections.singletonList(TEXT_FIELD))
111+
.outputFieldNames(Collections.singletonList(VECTOR_FIELD))
112+
.build());
113+
114+
List<IndexParam> indexes = new ArrayList<>();
115+
indexes.add(IndexParam.builder()
116+
.fieldName(VECTOR_FIELD)
117+
.indexType(IndexParam.IndexType.SPARSE_INVERTED_INDEX)
118+
.metricType(IndexParam.MetricType.BM25) // to use full text search, metric type must be "BM25"
119+
.build());
120+
121+
CreateCollectionReq requestCreate = CreateCollectionReq.builder()
122+
.collectionName(COLLECTION_NAME)
123+
.collectionSchema(schema)
124+
.indexParams(indexes)
125+
.consistencyLevel(ConsistencyLevel.BOUNDED)
126+
.build();
127+
client.createCollection(requestCreate);
128+
System.out.println("Collection created");
129+
130+
// Insert rows
131+
Gson gson = new Gson();
132+
List<JsonObject> rows = Arrays.asList(
133+
gson.fromJson("{\"language\": \"en\", \"text\": \"Milvus is an open-source vector database\"}", JsonObject.class),
134+
gson.fromJson("{\"language\": \"en\", \"text\": \"AI applications help people better life\"}", JsonObject.class),
135+
gson.fromJson("{\"language\": \"en\", \"text\": \"Will the electric car replace gas-powered car?\"}", JsonObject.class),
136+
gson.fromJson("{\"language\": \"en\", \"text\": \"LangChain is a composable framework to build with LLMs. Milvus is integrated into LangChain.\"}", JsonObject.class),
137+
gson.fromJson("{\"language\": \"en\", \"text\": \"RAG is the process of optimizing the output of a large language model\"}", JsonObject.class),
138+
gson.fromJson("{\"language\": \"en\", \"text\": \"Newton is one of the greatest scientist of human history\"}", JsonObject.class),
139+
gson.fromJson("{\"language\": \"en\", \"text\": \"Metric type L2 is Euclidean distance\"}", JsonObject.class),
140+
gson.fromJson("{\"language\": \"en\", \"text\": \"Embeddings represent real-world objects, like words, images, or videos, in a form that computers can process.\"}", JsonObject.class),
141+
gson.fromJson("{\"language\": \"en\", \"text\": \"The moon is 384,400 km distance away from earth\"}", JsonObject.class),
142+
gson.fromJson("{\"language\": \"en\", \"text\": \"Milvus supports L2 distance and IP similarity for float vector.\"}", JsonObject.class),
143+
gson.fromJson("{\"language\": \"cn\", \"text\": \"人工智能正在改变技术领域\"}", JsonObject.class),
144+
gson.fromJson("{\"language\": \"cn\", \"text\": \"机器学习模型需要大型数据集\"}", JsonObject.class),
145+
gson.fromJson("{\"language\": \"cn\", \"text\": \"Milvus 是一个高性能、可扩展的向量数据库!\"}", JsonObject.class),
146+
gson.fromJson("{\"language\": \"jap\", \"text\": \"Milvusの新機能をご確認くださいこのページでは\"}", JsonObject.class),
147+
gson.fromJson("{\"language\": \"jap\", \"text\": \"非構造化データやマルチモーダルデータを構造化されたコレクションに整理することができます\"}", JsonObject.class),
148+
gson.fromJson("{\"language\": \"jap\", \"text\": \"主な利点はデータアクセスパターンにある\"}", JsonObject.class),
149+
gson.fromJson("{\"language\": \"default\", \"text\": \"토큰화 도구는 소프트웨어 국제화를 위한 핵심 도구를 제공하는\"}", JsonObject.class),
150+
gson.fromJson("{\"language\": \"default\", \"text\": \"Les applications qui suivent le temps à travers les régions\"}", JsonObject.class),
151+
gson.fromJson("{\"language\": \"default\", \"text\": \"Sin embargo, esto puede aumentar la complejidad de las consultas y de la gestión\"}", JsonObject.class),
152+
gson.fromJson("{\"language\": \"default\", \"text\": \"المثال، يوضح الرمز التالي كيفية إضافة عامل تصفية الحقل القياسي إلى بحث متجه\"}", JsonObject.class)
153+
);
154+
155+
client.insert(InsertReq.builder()
156+
.collectionName(COLLECTION_NAME)
157+
.data(rows)
158+
.build());
159+
160+
// Get row count, set ConsistencyLevel.STRONG to sync the data to query node so that data is visible
161+
QueryResp countR = client.query(QueryReq.builder()
162+
.collectionName(COLLECTION_NAME)
163+
.outputFields(Collections.singletonList("count(*)"))
164+
.consistencyLevel(ConsistencyLevel.STRONG)
165+
.build());
166+
System.out.printf("%d rows in collection\n", (long) countR.getQueryResults().get(0).getEntity().get("count(*)"));
167+
}
168+
169+
private static void searchByText(MilvusClientV2 client, String text, String language) {
170+
System.out.printf("\n===============================Language:%s==============================%n", language);
171+
System.out.println("Text: " + text);
172+
// The text is tokenized inside server and turned into a sparse embedding to compare with the vector field
173+
Map<String, Object> searchParams = new HashMap<>();
174+
searchParams.put("analyzer_name", language);
175+
SearchResp searchResp = client.search(SearchReq.builder()
176+
.collectionName(COLLECTION_NAME)
177+
.data(Collections.singletonList(new EmbeddedText(text)))
178+
.limit(5)
179+
.searchParams(searchParams)
180+
.outputFields(Arrays.asList(TEXT_FIELD, LANGUAGE_FIELD))
181+
.build());
182+
System.out.println("Search results:");
183+
List<List<SearchResp.SearchResult>> searchResults = searchResp.getSearchResults();
184+
for (List<SearchResp.SearchResult> results : searchResults) {
185+
for (SearchResp.SearchResult result : results) {
186+
System.out.println(result);
187+
}
188+
}
189+
}
190+
191+
public static void main(String[] args) {
192+
ConnectConfig config = ConnectConfig.builder()
193+
.uri("http://localhost:19530")
194+
.build();
195+
MilvusClientV2 client = new MilvusClientV2(config);
196+
197+
buildCollection(client);
198+
199+
// Query by filtering expression
200+
searchByText(client, "Milvus vector database", "english");
201+
searchByText(client, "人工智能与机器学习", "chinese");
202+
searchByText(client, "非構造化データ", "japanese");
203+
searchByText(client, "Gestion des applications", "default");
204+
205+
client.close();
206+
}
207+
}

0 commit comments

Comments
 (0)