Skip to content

Commit 4a0d729

Browse files
committed
BE: Issue#1332 Sort based on prefix offsets
1 parent d4bd299 commit 4a0d729

File tree

7 files changed

+319
-11
lines changed

7 files changed

+319
-11
lines changed

api/src/main/java/io/kafbat/ui/service/index/LuceneTopicsIndex.java

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22

33
import io.kafbat.ui.model.InternalTopic;
44
import io.kafbat.ui.model.InternalTopicConfig;
5+
import io.kafbat.ui.service.index.lucene.IndexedTextField;
6+
import io.kafbat.ui.service.index.lucene.NameDistanceScoringFunction;
7+
import io.kafbat.ui.service.index.lucene.ShortWordAnalyzer;
58
import java.io.IOException;
69
import java.io.UncheckedIOException;
710
import java.util.ArrayList;
@@ -18,11 +21,11 @@
1821
import org.apache.lucene.document.IntPoint;
1922
import org.apache.lucene.document.LongPoint;
2023
import org.apache.lucene.document.StringField;
21-
import org.apache.lucene.document.TextField;
2224
import org.apache.lucene.index.DirectoryReader;
2325
import org.apache.lucene.index.IndexWriter;
2426
import org.apache.lucene.index.IndexWriterConfig;
2527
import org.apache.lucene.index.Term;
28+
import org.apache.lucene.queries.function.FunctionScoreQuery;
2629
import org.apache.lucene.queryparser.classic.ParseException;
2730
import org.apache.lucene.queryparser.classic.QueryParser;
2831
import org.apache.lucene.search.BooleanClause;
@@ -59,11 +62,14 @@ public LuceneTopicsIndex(List<InternalTopic> topics) throws IOException {
5962

6063
private Directory build(List<InternalTopic> topics) {
6164
Directory directory = new ByteBuffersDirectory();
65+
6266
try (IndexWriter directoryWriter = new IndexWriter(directory, new IndexWriterConfig(this.analyzer))) {
6367
for (InternalTopic topic : topics) {
6468
Document doc = new Document();
69+
70+
6571
doc.add(new StringField(FIELD_NAME_RAW, topic.getName(), Field.Store.YES));
66-
doc.add(new TextField(FIELD_NAME, topic.getName(), Field.Store.NO));
72+
doc.add(new IndexedTextField(FIELD_NAME, topic.getName(), Field.Store.YES));
6773
doc.add(new IntPoint(FIELD_PARTITIONS, topic.getPartitionCount()));
6874
doc.add(new IntPoint(FIELD_REPLICATION, topic.getReplicationFactor()));
6975
doc.add(new LongPoint(FIELD_SIZE, topic.getSegmentSize()));
@@ -117,9 +123,9 @@ public List<InternalTopic> find(String search, Boolean showInternal,
117123
closeLock.readLock().lock();
118124
try {
119125

120-
QueryParser queryParser = new PrefixQueryParser(FIELD_NAME, this.analyzer);
126+
PrefixQueryParser queryParser = new PrefixQueryParser(FIELD_NAME, this.analyzer);
121127
queryParser.setDefaultOperator(QueryParser.Operator.AND);
122-
Query nameQuery = queryParser.parse(search);;
128+
Query nameQuery = queryParser.parse(search);
123129

124130
Query internalFilter = new TermQuery(new Term(FIELD_INTERNAL, "true"));
125131

@@ -129,6 +135,12 @@ public List<InternalTopic> find(String search, Boolean showInternal,
129135
queryBuilder.add(internalFilter, BooleanClause.Occur.MUST_NOT);
130136
}
131137

138+
BooleanQuery combined = queryBuilder.build();
139+
Query wrapped = new FunctionScoreQuery(
140+
combined,
141+
new NameDistanceScoringFunction(FIELD_NAME, queryParser.getPrefixes())
142+
);
143+
132144
List<SortField> sortFields = new ArrayList<>();
133145
sortFields.add(SortField.FIELD_SCORE);
134146
if (!sortField.equals(FIELD_NAME)) {
@@ -137,7 +149,7 @@ public List<InternalTopic> find(String search, Boolean showInternal,
137149

138150
Sort sort = new Sort(sortFields.toArray(new SortField[0]));
139151

140-
TopDocs result = this.indexSearcher.search(queryBuilder.build(), count != null ? count : this.maxSize, sort);
152+
TopDocs result = this.indexSearcher.search(wrapped, count != null ? count : this.maxSize, sort);
141153

142154
List<String> topics = new ArrayList<>();
143155
for (ScoreDoc scoreDoc : result.scoreDocs) {

api/src/main/java/io/kafbat/ui/service/index/PrefixQueryParser.java

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33
import static org.apache.lucene.search.BoostAttribute.DEFAULT_BOOST;
44

55
import io.kafbat.ui.service.index.TopicsIndex.FieldType;
6+
import java.util.ArrayList;
67
import java.util.List;
8+
import java.util.Objects;
79
import java.util.Optional;
810
import org.apache.lucene.analysis.Analyzer;
911
import org.apache.lucene.document.IntPoint;
@@ -14,10 +16,11 @@
1416
import org.apache.lucene.search.PrefixQuery;
1517
import org.apache.lucene.search.Query;
1618
import org.apache.lucene.search.TermQuery;
17-
import org.apache.lucene.search.TermRangeQuery;
1819

1920
public class PrefixQueryParser extends QueryParser {
2021

22+
private final List<String> prefixes = new ArrayList<>();
23+
2124
public PrefixQueryParser(String field, Analyzer analyzer) {
2225
super(field, analyzer);
2326
}
@@ -60,7 +63,13 @@ protected Query newTermQuery(Term term, float boost) {
6063
.orElse(FieldType.STRING);
6164

6265
Query query = switch (fieldType) {
63-
case STRING -> new PrefixQuery(term);
66+
case STRING -> {
67+
if (Objects.equals(term.field(), field)) {
68+
prefixes.add(term.text());
69+
}
70+
71+
yield new PrefixQuery(term);
72+
}
6473
case INT -> IntPoint.newExactQuery(term.field(), Integer.parseInt(term.text()));
6574
case LONG -> LongPoint.newExactQuery(term.field(), Long.parseLong(term.text()));
6675
case BOOLEAN -> new TermQuery(term);
@@ -72,4 +81,7 @@ protected Query newTermQuery(Term term, float boost) {
7281
return new BoostQuery(query, boost);
7382
}
7483

84+
public List<String> getPrefixes() {
85+
return prefixes;
86+
}
7587
}
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
package io.kafbat.ui.service.index.lucene;


import java.io.Reader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StoredValue;
import org.apache.lucene.index.IndexOptions;

/**
 * A drop-in replacement for Lucene's {@code TextField} that additionally indexes
 * character offsets into the postings, so offset-based scoring (e.g.
 * {@link NameDistanceScoringFunction}-style functions reading
 * {@code PostingsEnum.OFFSETS}) can see where in the original value each term
 * occurred. The stored variant also records term vectors with positions and
 * offsets.
 *
 * <p>Without offsets in the index, {@code PostingsEnum#startOffset()} returns
 * {@code -1}; both field types therefore index
 * {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}.
 */
public class IndexedTextField extends Field {

  /** Indexed (with offsets), tokenized, not stored. */
  public static final FieldType TYPE_NOT_STORED = new FieldType();

  /** Indexed (with offsets), tokenized, stored, with term vectors. */
  public static final FieldType TYPE_STORED = new FieldType();

  static {
    // Offsets must be indexed even for the un-stored variant: consumers of this
    // field rely on PostingsEnum.startOffset(), which is -1 without OFFSETS.
    TYPE_NOT_STORED.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    TYPE_NOT_STORED.setTokenized(true);
    TYPE_NOT_STORED.freeze();

    TYPE_STORED.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    TYPE_STORED.setTokenized(true);
    TYPE_STORED.setStored(true);
    TYPE_STORED.setStoreTermVectors(true);
    TYPE_STORED.setStoreTermVectorOffsets(true);
    TYPE_STORED.setStoreTermVectorPositions(true);
    TYPE_STORED.freeze();
  }

  // Non-null only when the field was created with Store.YES; mirrors the
  // current string value so stored-field retrieval stays in sync.
  private final StoredValue storedValue;

  /**
   * Creates a new un-stored IndexedTextField with Reader value.
   *
   * @param name field name
   * @param reader reader value
   * @throws IllegalArgumentException if the field name is null
   * @throws NullPointerException if the reader is null
   */
  public IndexedTextField(String name, Reader reader) {
    super(name, reader, TYPE_NOT_STORED);
    storedValue = null;
  }

  /**
   * Creates a new IndexedTextField with String value.
   *
   * @param name field name
   * @param value string value
   * @param store Store.YES if the content should also be stored
   * @throws IllegalArgumentException if the field name or value is null.
   */
  public IndexedTextField(String name, String value, Store store) {
    super(name, value, store == Store.YES ? TYPE_STORED : TYPE_NOT_STORED);
    if (store == Store.YES) {
      storedValue = new StoredValue(value);
    } else {
      storedValue = null;
    }
  }

  /**
   * Creates a new un-stored IndexedTextField with TokenStream value.
   *
   * @param name field name
   * @param stream TokenStream value
   * @throws IllegalArgumentException if the field name is null.
   * @throws NullPointerException if the tokenStream is null
   */
  public IndexedTextField(String name, TokenStream stream) {
    super(name, stream, TYPE_NOT_STORED);
    storedValue = null;
  }

  @Override
  public void setStringValue(String value) {
    super.setStringValue(value);
    // Keep the stored copy consistent with the indexed value.
    if (storedValue != null) {
      storedValue.setStringValue(value);
    }
  }

  @Override
  public StoredValue storedValue() {
    return storedValue;
  }
}
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
package io.kafbat.ui.service.index.lucene;

import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.DoubleValues;
import org.apache.lucene.search.DoubleValuesSource;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.util.BytesRef;

/**
 * Scores documents by how close to the start of the indexed field the first
 * prefix match occurs: {@code score = 1 / (1 + smallestStartOffset)}.
 * Documents that match no prefix (or whose field lacks indexed offsets) get a
 * neutral score of {@code 1.0}.
 *
 * <p>Requires the field to be indexed with offsets (see {@code IndexedTextField});
 * without offsets {@code PostingsEnum#startOffset()} returns {@code -1} and the
 * neutral score is used.
 */
public class NameDistanceScoringFunction extends DoubleValuesSource {
  private final String fieldName;
  private final List<String> prefixes;

  public NameDistanceScoringFunction(String fieldName, List<String> prefixes) {
    this.fieldName = fieldName;
    this.prefixes = prefixes;
  }

  @Override
  public DoubleValues getValues(LeafReaderContext ctx, DoubleValues scores) throws IOException {
    Map<Integer, Integer> positions = collectSmallestOffsets(ctx);

    return new DoubleValues() {
      private int doc = -1;

      @Override
      public double doubleValue() {
        Integer offset = positions.get(doc);
        if (offset == null || offset < 0) {
          // No prefix matched this doc, or offsets were not indexed (-1):
          // fall back to a neutral score instead of dividing toward infinity.
          return 1.0;
        }
        return 1.0 / (1.0 + offset);
      }

      @Override
      public boolean advanceExact(int target) {
        doc = target;
        return true;
      }
    };
  }

  /**
   * Builds, per segment, a map of doc id to the smallest start offset of any
   * indexed term that starts with any of the configured prefixes.
   */
  private Map<Integer, Integer> collectSmallestOffsets(LeafReaderContext ctx) throws IOException {
    Map<Integer, Integer> positions = new HashMap<>();

    Terms terms = ctx.reader().terms(fieldName);
    if (terms == null) {
      // The field has no terms in this segment; nothing to score.
      return positions;
    }

    for (String prefix : prefixes) {
      BytesRef prefixBytes = new BytesRef(prefix);
      TermsEnum iterator = terms.iterator();
      if (iterator.seekCeil(prefixBytes) == TermsEnum.SeekStatus.END) {
        continue;
      }

      // seekCeil lands on the first term >= prefix; that term is not guaranteed
      // to actually start with the prefix, and a prefix generally matches many
      // terms — walk every term sharing the prefix, not just the first.
      BytesRef term = iterator.term();
      while (term != null && startsWith(term, prefixBytes)) {
        PostingsEnum postings = iterator.postings(
            null,
            PostingsEnum.OFFSETS | PostingsEnum.FREQS | PostingsEnum.POSITIONS
        );
        while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          int smallest = Integer.MAX_VALUE;
          for (int i = 0; i < postings.freq(); i++) {
            postings.nextPosition();
            smallest = Math.min(smallest, postings.startOffset());
          }
          positions.merge(postings.docID(), smallest, Math::min);
        }
        term = iterator.next();
      }
    }
    return positions;
  }

  /** Byte-wise test that {@code term} starts with {@code prefix}. */
  private static boolean startsWith(BytesRef term, BytesRef prefix) {
    if (term.length < prefix.length) {
      return false;
    }
    for (int i = 0; i < prefix.length; i++) {
      if (term.bytes[term.offset + i] != prefix.bytes[prefix.offset + i]) {
        return false;
      }
    }
    return true;
  }

  @Override
  public boolean needsScores() {
    // Scoring depends only on indexed offsets, not on the wrapped query's score.
    return false;
  }

  @Override
  public DoubleValuesSource rewrite(IndexSearcher searcher) {
    return this;
  }

  @Override
  public int hashCode() {
    return Objects.hash(fieldName, prefixes);
  }

  @Override
  public boolean equals(Object obj) {
    // Must be reflexive; the original returned false even for this == obj.
    if (this == obj) {
      return true;
    }
    if (obj == null || getClass() != obj.getClass()) {
      return false;
    }
    NameDistanceScoringFunction that = (NameDistanceScoringFunction) obj;
    return fieldName.equals(that.fieldName) && prefixes.equals(that.prefixes);
  }

  @Override
  public String toString() {
    return "NameDistanceScoringFunction";
  }

  @Override
  public boolean isCacheable(LeafReaderContext ctx) {
    return false;
  }
}

api/src/main/java/io/kafbat/ui/service/index/ShortWordAnalyzer.java renamed to api/src/main/java/io/kafbat/ui/service/index/lucene/ShortWordAnalyzer.java

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
1-
package io.kafbat.ui.service.index;
1+
package io.kafbat.ui.service.index.lucene;
22

33
import org.apache.lucene.analysis.Analyzer;
44
import org.apache.lucene.analysis.LowerCaseFilter;
55
import org.apache.lucene.analysis.TokenStream;
66
import org.apache.lucene.analysis.Tokenizer;
77
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
8+
import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;
89
import org.apache.lucene.analysis.standard.StandardTokenizer;
910

10-
class ShortWordAnalyzer extends Analyzer {
11+
public class ShortWordAnalyzer extends Analyzer {
1112

1213
public ShortWordAnalyzer() {}
1314

@@ -17,12 +18,13 @@ protected TokenStreamComponents createComponents(String fieldName) {
1718

1819
TokenStream tokenStream = new WordDelimiterGraphFilter(
1920
tokenizer,
21+
true,
22+
WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
2023
WordDelimiterGraphFilter.GENERATE_WORD_PARTS
2124
| WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
2225
| WordDelimiterGraphFilter.PRESERVE_ORIGINAL
2326
| WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
2427
| WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE,
25-
2628
null
2729
);
2830

0 commit comments

Comments
 (0)