Skip to content

Commit 69cef8d

Browse files
committed
Ngram filters
1 parent 78794cc commit 69cef8d

File tree

2 files changed

+34
-3
lines changed

2 files changed

+34
-3
lines changed

api/src/main/java/io/kafbat/ui/service/index/NgramFilter.java

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@
1010
import java.util.HashSet;
1111
import java.util.List;
1212
import java.util.Map;
13+
import java.util.NavigableMap;
14+
import java.util.TreeMap;
15+
import java.util.TreeSet;
1316
import java.util.stream.Stream;
1417
import lombok.SneakyThrows;
1518
import lombok.extern.slf4j.Slf4j;
@@ -22,9 +25,11 @@
2225
public abstract class NgramFilter<T> {
2326
private final Analyzer analyzer;
2427
private final boolean enabled;
28+
private final boolean distanceScore;
2529

26-
public NgramFilter(ClustersProperties.NgramProperties properties, boolean enabled) {
30+
/**
 * Creates a filter and builds its n-gram analyzer from the supplied bounds.
 *
 * @param properties    provides the minimum and maximum n-gram sizes handed to the analyzer
 * @param enabled       value stored in the {@code enabled} field
 * @param distanceScore value stored in the {@code distanceScore} field; {@code find} consults it
 *                      to pick {@code distanceSimilarity} over {@code cosineSimilarity}
 */
public NgramFilter(ClustersProperties.NgramProperties properties, boolean enabled, boolean distanceScore) {
  this.analyzer = new ShortWordNGramAnalyzer(
      properties.getNgramMin(),
      properties.getNgramMax(),
      false
  );
  this.distanceScore = distanceScore;
  this.enabled = enabled;
}
3035

@@ -52,14 +57,23 @@ public List<T> find(String search, Comparator<T> comparator) {
5257
try {
5358
List<SearchResult<T>> result = new ArrayList<>();
5459
List<String> queryTokens = tokenizeString(analyzer, search);
55-
Map<String, Integer> queryFreq = termFreq(queryTokens);
60+
Map<String, Integer> queryFreq = Map.of();
61+
62+
if (!distanceScore) {
63+
queryFreq = termFreq(queryTokens);
64+
}
5665

5766
for (Tuple2<List<String>, T> item : getItems()) {
5867
for (String field : item.getT1()) {
5968
List<String> itemTokens = tokenizeString(analyzer, field);
6069
HashSet<String> itemTokensSet = new HashSet<>(itemTokens);
6170
if (itemTokensSet.containsAll(queryTokens)) {
62-
double score = cosineSimilarity(queryFreq, itemTokens);
71+
double score;
72+
if (distanceScore) {
73+
score = distanceSimilarity(queryTokens, itemTokens);
74+
} else {
75+
score = cosineSimilarity(queryFreq, itemTokens);
76+
}
6377
result.add(new SearchResult<>(item.getT2(), score));
6478
}
6579
}
@@ -77,6 +91,22 @@ public List<T> find(String search, Comparator<T> comparator) {
7791
}
7892
}
7993

94+
private double distanceSimilarity(List<String> queryTokens, List<String> itemTokens) {
95+
int smallest = Integer.MAX_VALUE;
96+
for (String queryToken : queryTokens) {
97+
int i = itemTokens.indexOf(queryToken);
98+
if (i >= 0) {
99+
smallest = Math.min(smallest, i);
100+
}
101+
}
102+
103+
if (smallest == Integer.MAX_VALUE) {
104+
return 1.0;
105+
} else {
106+
return 1.0 / (1.0 + smallest);
107+
}
108+
}
109+
80110
private List<T> list(Stream<T> stream, Comparator<T> comparator) {
81111
if (comparator != null) {
82112
return stream.sorted(comparator).toList();

api/src/main/resources/application-localtest.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ kafka:
1212
schemaRegistry: http://localhost:8085
1313
fts:
1414
enabled: true
15+
default-enabled: true
1516

1617
dynamic.config.enabled: true
1718

0 commit comments

Comments
 (0)