1010import java .util .HashSet ;
1111import java .util .List ;
1212import java .util .Map ;
13+ import java .util .NavigableMap ;
14+ import java .util .TreeMap ;
15+ import java .util .TreeSet ;
1316import java .util .stream .Stream ;
1417import lombok .SneakyThrows ;
1518import lombok .extern .slf4j .Slf4j ;
2225public abstract class NgramFilter <T > {
2326 private final Analyzer analyzer ;
2427 private final boolean enabled ;
28+ private final boolean distanceScore ;
2529
/**
 * Builds a filter backed by a {@code ShortWordNGramAnalyzer} that is configured with the
 * minimum and maximum n-gram sizes taken from {@code properties}.
 *
 * @param properties n-gram settings; only {@code getNgramMin()} and {@code getNgramMax()} are read
 * @param enabled value stored in the {@code enabled} field
 * @param distanceScore when {@code true}, {@code find} scores matches with
 *     {@code distanceSimilarity}; otherwise it scores them with {@code cosineSimilarity}
 */
public NgramFilter(ClustersProperties.NgramProperties properties, boolean enabled, boolean distanceScore) {
  this.analyzer = new ShortWordNGramAnalyzer(
      properties.getNgramMin(),
      properties.getNgramMax(),
      false);
  this.distanceScore = distanceScore;
  this.enabled = enabled;
}
3035
@@ -52,14 +57,23 @@ public List<T> find(String search, Comparator<T> comparator) {
5257 try {
5358 List <SearchResult <T >> result = new ArrayList <>();
5459 List <String > queryTokens = tokenizeString (analyzer , search );
55- Map <String , Integer > queryFreq = termFreq (queryTokens );
60+ Map <String , Integer > queryFreq = Map .of ();
61+
62+ if (!distanceScore ) {
63+ queryFreq = termFreq (queryTokens );
64+ }
5665
5766 for (Tuple2 <List <String >, T > item : getItems ()) {
5867 for (String field : item .getT1 ()) {
5968 List <String > itemTokens = tokenizeString (analyzer , field );
6069 HashSet <String > itemTokensSet = new HashSet <>(itemTokens );
6170 if (itemTokensSet .containsAll (queryTokens )) {
62- double score = cosineSimilarity (queryFreq , itemTokens );
71+ double score ;
72+ if (distanceScore ) {
73+ score = distanceSimilarity (queryTokens , itemTokens );
74+ } else {
75+ score = cosineSimilarity (queryFreq , itemTokens );
76+ }
6377 result .add (new SearchResult <>(item .getT2 (), score ));
6478 }
6579 }
@@ -77,6 +91,22 @@ public List<T> find(String search, Comparator<T> comparator) {
7791 }
7892 }
7993
94+ private double distanceSimilarity (List <String > queryTokens , List <String > itemTokens ) {
95+ int smallest = Integer .MAX_VALUE ;
96+ for (String queryToken : queryTokens ) {
97+ int i = itemTokens .indexOf (queryToken );
98+ if (i >= 0 ) {
99+ smallest = Math .min (smallest , i );
100+ }
101+ }
102+
103+ if (smallest == Integer .MAX_VALUE ) {
104+ return 1.0 ;
105+ } else {
106+ return 1.0 / (1.0 + smallest );
107+ }
108+ }
109+
80110 private List <T > list (Stream <T > stream , Comparator <T > comparator ) {
81111 if (comparator != null ) {
82112 return stream .sorted (comparator ).toList ();
0 commit comments