Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 7 additions & 21 deletions app/lib/search/token_index.dart
Original file line number Diff line number Diff line change
Expand Up @@ -147,15 +147,11 @@ class TokenIndex {
/// Maps token Strings to weighted documents (addressed via indexes).
final _inverseIds = <String, Map<int, double>>{};

/// {id: size} map to store a value representative to the document length
late final List<double> _docWeights;

late final _length = _docWeights.length;
late final _length = _ids.length;

TokenIndex(List<String> ids, List<String?> values) : _ids = ids {
assert(ids.length == values.length);
final length = values.length;
_docWeights = List<double>.filled(length, 0.0);
for (var i = 0; i < length; i++) {
final text = values[i];

Expand All @@ -166,12 +162,12 @@ class TokenIndex {
if (tokens == null || tokens.isEmpty) {
continue;
}
// Document weight is a highly scaled-down proxy of the length.
final dw = 1 + math.log(1 + tokens.length) / 100;
for (final token in tokens.keys) {
final weights = _inverseIds.putIfAbsent(token, () => {});
weights[i] = math.max(weights[i] ?? 0.0, tokens[token]!);
weights[i] = math.max(weights[i] ?? 0.0, tokens[token]! / dw);
}
// Document weight is a highly scaled-down proxy of the length.
_docWeights[i] = 1 + math.log(1 + tokens.length) / 100;
}
}

Expand Down Expand Up @@ -215,7 +211,7 @@ class TokenIndex {
/// When [limitToIds] is specified, the result will contain only the set of
/// identifiers in it.
Map<String, double> _scoreDocs(TokenMatch tokenMatch,
{double weight = 1.0, int wordCount = 1, Set<String>? limitToIds}) {
{double weight = 1.0, Set<String>? limitToIds}) {
// Summarize the scores for the documents.
final docScores = List<double>.filled(_length, 0.0);
for (final token in tokenMatch.tokens) {
Expand All @@ -226,11 +222,6 @@ class TokenIndex {
}
}

// In multi-word queries we will penalize the score with the document size
// for each word separately. As these scores will be multiplied, we need to
// compensate the formula in order to prevent multiple exponential penalties.
final double wordSizeExponent = 1.0 / wordCount;

final result = <String, double>{};
// post-process match weights
for (var i = 0; i < _length; i++) {
Expand All @@ -242,11 +233,7 @@ class TokenIndex {
if (limitToIds != null && !limitToIds.contains(id)) {
continue;
}
var dw = _docWeights[i];
if (wordCount > 1) {
dw = math.pow(dw, wordSizeExponent).toDouble();
}
result[id] = w * weight / dw;
result[id] = w * weight;
}
return result;
}
Expand All @@ -255,7 +242,7 @@ class TokenIndex {
/// scoring.
@visibleForTesting
Map<String, double> search(String text) {
return _scoreDocs(lookupTokens(text));
return searchWords(splitForQuery(text))._values;
}

/// Search the index for [words], with a (term-match / document coverage percent)
Expand All @@ -271,7 +258,6 @@ class TokenIndex {
final values = _scoreDocs(
tokens,
weight: weight,
wordCount: words.length,
limitToIds: limitToIds,
);
if (values.isEmpty) {
Expand Down
2 changes: 1 addition & 1 deletion app/test/search/token_index_test.dart
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ void main() {
});

expect(index.search('riak client'), {
'uri://riak_client': closeTo(0.99, 0.01),
'uri://riak_client': closeTo(0.98, 0.01),
});
});

Expand Down
Loading