Commit 12f2c66

Move doc-weight correction to pre-calculation of term weights. (#8229)
1 parent 5396f91 commit 12f2c66

2 files changed: 8 additions, 22 deletions

2 files changed

+8
-22
lines changed

app/lib/search/token_index.dart

Lines changed: 7 additions & 21 deletions
@@ -147,15 +147,11 @@ class TokenIndex {
   /// Maps token Strings to a weighted documents (addressed via indexes).
   final _inverseIds = <String, Map<int, double>>{};
 
-  /// {id: size} map to store a value representative to the document length
-  late final List<double> _docWeights;
-
-  late final _length = _docWeights.length;
+  late final _length = _ids.length;
 
   TokenIndex(List<String> ids, List<String?> values) : _ids = ids {
     assert(ids.length == values.length);
     final length = values.length;
-    _docWeights = List<double>.filled(length, 0.0);
     for (var i = 0; i < length; i++) {
       final text = values[i];
 
@@ -166,12 +162,12 @@ class TokenIndex {
       if (tokens == null || tokens.isEmpty) {
         continue;
       }
+      // Document weight is a highly scaled-down proxy of the length.
+      final dw = 1 + math.log(1 + tokens.length) / 100;
       for (final token in tokens.keys) {
         final weights = _inverseIds.putIfAbsent(token, () => {});
-        weights[i] = math.max(weights[i] ?? 0.0, tokens[token]!);
+        weights[i] = math.max(weights[i] ?? 0.0, tokens[token]! / dw);
       }
-      // Document weight is a highly scaled-down proxy of the length.
-      _docWeights[i] = 1 + math.log(1 + tokens.length) / 100;
     }
   }
 
@@ -215,7 +211,7 @@ class TokenIndex {
   /// When [limitToIds] is specified, the result will contain only the set of
   /// identifiers in it.
   Map<String, double> _scoreDocs(TokenMatch tokenMatch,
-      {double weight = 1.0, int wordCount = 1, Set<String>? limitToIds}) {
+      {double weight = 1.0, Set<String>? limitToIds}) {
     // Summarize the scores for the documents.
     final docScores = List<double>.filled(_length, 0.0);
     for (final token in tokenMatch.tokens) {
@@ -226,11 +222,6 @@ class TokenIndex {
       }
     }
 
-    // In multi-word queries we will penalize the score with the document size
-    // for each word separately. As these scores will be multiplied, we need to
-    // compensate the formula in order to prevent multiple exponential penalties.
-    final double wordSizeExponent = 1.0 / wordCount;
-
     final result = <String, double>{};
     // post-process match weights
     for (var i = 0; i < _length; i++) {
@@ -242,11 +233,7 @@ class TokenIndex {
       if (limitToIds != null && !limitToIds.contains(id)) {
        continue;
       }
-      var dw = _docWeights[i];
-      if (wordCount > 1) {
-        dw = math.pow(dw, wordSizeExponent).toDouble();
-      }
-      result[id] = w * weight / dw;
+      result[id] = w * weight;
     }
     return result;
   }
@@ -255,7 +242,7 @@ class TokenIndex {
   /// scoring.
   @visibleForTesting
   Map<String, double> search(String text) {
-    return _scoreDocs(lookupTokens(text));
+    return searchWords(splitForQuery(text))._values;
   }
 
   /// Search the index for [words], with a (term-match / document coverage percent)
@@ -271,7 +258,6 @@ class TokenIndex {
     final values = _scoreDocs(
       tokens,
       weight: weight,
-      wordCount: words.length,
       limitToIds: limitToIds,
     );
     if (values.isEmpty) {
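For reference, a minimal standalone Dart sketch (not part of this commit) of the doc-weight correction as it is now folded into the pre-calculated term weights; the document ids and token maps below are made-up examples:

import 'dart:math' as math;

// Hypothetical example data; the real index derives token weights from the
// analyzed document text.
void main() {
  final docs = <String, Map<String, double>>{
    'doc-short': {'riak': 1.0, 'client': 1.0},
    'doc-long': {'client': 1.0, 'http': 1.0, 'json': 1.0, 'io': 1.0},
  };

  docs.forEach((id, tokens) {
    // Document weight is a highly scaled-down proxy of the length.
    final dw = 1 + math.log(1 + tokens.length) / 100;
    // Dividing each term weight by dw once at index time replaces the old
    // query-time division and the per-word exponent compensation.
    final corrected = tokens.map((token, w) => MapEntry(token, w / dw));
    print('$id: dw=${dw.toStringAsFixed(4)} -> $corrected');
  });
}

With two tokens dw is about 1.011 and with four tokens about 1.016, so longer documents get a slightly smaller per-term weight without any extra correction at query time.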

app/test/search/token_index_test.dart

Lines changed: 1 addition & 1 deletion
@@ -68,7 +68,7 @@ void main() {
     });
 
     expect(index.search('riak client'), {
-      'uri://riak_client': closeTo(0.99, 0.01),
+      'uri://riak_client': closeTo(0.98, 0.01),
     });
   });
 
