Skip to content

Commit 61e2c75

Browse files
authored
Improve token index performance by using less memory during search. (#8152)
1 parent 4869d77 commit 61e2c75

File tree

5 files changed

+177
-102
lines changed

5 files changed

+177
-102
lines changed
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
// Copyright (c) 2024, the Dart project authors. Please see the AUTHORS file
2+
// for details. All rights reserved. Use of this source code is governed by a
3+
// BSD-style license that can be found in the LICENSE file.
4+
5+
import 'dart:convert';
6+
import 'dart:io';
7+
8+
import 'package:pub_dev/package/overrides.dart';
9+
import 'package:pub_dev/search/mem_index.dart';
10+
import 'package:pub_dev/search/models.dart';
11+
import 'package:pub_dev/search/search_service.dart';
12+
13+
/// Loads a search snapshot and executes queries on it, benchmarking their total time to complete.
14+
Future<void> main(List<String> args) async {
15+
// Assumes that the first argument is a search snapshot file.
16+
final file = File(args.first);
17+
final content =
18+
json.decode(utf8.decode(gzip.decode(await file.readAsBytes())))
19+
as Map<String, Object?>;
20+
final snapshot = SearchSnapshot.fromJson(content);
21+
snapshot.documents!
22+
.removeWhere((packageName, doc) => isSoftRemoved(packageName));
23+
final index = InMemoryPackageIndex(documents: snapshot.documents!.values);
24+
25+
// NOTE: please add more queries to this list, especially if there is a performance bottleneck.
26+
final queries = [
27+
'json',
28+
'camera',
29+
'android camera',
30+
'sql database',
31+
];
32+
33+
final sw = Stopwatch()..start();
34+
var count = 0;
35+
for (var i = 0; i < 100; i++) {
36+
index.search(ServiceSearchQuery.parse(query: queries[i % queries.length]));
37+
count++;
38+
}
39+
sw.stop();
40+
print('${(sw.elapsedMilliseconds / count).toStringAsFixed(2)} ms/request');
41+
}

app/lib/search/mem_index.dart

Lines changed: 55 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,12 @@ final _logger = Logger('search.mem_index');
1919
final _textSearchTimeout = Duration(milliseconds: 500);
2020

2121
class InMemoryPackageIndex {
22-
final Map<String, PackageDocument> _packages = <String, PackageDocument>{};
22+
final List<PackageDocument> _documents;
23+
final _documentsByName = <String, PackageDocument>{};
2324
final _packageNameIndex = PackageNameIndex();
24-
final TokenIndex _descrIndex = TokenIndex();
25-
final TokenIndex _readmeIndex = TokenIndex();
26-
final TokenIndex _apiSymbolIndex = TokenIndex();
25+
late final TokenIndex _descrIndex;
26+
late final TokenIndex _readmeIndex;
27+
late final TokenIndex _apiSymbolIndex;
2728

2829
/// Adjusted score takes the overall score and transforms
2930
/// it linearly into the [0.4-1.0] range.
@@ -39,13 +40,38 @@ class InMemoryPackageIndex {
3940

4041
InMemoryPackageIndex({
4142
required Iterable<PackageDocument> documents,
42-
}) {
43-
for (final doc in documents) {
44-
_addPackage(doc);
43+
}) : _documents = [...documents] {
44+
final apiDocPageKeys = <String>[];
45+
final apiDocPageValues = <String>[];
46+
for (final doc in _documents) {
47+
_documentsByName[doc.package] = doc;
48+
_packageNameIndex.add(doc.package);
49+
50+
final apiDocPages = doc.apiDocPages;
51+
if (apiDocPages != null) {
52+
for (final page in apiDocPages) {
53+
if (page.symbols != null && page.symbols!.isNotEmpty) {
54+
apiDocPageKeys.add(_apiDocPageId(doc.package, page));
55+
apiDocPageValues.add(page.symbols!.join(' '));
56+
}
57+
}
58+
}
4559
}
60+
61+
final packageKeys = _documents.map((d) => d.package).toList();
62+
_descrIndex = TokenIndex(
63+
packageKeys,
64+
_documents.map((d) => d.description).toList(),
65+
);
66+
_readmeIndex = TokenIndex(
67+
packageKeys,
68+
_documents.map((d) => d.readme).toList(),
69+
);
70+
_apiSymbolIndex = TokenIndex(apiDocPageKeys, apiDocPageValues);
71+
4672
// update like scores only if they were not set (should happen only in local tests)
47-
if (_packages.values.any((e) => e.likeScore == null)) {
48-
_packages.values.updateLikeScores();
73+
if (_documentsByName.values.any((e) => e.likeScore == null)) {
74+
_documentsByName.values.updateLikeScores();
4975
}
5076
_updateOverallScores();
5177
_lastUpdated = clock.now().toUtc();
@@ -64,49 +90,37 @@ class InMemoryPackageIndex {
6490
IndexInfo indexInfo() {
6591
return IndexInfo(
6692
isReady: true,
67-
packageCount: _packages.length,
93+
packageCount: _documentsByName.length,
6894
lastUpdated: _lastUpdated,
6995
);
7096
}
7197

72-
void _addPackage(PackageDocument doc) {
73-
_packages[doc.package] = doc;
74-
_packageNameIndex.add(doc.package);
75-
_descrIndex.add(doc.package, doc.description);
76-
_readmeIndex.add(doc.package, doc.readme);
77-
78-
for (final ApiDocPage page in doc.apiDocPages ?? const []) {
79-
final pageId = _apiDocPageId(doc.package, page);
80-
if (page.symbols != null && page.symbols!.isNotEmpty) {
81-
_apiSymbolIndex.add(pageId, page.symbols!.join(' '));
82-
}
83-
}
84-
}
85-
8698
PackageSearchResult search(ServiceSearchQuery query) {
87-
final packages = Set<String>.of(_packages.keys);
99+
final packages = Set<String>.of(_documentsByName.keys);
88100

89101
// filter on package prefix
90102
if (query.parsedQuery.packagePrefix != null) {
91103
final String prefix = query.parsedQuery.packagePrefix!.toLowerCase();
92104
packages.removeWhere(
93-
(package) =>
94-
!_packages[package]!.package.toLowerCase().startsWith(prefix),
105+
(package) => !_documentsByName[package]!
106+
.package
107+
.toLowerCase()
108+
.startsWith(prefix),
95109
);
96110
}
97111

98112
// filter on tags
99113
final combinedTagsPredicate =
100114
query.tagsPredicate.appendPredicate(query.parsedQuery.tagsPredicate);
101115
if (combinedTagsPredicate.isNotEmpty) {
102-
packages.retainWhere((package) =>
103-
combinedTagsPredicate.matches(_packages[package]!.tagsForLookup));
116+
packages.retainWhere((package) => combinedTagsPredicate
117+
.matches(_documentsByName[package]!.tagsForLookup));
104118
}
105119

106120
// filter on dependency
107121
if (query.parsedQuery.hasAnyDependency) {
108122
packages.removeWhere((package) {
109-
final doc = _packages[package]!;
123+
final doc = _documentsByName[package]!;
110124
if (doc.dependencies.isEmpty) return true;
111125
for (final dependency in query.parsedQuery.allDependencies) {
112126
if (!doc.dependencies.containsKey(dependency)) return true;
@@ -122,7 +136,7 @@ class InMemoryPackageIndex {
122136
// filter on points
123137
if (query.minPoints != null && query.minPoints! > 0) {
124138
packages.removeWhere((package) {
125-
final doc = _packages[package]!;
139+
final doc = _documentsByName[package]!;
126140
return doc.grantedPoints < query.minPoints!;
127141
});
128142
}
@@ -132,7 +146,7 @@ class InMemoryPackageIndex {
132146
if (updatedDuration != null && updatedDuration > Duration.zero) {
133147
final now = clock.now();
134148
packages.removeWhere((package) {
135-
final doc = _packages[package]!;
149+
final doc = _documentsByName[package]!;
136150
final diff = now.difference(doc.updated);
137151
return diff > updatedDuration;
138152
});
@@ -163,7 +177,8 @@ class InMemoryPackageIndex {
163177
.map((key, value) => value * _adjustedOverallScores[key]!);
164178
// If the search hits have an exact name match, we move it to the front of the result list.
165179
final parsedQueryText = query.parsedQuery.text;
166-
if (parsedQueryText != null && _packages.containsKey(parsedQueryText)) {
180+
if (parsedQueryText != null &&
181+
_documentsByName.containsKey(parsedQueryText)) {
167182
nameMatches = <String>[parsedQueryText];
168183
}
169184
packageHits = _rankWithValues(overallScore.getValues());
@@ -215,7 +230,7 @@ class InMemoryPackageIndex {
215230

216231
/// Update the overall score both on [PackageDocument] and in the [_adjustedOverallScores] map.
217232
void _updateOverallScores() {
218-
for (final doc in _packages.values) {
233+
for (final doc in _documentsByName.values) {
219234
final downloadScore = doc.popularityScore ?? 0.0;
220235
final likeScore = doc.likeScore ?? 0.0;
221236
final popularity = (downloadScore + likeScore) / 2;
@@ -316,7 +331,7 @@ class InMemoryPackageIndex {
316331
if (!aborted && phrases.isNotEmpty) {
317332
final matched = <String, double>{};
318333
for (final package in score.getKeys()) {
319-
final doc = _packages[package]!;
334+
final doc = _documentsByName[package]!;
320335
final bool matchedAllPhrases = phrases.every((phrase) =>
321336
doc.package.contains(phrase) ||
322337
doc.description!.contains(phrase) ||
@@ -341,7 +356,8 @@ class InMemoryPackageIndex {
341356
final int scoreCompare = -a.score!.compareTo(b.score!);
342357
if (scoreCompare != 0) return scoreCompare;
343358
// if two packages got the same score, order by last updated
344-
return _compareUpdated(_packages[a.package]!, _packages[b.package]!);
359+
return _compareUpdated(
360+
_documentsByName[a.package]!, _documentsByName[b.package]!);
345361
});
346362
return list;
347363
}
@@ -350,11 +366,12 @@ class InMemoryPackageIndex {
350366
int Function(PackageDocument a, PackageDocument b) compare, {
351367
double Function(PackageDocument doc)? score,
352368
}) {
353-
final list = _packages.values
369+
final list = _documentsByName.values
354370
.map((doc) => PackageHit(
355371
package: doc.package, score: score == null ? null : score(doc)))
356372
.toList();
357-
list.sort((a, b) => compare(_packages[a.package]!, _packages[b.package]!));
373+
list.sort((a, b) =>
374+
compare(_documentsByName[a.package]!, _documentsByName[b.package]!));
358375
return list;
359376
}
360377

app/lib/search/sdk_mem_index.dart

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ class SdkMemIndex {
8080
DartdocIndex index, {
8181
Set<String>? allowedLibraries,
8282
}) async {
83+
final textsPerLibrary = <String, Map<String, String>>{};
8384
for (final f in index.entries) {
8485
final library = f.qualifiedName?.split('.').first;
8586
if (library == null) continue;
@@ -92,10 +93,15 @@ class SdkMemIndex {
9293
if (f.isLibrary) {
9394
_baseUriPerLibrary[library] = _baseUri.resolve(f.href!).toString();
9495
}
95-
final tokens = _tokensPerLibrary.putIfAbsent(library, () => TokenIndex());
9696

9797
final text = f.qualifiedName?.replaceAll('.', ' ').replaceAll(':', ' ');
98-
tokens.add(f.href!, text);
98+
if (text != null && text.isNotEmpty) {
99+
final texts = textsPerLibrary.putIfAbsent(library, () => {});
100+
texts[f.href!] = text;
101+
}
102+
}
103+
for (final e in textsPerLibrary.entries) {
104+
_tokensPerLibrary[e.key] = TokenIndex.fromMap(e.value);
99105
}
100106
}
101107

app/lib/search/token_index.dart

Lines changed: 51 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -155,31 +155,43 @@ class TokenMatch {
155155

156156
/// Stores a token -> documentId inverted index with weights.
157157
class TokenIndex {
158-
/// Maps token Strings to a weighted map of document ids.
159-
final _inverseIds = <String, Map<String, double>>{};
158+
final List<String> _ids;
159+
160+
/// Maps token Strings to a weighted documents (addressed via indexes).
161+
final _inverseIds = <String, Map<int, double>>{};
160162

161163
/// {id: size} map to store a value representative to the document length
162-
final _docSizes = <String, double>{};
164+
late final List<double> _docWeights;
163165

164-
/// The number of tokens stored in the index.
165-
int get tokenCount => _inverseIds.length;
166+
late final _length = _docWeights.length;
166167

167-
int get documentCount => _docSizes.length;
168+
/// Builds the inverted index from the parallel [ids] and [values] lists.
///
/// `values[i]` holds the text of the document identified by `ids[i]`.
/// A `null` text, or one that yields no tokens, leaves the document with a
/// zero weight and no entries in the inverted index.
TokenIndex(List<String> ids, List<String?> values) : _ids = ids {
  assert(ids.length == values.length);
  final length = values.length;
  _docWeights = List<double>.filled(length, 0.0);
  for (var i = 0; i < length; i++) {
    final text = values[i];
    if (text == null) continue;
    final tokens = tokenize(text);
    if (tokens == null || tokens.isEmpty) continue;
    for (final entry in tokens.entries) {
      // Keep the highest weight seen for this (token, document) pair.
      final weights = _inverseIds.putIfAbsent(entry.key, () => {});
      final previous = weights[i] ?? 0.0;
      weights[i] = math.max(previous, entry.value);
    }
    // Document weight is a highly scaled-down proxy of the length.
    _docWeights[i] = 1 + math.log(1 + tokens.length) / 100;
  }
}
190+
191+
/// Creates a [TokenIndex] from an id-to-text [map].
///
/// The iteration order of [map] determines the document order in the index.
factory TokenIndex.fromMap(Map<String, String> map) =>
    TokenIndex(map.keys.toList(), map.values.toList());
184196

185197
/// Match the text against the corpus and return the tokens or
@@ -191,9 +203,8 @@ class TokenIndex {
191203
for (final word in splitForIndexing(text)) {
192204
final tokens = tokenize(word, isSplit: true) ?? {};
193205

194-
final present = tokens.keys
195-
.where((token) => (_inverseIds[token]?.length ?? 0) > 0)
196-
.toList();
206+
final present =
207+
tokens.keys.where((token) => _inverseIds.containsKey(token)).toList();
197208
if (present.isEmpty) {
198209
return TokenMatch();
199210
}
@@ -219,14 +230,12 @@ class TokenIndex {
219230
Map<String, double> _scoreDocs(TokenMatch tokenMatch,
220231
{double weight = 1.0, int wordCount = 1, Set<String>? limitToIds}) {
221232
// Summarize the scores for the documents.
222-
final docScores = <String, double>{};
233+
final docScores = List<double>.filled(_length, 0.0);
223234
for (final token in tokenMatch.tokens) {
224235
final docWeights = _inverseIds[token]!;
225236
for (final e in docWeights.entries) {
226-
if (limitToIds != null && !limitToIds.contains(e.key)) continue;
227-
final double prevValue = docScores[e.key] ?? 0.0;
228-
final double currentValue = tokenMatch[token]! * e.value;
229-
docScores[e.key] = math.max(prevValue, currentValue);
237+
final i = e.key;
238+
docScores[i] = math.max(docScores[i], tokenMatch[token]! * e.value);
230239
}
231240
}
232241

@@ -235,15 +244,24 @@ class TokenIndex {
235244
// compensate the formula in order to prevent multiple exponential penalties.
236245
final double wordSizeExponent = 1.0 / wordCount;
237246

247+
final result = <String, double>{};
238248
// post-process match weights
239-
docScores.updateAll((id, docScore) {
240-
var docSize = _docSizes[id]!;
249+
for (var i = 0; i < _length; i++) {
250+
final id = _ids[i];
251+
final w = docScores[i];
252+
if (w <= 0.0) {
253+
continue;
254+
}
255+
if (limitToIds != null && !limitToIds.contains(id)) {
256+
continue;
257+
}
258+
var dw = _docWeights[i];
241259
if (wordCount > 1) {
242-
docSize = math.pow(docSize, wordSizeExponent).toDouble();
260+
dw = math.pow(dw, wordSizeExponent).toDouble();
243261
}
244-
return weight * docScore / docSize;
245-
});
246-
return docScores;
262+
result[id] = w * weight / dw;
263+
}
264+
return result;
247265
}
248266

249267
/// Search the index for [text], with a (term-match / document coverage percent)

0 commit comments

Comments
 (0)