Skip to content

Commit 61e2c75

Browse files
authored
Improve token index performance by using less memory during search. (#8152)
1 parent 4869d77 commit 61e2c75

File tree

5 files changed

+177
-102
lines changed

5 files changed

+177
-102
lines changed
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
// Copyright (c) 2024, the Dart project authors. Please see the AUTHORS file
2+
// for details. All rights reserved. Use of this source code is governed by a
3+
// BSD-style license that can be found in the LICENSE file.
4+
5+
import 'dart:convert';
6+
import 'dart:io';
7+
8+
import 'package:pub_dev/package/overrides.dart';
9+
import 'package:pub_dev/search/mem_index.dart';
10+
import 'package:pub_dev/search/models.dart';
11+
import 'package:pub_dev/search/search_service.dart';
12+
13+
/// Loads a search snapshot and executes queries on it, benchmarking their total time to complete.
14+
Future<void> main(List<String> args) async {
15+
// Assumes that the first argument is a search snapshot file.
16+
final file = File(args.first);
17+
final content =
18+
json.decode(utf8.decode(gzip.decode(await file.readAsBytes())))
19+
as Map<String, Object?>;
20+
final snapshot = SearchSnapshot.fromJson(content);
21+
snapshot.documents!
22+
.removeWhere((packageName, doc) => isSoftRemoved(packageName));
23+
final index = InMemoryPackageIndex(documents: snapshot.documents!.values);
24+
25+
// NOTE: please add more queries to this list, especially if there is a performance bottleneck.
26+
final queries = [
27+
'json',
28+
'camera',
29+
'android camera',
30+
'sql database',
31+
];
32+
33+
final sw = Stopwatch()..start();
34+
var count = 0;
35+
for (var i = 0; i < 100; i++) {
36+
index.search(ServiceSearchQuery.parse(query: queries[i % queries.length]));
37+
count++;
38+
}
39+
sw.stop();
40+
print('${(sw.elapsedMilliseconds / count).toStringAsFixed(2)} ms/request');
41+
}

app/lib/search/mem_index.dart

Lines changed: 55 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,12 @@ final _logger = Logger('search.mem_index');
1919
final _textSearchTimeout = Duration(milliseconds: 500);
2020

2121
class InMemoryPackageIndex {
22-
final Map<String, PackageDocument> _packages = <String, PackageDocument>{};
22+
final List<PackageDocument> _documents;
23+
final _documentsByName = <String, PackageDocument>{};
2324
final _packageNameIndex = PackageNameIndex();
24-
final TokenIndex _descrIndex = TokenIndex();
25-
final TokenIndex _readmeIndex = TokenIndex();
26-
final TokenIndex _apiSymbolIndex = TokenIndex();
25+
late final TokenIndex _descrIndex;
26+
late final TokenIndex _readmeIndex;
27+
late final TokenIndex _apiSymbolIndex;
2728

2829
/// Adjusted score takes the overall score and transforms
2930
/// it linearly into the [0.4-1.0] range.
@@ -39,13 +40,38 @@ class InMemoryPackageIndex {
3940

4041
InMemoryPackageIndex({
4142
required Iterable<PackageDocument> documents,
42-
}) {
43-
for (final doc in documents) {
44-
_addPackage(doc);
43+
}) : _documents = [...documents] {
44+
final apiDocPageKeys = <String>[];
45+
final apiDocPageValues = <String>[];
46+
for (final doc in _documents) {
47+
_documentsByName[doc.package] = doc;
48+
_packageNameIndex.add(doc.package);
49+
50+
final apiDocPages = doc.apiDocPages;
51+
if (apiDocPages != null) {
52+
for (final page in apiDocPages) {
53+
if (page.symbols != null && page.symbols!.isNotEmpty) {
54+
apiDocPageKeys.add(_apiDocPageId(doc.package, page));
55+
apiDocPageValues.add(page.symbols!.join(' '));
56+
}
57+
}
58+
}
4559
}
60+
61+
final packageKeys = _documents.map((d) => d.package).toList();
62+
_descrIndex = TokenIndex(
63+
packageKeys,
64+
_documents.map((d) => d.description).toList(),
65+
);
66+
_readmeIndex = TokenIndex(
67+
packageKeys,
68+
_documents.map((d) => d.readme).toList(),
69+
);
70+
_apiSymbolIndex = TokenIndex(apiDocPageKeys, apiDocPageValues);
71+
4672
// update like scores only if they were not set (should happen only in local tests)
47-
if (_packages.values.any((e) => e.likeScore == null)) {
48-
_packages.values.updateLikeScores();
73+
if (_documentsByName.values.any((e) => e.likeScore == null)) {
74+
_documentsByName.values.updateLikeScores();
4975
}
5076
_updateOverallScores();
5177
_lastUpdated = clock.now().toUtc();
@@ -64,49 +90,37 @@ class InMemoryPackageIndex {
6490
IndexInfo indexInfo() {
6591
return IndexInfo(
6692
isReady: true,
67-
packageCount: _packages.length,
93+
packageCount: _documentsByName.length,
6894
lastUpdated: _lastUpdated,
6995
);
7096
}
7197

72-
void _addPackage(PackageDocument doc) {
73-
_packages[doc.package] = doc;
74-
_packageNameIndex.add(doc.package);
75-
_descrIndex.add(doc.package, doc.description);
76-
_readmeIndex.add(doc.package, doc.readme);
77-
78-
for (final ApiDocPage page in doc.apiDocPages ?? const []) {
79-
final pageId = _apiDocPageId(doc.package, page);
80-
if (page.symbols != null && page.symbols!.isNotEmpty) {
81-
_apiSymbolIndex.add(pageId, page.symbols!.join(' '));
82-
}
83-
}
84-
}
85-
8698
PackageSearchResult search(ServiceSearchQuery query) {
87-
final packages = Set<String>.of(_packages.keys);
99+
final packages = Set<String>.of(_documentsByName.keys);
88100

89101
// filter on package prefix
90102
if (query.parsedQuery.packagePrefix != null) {
91103
final String prefix = query.parsedQuery.packagePrefix!.toLowerCase();
92104
packages.removeWhere(
93-
(package) =>
94-
!_packages[package]!.package.toLowerCase().startsWith(prefix),
105+
(package) => !_documentsByName[package]!
106+
.package
107+
.toLowerCase()
108+
.startsWith(prefix),
95109
);
96110
}
97111

98112
// filter on tags
99113
final combinedTagsPredicate =
100114
query.tagsPredicate.appendPredicate(query.parsedQuery.tagsPredicate);
101115
if (combinedTagsPredicate.isNotEmpty) {
102-
packages.retainWhere((package) =>
103-
combinedTagsPredicate.matches(_packages[package]!.tagsForLookup));
116+
packages.retainWhere((package) => combinedTagsPredicate
117+
.matches(_documentsByName[package]!.tagsForLookup));
104118
}
105119

106120
// filter on dependency
107121
if (query.parsedQuery.hasAnyDependency) {
108122
packages.removeWhere((package) {
109-
final doc = _packages[package]!;
123+
final doc = _documentsByName[package]!;
110124
if (doc.dependencies.isEmpty) return true;
111125
for (final dependency in query.parsedQuery.allDependencies) {
112126
if (!doc.dependencies.containsKey(dependency)) return true;
@@ -122,7 +136,7 @@ class InMemoryPackageIndex {
122136
// filter on points
123137
if (query.minPoints != null && query.minPoints! > 0) {
124138
packages.removeWhere((package) {
125-
final doc = _packages[package]!;
139+
final doc = _documentsByName[package]!;
126140
return doc.grantedPoints < query.minPoints!;
127141
});
128142
}
@@ -132,7 +146,7 @@ class InMemoryPackageIndex {
132146
if (updatedDuration != null && updatedDuration > Duration.zero) {
133147
final now = clock.now();
134148
packages.removeWhere((package) {
135-
final doc = _packages[package]!;
149+
final doc = _documentsByName[package]!;
136150
final diff = now.difference(doc.updated);
137151
return diff > updatedDuration;
138152
});
@@ -163,7 +177,8 @@ class InMemoryPackageIndex {
163177
.map((key, value) => value * _adjustedOverallScores[key]!);
164178
// If the search hits have an exact name match, we move it to the front of the result list.
165179
final parsedQueryText = query.parsedQuery.text;
166-
if (parsedQueryText != null && _packages.containsKey(parsedQueryText)) {
180+
if (parsedQueryText != null &&
181+
_documentsByName.containsKey(parsedQueryText)) {
167182
nameMatches = <String>[parsedQueryText];
168183
}
169184
packageHits = _rankWithValues(overallScore.getValues());
@@ -215,7 +230,7 @@ class InMemoryPackageIndex {
215230

216231
/// Update the overall score both on [PackageDocument] and in the [_adjustedOverallScores] map.
217232
void _updateOverallScores() {
218-
for (final doc in _packages.values) {
233+
for (final doc in _documentsByName.values) {
219234
final downloadScore = doc.popularityScore ?? 0.0;
220235
final likeScore = doc.likeScore ?? 0.0;
221236
final popularity = (downloadScore + likeScore) / 2;
@@ -316,7 +331,7 @@ class InMemoryPackageIndex {
316331
if (!aborted && phrases.isNotEmpty) {
317332
final matched = <String, double>{};
318333
for (final package in score.getKeys()) {
319-
final doc = _packages[package]!;
334+
final doc = _documentsByName[package]!;
320335
final bool matchedAllPhrases = phrases.every((phrase) =>
321336
doc.package.contains(phrase) ||
322337
doc.description!.contains(phrase) ||
@@ -341,7 +356,8 @@ class InMemoryPackageIndex {
341356
final int scoreCompare = -a.score!.compareTo(b.score!);
342357
if (scoreCompare != 0) return scoreCompare;
343358
// if two packages got the same score, order by last updated
344-
return _compareUpdated(_packages[a.package]!, _packages[b.package]!);
359+
return _compareUpdated(
360+
_documentsByName[a.package]!, _documentsByName[b.package]!);
345361
});
346362
return list;
347363
}
@@ -350,11 +366,12 @@ class InMemoryPackageIndex {
350366
int Function(PackageDocument a, PackageDocument b) compare, {
351367
double Function(PackageDocument doc)? score,
352368
}) {
353-
final list = _packages.values
369+
final list = _documentsByName.values
354370
.map((doc) => PackageHit(
355371
package: doc.package, score: score == null ? null : score(doc)))
356372
.toList();
357-
list.sort((a, b) => compare(_packages[a.package]!, _packages[b.package]!));
373+
list.sort((a, b) =>
374+
compare(_documentsByName[a.package]!, _documentsByName[b.package]!));
358375
return list;
359376
}
360377

app/lib/search/sdk_mem_index.dart

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ class SdkMemIndex {
8080
DartdocIndex index, {
8181
Set<String>? allowedLibraries,
8282
}) async {
83+
final textsPerLibrary = <String, Map<String, String>>{};
8384
for (final f in index.entries) {
8485
final library = f.qualifiedName?.split('.').first;
8586
if (library == null) continue;
@@ -92,10 +93,15 @@ class SdkMemIndex {
9293
if (f.isLibrary) {
9394
_baseUriPerLibrary[library] = _baseUri.resolve(f.href!).toString();
9495
}
95-
final tokens = _tokensPerLibrary.putIfAbsent(library, () => TokenIndex());
9696

9797
final text = f.qualifiedName?.replaceAll('.', ' ').replaceAll(':', ' ');
98-
tokens.add(f.href!, text);
98+
if (text != null && text.isNotEmpty) {
99+
final texts = textsPerLibrary.putIfAbsent(library, () => {});
100+
texts[f.href!] = text;
101+
}
102+
}
103+
for (final e in textsPerLibrary.entries) {
104+
_tokensPerLibrary[e.key] = TokenIndex.fromMap(e.value);
99105
}
100106
}
101107

app/lib/search/token_index.dart

Lines changed: 51 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -155,31 +155,43 @@ class TokenMatch {
155155

156156
/// Stores a token -> documentId inverted index with weights.
157157
class TokenIndex {
158-
/// Maps token Strings to a weighted map of document ids.
159-
final _inverseIds = <String, Map<String, double>>{};
158+
final List<String> _ids;
159+
160+
/// Maps token Strings to a weighted documents (addressed via indexes).
161+
final _inverseIds = <String, Map<int, double>>{};
160162

161163
/// {id: size} map to store a value representative to the document length
162-
final _docSizes = <String, double>{};
164+
late final List<double> _docWeights;
163165

164-
/// The number of tokens stored in the index.
165-
int get tokenCount => _inverseIds.length;
166+
late final _length = _docWeights.length;
166167

167-
int get documentCount => _docSizes.length;
168+
/// Builds the inverted index from the parallel [ids] and [values] lists.
///
/// `values[i]` holds the text of the document identified by `ids[i]`.
/// A `null` text, or one that yields no tokens, leaves the document with a
/// zero weight and no entries in the inverted index.
TokenIndex(List<String> ids, List<String?> values) : _ids = ids {
  assert(ids.length == values.length);
  final length = values.length;
  _docWeights = List<double>.filled(length, 0.0);
  for (var i = 0; i < length; i++) {
    final text = values[i];
    if (text == null) continue;
    final tokens = tokenize(text);
    if (tokens == null || tokens.isEmpty) continue;
    for (final entry in tokens.entries) {
      // Keep the highest weight seen for this (token, document) pair.
      final weights = _inverseIds.putIfAbsent(entry.key, () => {});
      final previous = weights[i] ?? 0.0;
      weights[i] = math.max(previous, entry.value);
    }
    // Document weight is a highly scaled-down proxy of the length.
    _docWeights[i] = 1 + math.log(1 + tokens.length) / 100;
  }
}
190+
191+
/// Creates a [TokenIndex] from an id-to-text [map].
///
/// The iteration order of [map] determines the document order in the index.
factory TokenIndex.fromMap(Map<String, String> map) =>
    TokenIndex(map.keys.toList(), map.values.toList());
184196

185197
/// Match the text against the corpus and return the tokens or
@@ -191,9 +203,8 @@ class TokenIndex {
191203
for (final word in splitForIndexing(text)) {
192204
final tokens = tokenize(word, isSplit: true) ?? {};
193205

194-
final present = tokens.keys
195-
.where((token) => (_inverseIds[token]?.length ?? 0) > 0)
196-
.toList();
206+
final present =
207+
tokens.keys.where((token) => _inverseIds.containsKey(token)).toList();
197208
if (present.isEmpty) {
198209
return TokenMatch();
199210
}
@@ -219,14 +230,12 @@ class TokenIndex {
219230
Map<String, double> _scoreDocs(TokenMatch tokenMatch,
220231
{double weight = 1.0, int wordCount = 1, Set<String>? limitToIds}) {
221232
// Summarize the scores for the documents.
222-
final docScores = <String, double>{};
233+
final docScores = List<double>.filled(_length, 0.0);
223234
for (final token in tokenMatch.tokens) {
224235
final docWeights = _inverseIds[token]!;
225236
for (final e in docWeights.entries) {
226-
if (limitToIds != null && !limitToIds.contains(e.key)) continue;
227-
final double prevValue = docScores[e.key] ?? 0.0;
228-
final double currentValue = tokenMatch[token]! * e.value;
229-
docScores[e.key] = math.max(prevValue, currentValue);
237+
final i = e.key;
238+
docScores[i] = math.max(docScores[i], tokenMatch[token]! * e.value);
230239
}
231240
}
232241

@@ -235,15 +244,24 @@ class TokenIndex {
235244
// compensate the formula in order to prevent multiple exponential penalties.
236245
final double wordSizeExponent = 1.0 / wordCount;
237246

247+
final result = <String, double>{};
238248
// post-process match weights
239-
docScores.updateAll((id, docScore) {
240-
var docSize = _docSizes[id]!;
249+
for (var i = 0; i < _length; i++) {
250+
final id = _ids[i];
251+
final w = docScores[i];
252+
if (w <= 0.0) {
253+
continue;
254+
}
255+
if (limitToIds != null && !limitToIds.contains(id)) {
256+
continue;
257+
}
258+
var dw = _docWeights[i];
241259
if (wordCount > 1) {
242-
docSize = math.pow(docSize, wordSizeExponent).toDouble();
260+
dw = math.pow(dw, wordSizeExponent).toDouble();
243261
}
244-
return weight * docScore / docSize;
245-
});
246-
return docScores;
262+
result[id] = w * weight / dw;
263+
}
264+
return result;
247265
}
248266

249267
/// Search the index for [text], with a (term-match / document coverage percent)

0 commit comments

Comments
 (0)