Skip to content

Commit f45a893

Browse files
committed
WIP: optimize indexing
1 parent 10eddf7 commit f45a893

File tree

1 file changed

+86
-41
lines changed

1 file changed

+86
-41
lines changed

bdx/index.py

Lines changed: 86 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -759,10 +759,17 @@ def add_symbol(self, symbol: Symbol):
759759
document.set_data(pickle.dumps(symbol))
760760
db.add_document(document)
761761

762-
def delete_file(self, file: Path):
763-
"""Delete all documents for the given file path."""
764-
term_with_prefix = self.schema["path"].prefix + str(file)
765-
self._live_writable_db().delete_document(term_with_prefix)
762+
def replace_symbol(self, old_document: xapian.Document, symbol: Symbol):
    """Overwrite ``old_document`` with a freshly built one for ``symbol``.

    A new document is indexed from the fields of ``symbol`` and carries
    the pickled symbol as its data.  It is stored under the document ID
    of ``old_document``, so the old entry is reused rather than removed
    and re-added.
    """
    writable_db = self._live_writable_db()

    replacement = xapian.Document()
    self.schema.index_document(replacement, **asdict(symbol))

    serialized = pickle.dumps(symbol)
    replacement.set_data(serialized)

    target_docid = old_document.get_docid()
    writable_db.replace_document(target_docid, replacement)
769+
770+
def delete_document(self, doc: xapian.Document):
    """Remove ``doc`` from this index, addressing it by its document ID."""
    writable_db = self._live_writable_db()
    writable_db.delete_document(doc.get_docid())
766773

767774
def all_files(self) -> Iterator[Path]:
768775
"""Yield all the files indexed in this SymbolIndex."""
@@ -787,6 +794,22 @@ def all_files(self) -> Iterator[Path]:
787794
seen_paths.add(path)
788795
yield path
789796

797+
def get_docs_for_path(self, path: Path) -> list[xapian.Document]:
    """Return every document posted under the ``path`` term for ``path``.

    The lookup term is the ``path`` field prefix followed by the string
    form of ``path``, both encoded to bytes and cut to ``MAX_TERM_SIZE``.
    """
    prefix_bytes = self.schema["path"].prefix.encode()
    # NOTE(review): because the term is truncated, very long paths that
    # share their first MAX_TERM_SIZE bytes presumably map to the same
    # term -- confirm the indexing side truncates the same way.
    term = (prefix_bytes + str(path).encode())[:MAX_TERM_SIZE]

    db = self._live_db()
    postings: xapian.PostingIter = db.postlist(term)  # pyright: ignore

    return [db.get_document(posting.docid) for posting in postings]
812+
790813
def iter_prefix(self, field: str, value_prefix: str) -> Iterator[str]:
791814
"""Return all the possible values for ``field`` with given prefix."""
792815
db = self._live_db()
@@ -896,6 +919,7 @@ def _index_single_file(
896919
file: Path,
897920
options: IndexingOptions,
898921
use_compilation_database: bool,
922+
outdated_documents: list[xapian.Document],
899923
) -> int:
900924
try:
901925
symtab = read_symbols_in_file(
@@ -923,28 +947,33 @@ def _index_single_file(
923947
symbol.mtime,
924948
)
925949

926-
index.add_symbol(symbol)
950+
if outdated_documents:
951+
index.replace_symbol(outdated_documents.pop(), symbol)
952+
else:
953+
index.add_symbol(symbol)
927954

928955
num += 1
929956

930957
if num == 0:
931958
trace("{}: No symbols found", file)
932959
# Add a single document if there are no symbols. Otherwise,
933960
# we would always treat it as unindexed.
934-
index.add_symbol(
935-
Symbol(
936-
path=file,
937-
source=None,
938-
name="",
939-
demangled=None,
940-
section="",
941-
address=0,
942-
size=0,
943-
type=SymbolType.NOTYPE,
944-
relocations=list(),
945-
mtime=file.stat().st_mtime_ns,
946-
)
961+
symbol = Symbol(
962+
path=file,
963+
source=None,
964+
name="",
965+
demangled=None,
966+
section="",
967+
address=0,
968+
size=0,
969+
type=SymbolType.NOTYPE,
970+
relocations=list(),
971+
mtime=file.stat().st_mtime_ns,
947972
)
973+
if outdated_documents:
974+
index.replace_symbol(outdated_documents.pop(), symbol)
975+
else:
976+
index.add_symbol(symbol)
948977
num += 1
949978

950979
trace("{}: Adding {} symbol(s) to index", file, num)
@@ -960,12 +989,14 @@ def __init__(
960989
options: IndexingOptions,
961990
should_quit: Callable[[], bool],
962991
index_path: Path,
992+
files_to_delete: Collection[Path],
963993
use_compilation_database: bool,
964994
dry_run: bool,
965995
):
966996
self.options = options
967997
self.should_quit = should_quit
968998
self.index_path = index_path
999+
self.files_to_delete = set(files_to_delete)
9691000
self.use_compilation_database = use_compilation_database
9701001
self.dry_run = dry_run
9711002

@@ -1050,6 +1081,20 @@ def _worker(self):
10501081
SymbolIndex.open_shard(self.index_path) as index,
10511082
index.transaction(),
10521083
):
1084+
deletable_files = set(self.files_to_delete).intersection(
1085+
set(index.all_files())
1086+
)
1087+
1088+
outdated_documents: list[xapian.Document] = []
1089+
for f in deletable_files:
1090+
docs = index.get_docs_for_path(f)
1091+
outdated_documents.extend(docs)
1092+
1093+
debug(
1094+
"There are {} outdated documents to recycle",
1095+
len(outdated_documents),
1096+
)
1097+
10531098
while not self._stop_event.is_set():
10541099
parent = mp.parent_process()
10551100
if parent is not None and not parent.is_alive():
@@ -1069,10 +1114,20 @@ def _worker(self):
10691114
path,
10701115
self.options,
10711116
self.use_compilation_database,
1117+
outdated_documents,
10721118
)
10731119

10741120
self._result_queue.put(result)
10751121

1122+
trace(
1123+
"There are {} outdated documents to delete",
1124+
len(outdated_documents),
1125+
)
1126+
1127+
if not self.dry_run:
1128+
for doc in outdated_documents:
1129+
index.delete_document(doc)
1130+
10761131

10771132
def index_binary_directory(
10781133
directory: str | Path,
@@ -1136,30 +1191,19 @@ def index_binary_directory(
11361191
stats.num_files_changed = len(changed_files)
11371192
stats.num_files_deleted = len(deleted_files)
11381193

1139-
def unindex_file(path, is_deleted):
1140-
if dry_run:
1141-
if path in existing_files:
1142-
if is_deleted:
1143-
print(f"unindex-deleted-file {path}")
1144-
else:
1145-
print(f"unindex-outdated-file {path}")
1146-
else:
1147-
index.delete_file(path)
1194+
def log_unindex_file(path, is_deleted):
    """Report that ``path`` is about to be dropped from the index.

    In dry-run mode, a machine-readable line is printed for files that
    are currently indexed (``unindex-deleted-file`` or
    ``unindex-outdated-file``).  A debug message is emitted for every
    file, in dry-run mode or not.

    Args:
        path: The file being unindexed.
        is_deleted: True if the file no longer exists, False if it was
            only modified.
    """
    if dry_run and path in existing_files:
        if is_deleted:
            print(f"unindex-deleted-file {path}")
        else:
            print(f"unindex-outdated-file {path}")

    # Log the ``path`` argument itself rather than a loop variable picked
    # up from the enclosing scope, and do so outside the dry-run guard so
    # that real runs still record which files were modified or deleted.
    if is_deleted:
        debug("File deleted: {}", path)
    else:
        debug("File modified: {}", path)
11481202

1149-
for file in make_progress_bar(
1150-
changed_files,
1151-
desc="Removing outdated files",
1152-
leave=False,
1153-
):
1154-
unindex_file(file, is_deleted=False)
1155-
debug("File modified: {}", file)
1156-
for file in make_progress_bar(
1157-
deleted_files,
1158-
desc="Removing deleted files",
1159-
leave=False,
1160-
):
1161-
unindex_file(file, is_deleted=True)
1162-
debug("File deleted: {}", file)
1203+
for file in changed_files:
1204+
log_unindex_file(file, is_deleted=False)
1205+
for file in deleted_files:
1206+
log_unindex_file(file, is_deleted=True)
11631207

11641208
if options.save_filters:
11651209
saved_exclusions.extend(original_exclusions)
@@ -1172,6 +1216,7 @@ def unindex_file(path, is_deleted):
11721216
options,
11731217
interrupted,
11741218
index_path,
1219+
files_to_delete=changed_files + deleted_files,
11751220
use_compilation_database=use_compilation_database,
11761221
dry_run=dry_run,
11771222
) as pool,

0 commit comments

Comments
 (0)