Skip to content

Commit 1b97eb7

Browse files
⚡ Bolt: Batch database inserts in tagging system (#244)
Implemented `save_tagged_files` in `tagging_system.py` using `executemany` to eliminate N+1 database queries when saving tags for multiple files. Refactored `tag_directory` in `tagging_cli.py` to batch tag generation and insertion, greatly improving directory tagging speed.

Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com>
Co-authored-by: thebearwithabite <216692431+thebearwithabite@users.noreply.github.com>
1 parent 8fd3cd6 commit 1b97eb7

2 files changed

Lines changed: 73 additions & 19 deletions

File tree

tagging_cli.py

Lines changed: 32 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -122,28 +122,41 @@ def tag_directory(directory: str, recursive: bool = True, file_pattern: str = "*
122122
successful = 0
123123
failed = 0
124124

125-
for i, file_path in enumerate(files_to_tag, 1):
126-
print(f"\n📄 [{i}/{len(files_to_tag)}] {file_path.name}")
125+
# Process in batches
126+
batch_size = 100
127+
for i in range(0, len(files_to_tag), batch_size):
128+
batch_files = files_to_tag[i:i + batch_size]
129+
tagged_batch = []
127130

128-
try:
129-
tagged_file = tagger.tag_file(file_path)
130-
success = tagger.save_tagged_file(tagged_file)
131-
132-
if success:
133-
successful += 1
134-
top_tags = sorted([(tag, tagged_file.confidence_scores.get(tag, 0))
135-
for tag in tagged_file.auto_tags],
136-
key=lambda x: x[1], reverse=True)[:3]
131+
for j, file_path in enumerate(batch_files, 1):
132+
print(f"\n📄 [{i+j}/{len(files_to_tag)}] {file_path.name}")
133+
try:
134+
tagged_file = tagger.tag_file(file_path)
135+
if tagged_file:
136+
tagged_batch.append(tagged_file)
137+
138+
top_tags = sorted([(tag, tagged_file.confidence_scores.get(tag, 0))
139+
for tag in tagged_file.auto_tags],
140+
key=lambda x: x[1], reverse=True)[:3]
141+
142+
print(f" ✅ Processed with {len(tagged_file.auto_tags)} auto tags")
143+
print(f" Top: {', '.join([f'{tag}({conf:.0%})' for tag, conf in top_tags])}")
144+
else:
145+
failed += 1
146+
print(f" ❌ Failed to generate tags")
147+
except Exception as e:
148+
failed += 1
149+
print(f" ❌ Error processing: {str(e)[:50]}...")
137150

138-
print(f" ✅ Tagged with {len(tagged_file.auto_tags)} auto tags")
139-
print(f" Top: {', '.join([f'{tag}({conf:.0%})' for tag, conf in top_tags])}")
151+
# Save batch
152+
if tagged_batch:
153+
success = tagger.save_tagged_files(tagged_batch)
154+
if success:
155+
successful += len(tagged_batch)
156+
print(f"\n💾 Successfully saved batch of {len(tagged_batch)} files")
140157
else:
141-
failed += 1
142-
print(f" ❌ Failed to save tags")
143-
144-
except Exception as e:
145-
failed += 1
146-
print(f" ❌ Error: {str(e)[:50]}...")
158+
failed += len(tagged_batch)
159+
print(f"\n❌ Failed to save batch")
147160

148161
print(f"\n📊 Tagging Summary:")
149162
print(f" ✅ Successful: {successful}")

tagging_system.py

Lines changed: 41 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -475,6 +475,47 @@ def save_tagged_file(self, tagged_file: TaggedFile) -> bool:
475475
print(f"❌ Error saving tagged file: {e}")
476476
return False
477477

478+
def save_tagged_files(self, tagged_files: List[TaggedFile]) -> bool:
479+
"""Save multiple tagged files to database efficiently"""
480+
if not tagged_files:
481+
return True
482+
483+
try:
484+
with sqlite3.connect(self.db_path) as conn:
485+
params = [
486+
(
487+
str(tf.file_path),
488+
tf.file_path.name,
489+
tf.file_path.suffix.lower(),
490+
tf.file_hash,
491+
json.dumps(tf.auto_tags),
492+
json.dumps(tf.user_tags),
493+
json.dumps(tf.confidence_scores),
494+
json.dumps(tf.tag_sources),
495+
tf.last_tagged.isoformat(),
496+
datetime.now().isoformat()
497+
)
498+
for tf in tagged_files
499+
]
500+
501+
conn.executemany("""
502+
INSERT OR REPLACE INTO file_tags
503+
(file_path, file_name, file_extension, file_hash, auto_tags, user_tags,
504+
confidence_scores, tag_sources, last_tagged, created_date)
505+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
506+
""", params)
507+
508+
# Update relationships and statistics
509+
for tf in tagged_files:
510+
self._update_tag_relationships(tf, conn)
511+
self._update_tag_statistics(tf, conn)
512+
513+
conn.commit()
514+
return True
515+
except Exception as e:
516+
print(f"❌ Error saving tagged files: {e}")
517+
return False
518+
478519
def _update_tag_relationships(self, tagged_file: TaggedFile, db_connection=None):
479520
"""Update co-occurrence relationships between tags"""
480521

0 commit comments

Comments
 (0)