Commit eb53868

⚡ Bolt: Fix N+1 database insertions in BatchProcessor (#248)
Replaced individual per-file SQLite `INSERT` statements with a single batched `executemany` block at the end of the `_process_sequential` and `_process_parallel` functions. Added periodic chunked saves to prevent data loss on crash.

Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com>
Co-authored-by: thebearwithabite <216692431+thebearwithabite@users.noreply.github.com>
1 parent 602181b commit eb53868

2 files changed: 58 additions & 4 deletions


.jules/bolt.md

Lines changed: 8 additions & 0 deletions
```diff
@@ -33,3 +33,11 @@
 ## 2025-05-27 - [Bulk SQLite Inserts and Connection Reuse for Tagging]
 **Learning:** Sequential `.execute` calls for `INSERT OR REPLACE` inside nested loops over large arrays (like tags), coupled with opening independent DB connections per method, create a severe N+1 problem. Benchmarks showed that replacing this with a single shared connection and `executemany` arrays resulted in an ~2x speedup on typical batch tagging workloads.
 **Action:** Always batch related SQL records using `.executemany()` and pass an optional `db_connection` downstream to nested operations instead of establishing a new database connection every time.
+
+## 2025-05-15 - Batched DB Inserts
+**Learning:** Sequential processing loops that insert database records one at a time cause N+1 query bottlenecks and extremely poor disk I/O performance on large batches.
+**Action:** Replace per-record `commit()` calls inside sequential processing loops with a single `executemany` and one batched commit once the entire result set is gathered.
+
+## 2025-05-15 - Batched DB Inserts vs Crash Recovery
+**Learning:** Fully deferring database saves to the end of a long-running batch job with `executemany` solves the N+1 bottleneck, but introduces a risk of data loss if the process crashes midway.
+**Action:** Use periodic chunked batching (e.g., running `executemany` every 50 records) inside loops to balance disk I/O performance with incremental crash resilience.
```
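The 2025-05-27 entry describes a pattern this commit's diff does not show directly: reusing one connection across nested operations. As a rough illustration only (the `save_tags` function, `tags` table, and `tag_rows` argument are hypothetical names, not code from this repository):

```python
import sqlite3
from typing import Iterable, Optional

def save_tags(db_path: str, tag_rows: Iterable[tuple],
              db_connection: Optional[sqlite3.Connection] = None) -> None:
    """Insert many (file_id, tag) rows in one batch, reusing the caller's
    connection when one is supplied instead of opening a fresh one."""
    conn = db_connection or sqlite3.connect(db_path)
    try:
        # One executemany call replaces N individual execute() round trips.
        conn.executemany(
            "INSERT OR REPLACE INTO tags (file_id, tag) VALUES (?, ?)",
            list(tag_rows),
        )
        conn.commit()
    finally:
        # Close only connections this function opened itself.
        if db_connection is None:
            conn.close()
```

The two 2025-05-15 entries are implemented directly in `batch_processor.py` below.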

batch_processor.py

Lines changed: 50 additions & 4 deletions
```diff
@@ -330,15 +330,49 @@ def _process_sequential(self, job: BatchJob) -> List[FileResult]:
 
             self.current_progress.questions_asked += result.questions_asked
 
-            # Save result to database
-            self._save_file_result(job.job_id, result)
+            # Result saving is now deferred to batch
 
+            # Batch save periodically to prevent data loss on crash
+            if i > 0 and i % 50 == 0:
+                self._save_file_results_batch(job.job_id, results[-50:])
+
             # Show progress periodically
             if i % 10 == 0 or i == len(job.files) - 1:
                 self._show_progress()
 
+        # Save remaining results that weren't caught in the periodic batch saves
+        remainder = len(results) % 50
+        if results and remainder > 0:
+            self._save_file_results_batch(job.job_id, results[-remainder:])
+        elif results and len(results) < 50:
+            self._save_file_results_batch(job.job_id, results)
+
         return results
 
+    def _save_file_results_batch(self, job_id: str, results: List[FileResult]):
+        """Save multiple file processing results to database efficiently"""
+        if not results:
+            return
+
+        with sqlite3.connect(self.db_path) as conn:
+            params = [
+                (
+                    job_id, str(r.file_path), r.success, r.action_taken,
+                    r.error_message, r.classification, r.confidence,
+                    r.processing_time, r.questions_asked, datetime.now().isoformat()
+                )
+                for r in results
+            ]
+            conn.executemany("""
+                INSERT INTO file_results
+                (job_id, file_path, success, action_taken, error_message,
+                 classification, confidence, processing_time, questions_asked, processed_at)
+                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+            """, params)
+            conn.commit()
+
     def _process_parallel(self, job: BatchJob, max_workers: int) -> List[FileResult]:
         """Process files in parallel (for automatic modes only)"""
         results = []
```
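As a worked example of the remainder logic: with 132 files, the sequential loop flushes at i = 50 and i = 100, covering the first 100 results (assuming each iteration's result is appended to `results` after the periodic check runs for its index, which the hunk does not show), and the final flush saves the last 132 % 50 = 32. When the total is an exact multiple of 50, `remainder` is 0 and the periodic flushes have already covered everything.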
```diff
@@ -374,9 +408,13 @@ def _process_parallel(self, job: BatchJob, max_workers: int) -> List[FileResult]
                 if result.action_taken == "skipped":
                     self.current_progress.skipped += 1
 
-                # Save result
-                self._save_file_result(job.job_id, result)
+                # Result saving is now deferred to batch
 
+                # Batch save periodically to prevent data loss on crash
+                if self.current_progress.processed % 50 == 0:
+                    # Save the last 50 results
+                    self._save_file_results_batch(job.job_id, results[-50:])
+
                 # Update progress display
                 if self.current_progress.processed % 10 == 0:
                     self._show_progress()
```
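In the parallel path the flush keys off the shared `processed` counter rather than a loop index, since results arrive in completion order. A minimal sketch of the surrounding structure, which the hunk omits (`run_parallel`, `process_one`, and `flush` are hypothetical stand-ins, not this file's API):

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

def run_parallel(files, process_one, flush, max_workers=4, chunk=50):
    """Collect results as futures complete, flushing every `chunk` results."""
    results, processed = [], 0
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_one, f) for f in files]
        for future in as_completed(futures):
            results.append(future.result())
            processed += 1
            if processed % chunk == 0:
                flush(results[-chunk:])  # exactly the newest unsaved chunk
    remainder = len(results) % chunk
    if remainder:
        flush(results[-remainder:])  # whatever the periodic flushes missed
    return results
```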
```diff
@@ -396,6 +434,14 @@ def _process_parallel(self, job: BatchJob, max_workers: int) -> List[FileResult]
                     self.current_progress.processed += 1
                     self.current_progress.failed += 1
 
+        # Save remaining results that weren't caught in the periodic batch saves
+        remainder = len(results) % 50
+        if results and remainder > 0:
+            self._save_file_results_batch(job.job_id, results[-remainder:])
+        elif results and len(results) < 50:
+            self._save_file_results_batch(job.job_id, results)
+
         return results
 
     def _process_single_file(self, file_path: Path, job: BatchJob) -> FileResult:
```
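The ~2x speedup claimed in `.jules/bolt.md` is easy to sanity-check. A rough, hypothetical micro-benchmark (not part of this commit; the table is a simplified stand-in for `file_results`) contrasting the old per-row commit pattern with the batched one:

```python
import os
import sqlite3
import tempfile
import time

ROWS = [("job-1", f"/tmp/file_{i}.txt", 1) for i in range(2000)]

def new_db() -> str:
    # Create a file-backed DB so each commit pays a real journal sync.
    fd, path = tempfile.mkstemp(suffix=".db")
    os.close(fd)
    conn = sqlite3.connect(path)
    conn.execute("CREATE TABLE file_results (job_id TEXT, file_path TEXT, success INTEGER)")
    conn.commit()
    conn.close()
    return path

def per_row(path: str) -> None:
    # Old shape: one INSERT and one commit per file (the N+1 pattern).
    conn = sqlite3.connect(path)
    for row in ROWS:
        conn.execute("INSERT INTO file_results VALUES (?, ?, ?)", row)
        conn.commit()
    conn.close()

def batched(path: str) -> None:
    # New shape: one executemany and one commit for the whole batch.
    conn = sqlite3.connect(path)
    conn.executemany("INSERT INTO file_results VALUES (?, ?, ?)", ROWS)
    conn.commit()
    conn.close()

for fn in (per_row, batched):
    path = new_db()
    start = time.perf_counter()
    fn(path)
    print(f"{fn.__name__}: {time.perf_counter() - start:.3f}s")
    os.remove(path)
```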
