Skip to content

Commit 32ca1d9

Browse files
committed
Add idempotent indexing with file hash tracking to skip unchanged files, update modified files, and support force reindex, including updated mock helpers and test coverage.
1 parent 88ade74 commit 32ca1d9

File tree

7 files changed

+876
-48
lines changed

7 files changed

+876
-48
lines changed

lib/ragnar/cli.rb

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ class CLI < Thor
4343
option :chunk_size, type: :numeric, desc: "Chunk size in tokens (default from config)"
4444
option :chunk_overlap, type: :numeric, desc: "Chunk overlap in tokens (default from config)"
4545
option :model, type: :string, desc: "Embedding model to use (default from config)"
46+
option :force, type: :boolean, default: false, aliases: "-f", desc: "Force reindex all files even if unchanged"
4647
def index(path)
4748
# Expand user paths (handle ~ in user input)
4849
expanded_path = File.expand_path(path)
@@ -76,11 +77,39 @@ def index(path)
7677
)
7778

7879
begin
79-
stats = indexer.index_path(expanded_path)
80+
stats = indexer.index_path(expanded_path, force: options[:force])
81+
8082
say "\nIndexing complete!", :green
81-
say "Files processed: #{stats[:files_processed]}"
82-
say "Chunks created: #{stats[:chunks_created]}"
83-
say "Errors: #{stats[:errors]}" if stats[:errors] > 0
83+
84+
# Display detailed stats
85+
if stats[:files_skipped] > 0
86+
say " Files skipped (unchanged): #{stats[:files_skipped]}", :cyan
87+
end
88+
89+
if stats[:files_processed] > 0
90+
say " New files indexed: #{stats[:files_processed]}", :green
91+
end
92+
93+
if stats[:files_updated] > 0
94+
say " Files updated: #{stats[:files_updated]}", :yellow
95+
if stats[:chunks_deleted] > 0 || stats[:chunks_created] > 0
96+
say " Chunks: -#{stats[:chunks_deleted]} removed, +#{stats[:chunks_created]} added", :yellow
97+
end
98+
elsif stats[:chunks_created] > 0
99+
say " Chunks created: #{stats[:chunks_created]}", :green
100+
end
101+
102+
if stats[:errors] > 0
103+
say " Errors: #{stats[:errors]}", :red
104+
end
105+
106+
# Summary
107+
total_files = stats[:files_processed] + stats[:files_skipped] + stats[:files_updated]
108+
say "\nSummary: #{total_files} files scanned", :cyan
109+
110+
if options[:force]
111+
say " (Force reindex was enabled)", :yellow
112+
end
84113
rescue => e
85114
say "Error during indexing: #{e.message}", :red
86115
exit 1

lib/ragnar/database.rb

Lines changed: 164 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
require 'fileutils'
require 'time' # provides Time#iso8601, used by upsert_file_metadata
2+
13
module Ragnar
24
class Database
35
attr_reader :db_path, :table_name
@@ -6,6 +8,7 @@ def initialize(db_path, table_name: "documents")
68
@db_path = db_path
79
@table_name = table_name
810
@dataset_cache = nil # Cache to prevent file descriptor leaks
11+
@file_metadata_cache = nil # Cache for file metadata dataset
912
ensure_database_exists
1013
end
1114

@@ -251,6 +254,143 @@ def get_all_documents_with_embeddings(limit: nil)
251254
all_docs.select { |doc| doc[:embedding] && !doc[:embedding].empty? }
252255
end
253256

257+
# File metadata tracking methods for idempotent indexing
258+
259+
# Look up the stored indexing metadata for a single file.
#
# @param file_path [String] the path of the file
# @return [Hash, nil] the metadata row, or nil when none is recorded
def get_file_metadata(file_path)
  return nil unless file_metadata_exists?

  rows = cached_file_metadata_dataset&.to_a
  rows&.find { |row| row[:file_path] == file_path }
end
270+
271+
# Store or update file metadata.
#
# Records the content hash and chunk count for a file so later indexing
# runs can detect unchanged files and skip them. Any existing row for the
# same path is replaced.
#
# NOTE(review): Lancelot appears to offer no row-level update/delete here,
# so the whole metadata dataset is rewritten on every upsert — O(total
# files) per call and not crash-safe (a failure between rm_rf and the
# re-add loses all metadata). Acceptable for small corpora; revisit if
# the tracked file count grows.
#
# @param file_path [String] the path of the indexed file
# @param file_hash [String] content hash used for change detection
# @param chunk_count [Integer] number of chunks created for the file
# @return [void]
def upsert_file_metadata(file_path, file_hash, chunk_count)
  metadata = {
    file_path: file_path,
    file_hash: file_hash,
    chunk_count: chunk_count,
    indexed_at: Time.now.iso8601 # needs `require 'time'` (stdlib extension)
  }

  # Schema for the file metadata table (must stay in sync with
  # create_file_metadata_table).
  schema = {
    file_path: :string,
    file_hash: :string,
    chunk_count: :int64,
    indexed_at: :string
  }

  # Drop any cached handle before mutating the dataset on disk.
  clear_file_metadata_cache

  if file_metadata_exists?
    # Keep every row except the one being replaced, then append the new row.
    existing_rows = Lancelot::Dataset.open(file_metadata_path).to_a
    rows = existing_rows.reject { |row| row[:file_path] == file_path }
    rows << metadata

    # No row-level delete available, so rebuild the dataset from scratch.
    # (rm_rf is a no-op on a missing path, so no exist? guard is needed.)
    FileUtils.rm_rf(file_metadata_path)
    new_dataset = Lancelot::Dataset.open_or_create(file_metadata_path, schema: schema)
    new_dataset.add_documents(rows)
  else
    # First metadata row: create the dataset and seed it.
    dataset = Lancelot::Dataset.open_or_create(file_metadata_path, schema: schema)
    dataset.add_documents([metadata])
  end

  # Invalidate again so the next reader reopens the fresh dataset.
  clear_file_metadata_cache
end
313+
314+
# Delete all chunks associated with a file.
#
# Lancelot exposes no row-level delete here, so this loads every document,
# drops the ones belonging to file_path, and rebuilds the dataset from the
# survivors. O(total chunks) per call and not crash-safe: a failure between
# rm_rf and add_documents loses the whole table — TODO confirm this
# trade-off is acceptable for expected corpus sizes.
#
# @param file_path [String] The path of the file
# @return [Integer] Number of chunks deleted
def delete_chunks_by_file(file_path)
  return 0 unless dataset_exists?

  dataset = cached_dataset
  return 0 unless dataset

  # Get all documents and filter out the ones from this file
  all_docs = dataset.to_a
  chunks_to_keep = all_docs.reject { |doc| doc[:file_path] == file_path }
  deleted_count = all_docs.size - chunks_to_keep.size

  if deleted_count > 0
    # Clear cache before modifying
    clear_dataset_cache

    # Recreate the dataset without the deleted chunks
    if chunks_to_keep.empty?
      # If no chunks left, just remove the dataset
      FileUtils.rm_rf(@db_path) if File.exist?(@db_path)
    else
      # Get schema from first remaining document. NOTE(review): assumes all
      # surviving documents share this document's shape — verify uniform
      # embedding dimensions across the table.
      first_doc = chunks_to_keep.first
      # 768 is a fallback when the sample has no embedding — presumably the
      # default model's dimension; confirm against the embedder config.
      embedding_size = first_doc[:embedding]&.size || 768
      reduced_size = first_doc[:reduced_embedding]&.size

      # NOTE(review): schema is hard-coded here rather than read from the
      # dataset; any column not listed below is silently dropped on rebuild.
      schema = {
        id: :string,
        chunk_text: :string,
        file_path: :string,
        chunk_index: :int64,
        embedding: { type: "vector", dimension: embedding_size },
        metadata: :string
      }

      # Add reduced_embedding if present
      if reduced_size
        schema[:reduced_embedding] = { type: "vector", dimension: reduced_size }
      end

      # Recreate dataset
      FileUtils.rm_rf(@db_path) if File.exist?(@db_path)
      new_dataset = Lancelot::Dataset.open_or_create(@db_path, schema: schema)
      new_dataset.add_documents(chunks_to_keep)
    end

    # Clear cache after modification
    clear_dataset_cache
  end

  deleted_count
end
368+
369+
# Create the file-metadata table used for idempotent-indexing bookkeeping.
# Safe to call when the table already exists (open_or_create semantics).
def create_file_metadata_table
  columns = {
    file_path: :string,
    file_hash: :string,
    chunk_count: :int64,
    indexed_at: :string
  }

  Lancelot::Dataset.open_or_create(file_metadata_path, schema: columns)
  clear_file_metadata_cache
end
381+
382+
# Check if a table exists (for migration purposes).
# @param table_name [String] name of the table to check
# @return [Boolean] true if the named table exists
def table_exists?(table_name)
  return file_metadata_exists? if table_name == "file_metadata"

  # Any other name refers to the main documents dataset.
  dataset_exists?
end
393+
254394
def full_text_search(query, limit: 10)
255395
return [] unless dataset_exists?
256396

@@ -330,8 +470,30 @@ def ensure_database_exists
330470
# Don't create directory - Lance will handle this
331471
end
332472

333-
def table_exists?
334-
dataset_exists?
473+
# Path to the file-metadata dataset: a sibling directory named
# "file_metadata" next to the main database path.
# @return [String]
def file_metadata_path
  base_dir = File.dirname(@db_path)
  File.join(base_dir, "file_metadata")
end
477+
478+
# Whether the file-metadata dataset has been created on disk.
# @return [Boolean]
def file_metadata_exists?
  path = file_metadata_path
  File.exist?(path)
end
482+
483+
# Memoized handle on the file-metadata dataset, or nil when the dataset is
# absent or cannot be opened. Caching one handle mirrors cached_dataset and
# avoids leaking a file descriptor per call.
#
# A failed open is swallowed (best-effort) and yields nil; because `||=`
# does not memoize nil, the open is retried on the next call.
#
# @return [Lancelot::Dataset, nil]
def cached_file_metadata_dataset
  return nil unless File.exist?(file_metadata_path)

  @file_metadata_cache ||= begin
    Lancelot::Dataset.open(file_metadata_path)
  rescue StandardError
    # Treat an unreadable/corrupt dataset as missing rather than raising.
    nil
  end
end
493+
494+
# Drop the memoized file-metadata dataset handle so the next access
# reopens it from disk; call around any on-disk mutation of the table.
def clear_file_metadata_cache
  @file_metadata_cache = nil
end
336498

337499
# Cached dataset accessor to prevent file descriptor leaks

0 commit comments

Comments
 (0)