require 'fileutils'
require 'time'
2+
13module Ragnar
24 class Database
35 attr_reader :db_path , :table_name
@@ -6,6 +8,7 @@ def initialize(db_path, table_name: "documents")
68 @db_path = db_path
79 @table_name = table_name
810 @dataset_cache = nil # Cache to prevent file descriptor leaks
11+ @file_metadata_cache = nil # Cache for file metadata dataset
912 ensure_database_exists
1013 end
1114
@@ -251,6 +254,143 @@ def get_all_documents_with_embeddings(limit: nil)
251254 all_docs . select { |doc | doc [ :embedding ] && !doc [ :embedding ] . empty? }
252255 end
253256
257+ # File metadata tracking methods for idempotent indexing
258+
259+ # Get metadata for a specific file
260+ # @param file_path [String] The path of the file
261+ # @return [Hash, nil] File metadata or nil if not found
262+ def get_file_metadata ( file_path )
263+ return nil unless file_metadata_exists?
264+
265+ metadata_dataset = cached_file_metadata_dataset
266+ return nil unless metadata_dataset
267+
268+ metadata_dataset . to_a . find { |row | row [ :file_path ] == file_path }
269+ end
270+
271+ # Store or update file metadata
272+ # @param file_path [String] The path of the file
273+ # @param file_hash [String] Hash of the file contents
274+ # @param chunk_count [Integer] Number of chunks created
275+ def upsert_file_metadata ( file_path , file_hash , chunk_count )
276+ metadata = {
277+ file_path : file_path ,
278+ file_hash : file_hash ,
279+ chunk_count : chunk_count ,
280+ indexed_at : Time . now . iso8601
281+ }
282+
283+ # Define schema for file metadata table
284+ schema = {
285+ file_path : :string ,
286+ file_hash : :string ,
287+ chunk_count : :int64 ,
288+ indexed_at : :string
289+ }
290+
291+ # Clear cache before modifying
292+ clear_file_metadata_cache
293+
294+ if file_metadata_exists?
295+ # Update existing or add new
296+ metadata_dataset = Lancelot ::Dataset . open ( file_metadata_path )
297+ existing_data = metadata_dataset . to_a . reject { |row | row [ :file_path ] == file_path }
298+ existing_data << metadata
299+
300+ # Recreate dataset with updated data
301+ FileUtils . rm_rf ( file_metadata_path ) if File . exist? ( file_metadata_path )
302+ new_dataset = Lancelot ::Dataset . open_or_create ( file_metadata_path , schema : schema )
303+ new_dataset . add_documents ( existing_data ) unless existing_data . empty?
304+ else
305+ # Create new metadata dataset
306+ dataset = Lancelot ::Dataset . open_or_create ( file_metadata_path , schema : schema )
307+ dataset . add_documents ( [ metadata ] )
308+ end
309+
310+ # Clear cache after modification
311+ clear_file_metadata_cache
312+ end
313+
314+ # Delete all chunks associated with a file
315+ # @param file_path [String] The path of the file
316+ # @return [Integer] Number of chunks deleted
317+ def delete_chunks_by_file ( file_path )
318+ return 0 unless dataset_exists?
319+
320+ dataset = cached_dataset
321+ return 0 unless dataset
322+
323+ # Get all documents and filter out the ones from this file
324+ all_docs = dataset . to_a
325+ chunks_to_keep = all_docs . reject { |doc | doc [ :file_path ] == file_path }
326+ deleted_count = all_docs . size - chunks_to_keep . size
327+
328+ if deleted_count > 0
329+ # Clear cache before modifying
330+ clear_dataset_cache
331+
332+ # Recreate the dataset without the deleted chunks
333+ if chunks_to_keep . empty?
334+ # If no chunks left, just remove the dataset
335+ FileUtils . rm_rf ( @db_path ) if File . exist? ( @db_path )
336+ else
337+ # Get schema from first remaining document
338+ first_doc = chunks_to_keep . first
339+ embedding_size = first_doc [ :embedding ] &.size || 768
340+ reduced_size = first_doc [ :reduced_embedding ] &.size
341+
342+ schema = {
343+ id : :string ,
344+ chunk_text : :string ,
345+ file_path : :string ,
346+ chunk_index : :int64 ,
347+ embedding : { type : "vector" , dimension : embedding_size } ,
348+ metadata : :string
349+ }
350+
351+ # Add reduced_embedding if present
352+ if reduced_size
353+ schema [ :reduced_embedding ] = { type : "vector" , dimension : reduced_size }
354+ end
355+
356+ # Recreate dataset
357+ FileUtils . rm_rf ( @db_path ) if File . exist? ( @db_path )
358+ new_dataset = Lancelot ::Dataset . open_or_create ( @db_path , schema : schema )
359+ new_dataset . add_documents ( chunks_to_keep )
360+ end
361+
362+ # Clear cache after modification
363+ clear_dataset_cache
364+ end
365+
366+ deleted_count
367+ end
368+
369+ # Create the file metadata table
370+ def create_file_metadata_table
371+ schema = {
372+ file_path : :string ,
373+ file_hash : :string ,
374+ chunk_count : :int64 ,
375+ indexed_at : :string
376+ }
377+
378+ Lancelot ::Dataset . open_or_create ( file_metadata_path , schema : schema )
379+ clear_file_metadata_cache
380+ end
381+
382+ # Check if a table exists (for migration purposes)
383+ # @param table_name [String] Name of the table to check
384+ # @return [Boolean] true if table exists
385+ def table_exists? ( table_name )
386+ case table_name
387+ when "file_metadata"
388+ file_metadata_exists?
389+ else
390+ dataset_exists?
391+ end
392+ end
393+
254394 def full_text_search ( query , limit : 10 )
255395 return [ ] unless dataset_exists?
256396
@@ -330,8 +470,30 @@ def ensure_database_exists
330470 # Don't create directory - Lance will handle this
331471 end
332472
333- def table_exists?
334- dataset_exists?
473+ # Path to the file metadata dataset
474+ def file_metadata_path
475+ File . join ( File . dirname ( @db_path ) , "file_metadata" )
476+ end
477+
478+ # Check if file metadata dataset exists
479+ def file_metadata_exists?
480+ File . exist? ( file_metadata_path )
481+ end
482+
483+ # Cached file metadata dataset accessor
484+ def cached_file_metadata_dataset
485+ return nil unless File . exist? ( file_metadata_path )
486+
487+ @file_metadata_cache ||= begin
488+ Lancelot ::Dataset . open ( file_metadata_path )
489+ rescue => e
490+ nil
491+ end
492+ end
493+
494+ # Clear the cached file metadata dataset
495+ def clear_file_metadata_cache
496+ @file_metadata_cache = nil
335497 end
336498
337499 # Cached dataset accessor to prevent file descriptor leaks
0 commit comments