33from pathlib import Path
44from typing import Any , Optional
55
6+ from sqlite_rag .extractor import Extractor
67from sqlite_rag .logger import Logger
78from sqlite_rag .models .document_result import DocumentResult
89
@@ -25,6 +26,7 @@ def __init__(self, connection: sqlite3.Connection, settings: Settings):
2526 self ._repository = Repository (self ._conn , settings )
2627 self ._chunker = Chunker (self ._conn , settings )
2728 self ._engine = Engine (self ._conn , settings , chunker = self ._chunker )
29+ self ._extractor = Extractor ()
2830
2931 self .ready = False
3032
@@ -103,7 +105,16 @@ def add(
103105 if use_relative_paths
104106 else str (file_path .absolute ())
105107 )
106- document = Document (content = content , uri = uri , metadata = metadata .copy ())
108+
109+ content , file_metadata = self ._extractor .extract_metadata (
110+ content , file_path
111+ )
112+
113+ merged_metadata = metadata .copy ()
114+ if file_metadata :
115+ merged_metadata ["extracted" ] = file_metadata
116+
117+ document = Document (content = content , uri = uri , metadata = merged_metadata )
107118
108119 exists = self ._repository .document_exists_by_hash (document .hash ())
109120 if exists :
@@ -187,7 +198,21 @@ def rebuild(self, remove_missing: bool = False) -> dict:
187198 content = FileReader .parse_file (
188199 Path (doc .uri ), self ._settings .max_document_size_bytes
189200 )
201+
202+ if not content :
203+ self ._logger .warning (
204+ f"{ i + 1 } /{ total_docs } Skipping empty file: { doc .uri } "
205+ )
206+ not_found += 1
207+ continue
208+
209+ content , file_metadata = self ._extractor .extract_metadata (
210+ content , Path (doc .uri )
211+ )
212+
190213 doc .content = content
214+ if file_metadata :
215+ doc .metadata ["extracted" ] = file_metadata
191216
192217 self ._repository .remove_document (doc_id )
193218 processed_doc = self ._engine .process (doc )
0 commit comments