@@ -295,14 +295,20 @@ func (db *PersistentKB) storeFile(entry string, metadata map[string]string) erro
295295 return fmt .Errorf ("failed to copy file: %w" , err )
296296 }
297297
298+ // Files whose content cannot be text-extracted (audio, images, etc.) are
299+ // stored as "raw-only" entries: the binary is kept in assetDir and the
300+ // filename is indexed so it appears in ListDocuments() and can be served
301+ // via GetEntryFilePath(), but no semantic chunks are created.
302+ if ! isChunkableFile (fileName ) {
303+ xlog .Info ("Storing as raw-only entry (not semantically indexed)" , "entry" , entry , "fileName" , fileName )
304+ db .index [fileName ] = nil
305+ return db .save ()
306+ }
307+
298308 beforeCount := db .Engine .Count ()
299309 results , err := db .store (metadata , fileName )
300310 if err != nil {
301- // File is already copied to assetDir. Index it with no chunks so it
302- // still appears in ListDocuments and can be served via GetEntryFilePath.
303- xlog .Warn ("Chunking failed, storing file without chunks" , "entry" , entry , "error" , err )
304- db .index [fileName ] = nil
305- return db .save ()
311+ return fmt .Errorf ("failed to store file: %w" , err )
306312 }
307313 afterCount := db .Engine .Count ()
308314 xlog .Info ("Stored file" , "entry" , entry , "fileName" , fileName , "results_count" , len (results ), "count_before" , beforeCount , "count_after" , afterCount , "added_count" , afterCount - beforeCount )
@@ -475,6 +481,19 @@ func copyFile(src, dst string) error {
475481 return nil
476482}
477483
// isChunkableFile tells the caller whether path names a file whose type
// supports text extraction and semantic chunking. The decision is based only
// on the file extension, compared case-insensitively; the file itself is
// never opened.
//
// A false result marks the file as "raw-only": storeFile keeps the copy on
// disk and records the filename in the index with no chunks, so the entry is
// still listed and can be served by path, but it has no semantic chunks.
func isChunkableFile(path string) bool {
	ext := strings.ToLower(filepath.Ext(path))
	return ext == ".pdf" || ext == ".txt" || ext == ".md"
}
496+
478497// fileToText extracts the full text from a stored file (same logic as chunkFile but no splitting).
479498// Used by GetEntryFileContent to return content without chunk overlap.
480499func fileToText (fpath string ) (string , error ) {
0 commit comments