Skip to content

Commit 96d6387

Browse files
committed
chore(refactor): make it prominent that we store even non-indexable content
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
1 parent cc8b502 commit 96d6387

File tree

1 file changed

+24
-5
lines changed

1 file changed

+24
-5
lines changed

rag/persistency.go

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -295,14 +295,20 @@ func (db *PersistentKB) storeFile(entry string, metadata map[string]string) erro
295295
return fmt.Errorf("failed to copy file: %w", err)
296296
}
297297

298+
// Files whose content cannot be text-extracted (audio, images, etc.) are
299+
// stored as "raw-only" entries: the binary is kept in assetDir and the
300+
// filename is indexed so it appears in ListDocuments() and can be served
301+
// via GetEntryFilePath(), but no semantic chunks are created.
302+
if !isChunkableFile(fileName) {
303+
xlog.Info("Storing as raw-only entry (not semantically indexed)", "entry", entry, "fileName", fileName)
304+
db.index[fileName] = nil
305+
return db.save()
306+
}
307+
298308
beforeCount := db.Engine.Count()
299309
results, err := db.store(metadata, fileName)
300310
if err != nil {
301-
// File is already copied to assetDir. Index it with no chunks so it
302-
// still appears in ListDocuments and can be served via GetEntryFilePath.
303-
xlog.Warn("Chunking failed, storing file without chunks", "entry", entry, "error", err)
304-
db.index[fileName] = nil
305-
return db.save()
311+
return fmt.Errorf("failed to store file: %w", err)
306312
}
307313
afterCount := db.Engine.Count()
308314
xlog.Info("Stored file", "entry", entry, "fileName", fileName, "results_count", len(results), "count_before", beforeCount, "count_after", afterCount, "added_count", afterCount-beforeCount)
@@ -475,6 +481,19 @@ func copyFile(src, dst string) error {
475481
return nil
476482
}
477483

484+
// isChunkableFile reports whether the file type supports text extraction and
485+
// semantic chunking. Files that return false are stored as "raw-only" entries:
486+
// they are kept on disk and indexed so they appear in ListDocuments() and can
487+
// be served via GetEntryFilePath(), but they have no semantic chunks and will
488+
// not appear in search results.
489+
func isChunkableFile(path string) bool {
	// Normalise the extension so the comparison is case-insensitive
	// (e.g. "REPORT.PDF" is handled the same as "report.pdf").
	ext := strings.ToLower(filepath.Ext(path))

	// Only these extensions count as chunkable; any other extension, or no
	// extension at all, yields false.
	return ext == ".pdf" || ext == ".txt" || ext == ".md"
}
496+
478497
// fileToText extracts the full text from a stored file (same logic as chunkFile but no splitting).
479498
// Used by GetEntryFileContent to return content without chunk overlap.
480499
func fileToText(fpath string) (string, error) {

0 commit comments

Comments
 (0)