Skip to content

Commit 7e4b5f3

Browse files
committed
fix: correctly handle UTF-8 and invalid chars
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
1 parent 330bcb2 commit 7e4b5f3

File tree

2 files changed

+9
-3
lines changed

2 files changed

+9
-3
lines changed

rag/engine/postgres.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -481,13 +481,16 @@ func (p *PostgresDB) StoreDocuments(s []string, metadata map[string]string) ([]R
481481

482482
// Insert documents
483483
for i, content := range s {
484+
// Sanitize content to prevent invalid UTF-8 from reaching PostgreSQL
485+
content = strings.ToValidUTF8(content, " ")
486+
484487
embedding := resp.Data[i].Embedding
485488
embeddingStr := formatVector(embedding)
486489

487490
// Extract title from metadata if available
488-
title := metadata["title"]
491+
title := strings.ToValidUTF8(metadata["title"], " ")
489492
if title == "" {
490-
title = metadata["source"]
493+
title = strings.ToValidUTF8(metadata["source"], " ")
491494
}
492495

493496
// Calculate word count

rag/persistency.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"path/filepath"
1010

1111
"os"
12+
"strings"
1213
"sync"
1314

1415
"github.com/dslipak/pdf"
@@ -473,7 +474,9 @@ func fileToText(fpath string) (string, error) {
473474
return "", err
474475
}
475476
buf.ReadFrom(b)
476-
return buf.String(), nil
477+
// PDF extraction can produce invalid UTF-8 byte sequences that PostgreSQL rejects.
478+
// Sanitize by replacing each run of invalid bytes with a single space.
479+
return strings.ToValidUTF8(buf.String(), " "), nil
477480
case ".txt", ".md":
478481
f, err := os.Open(fpath)
479482
if err != nil {

0 commit comments

Comments
 (0)