Skip to content

Commit a1a5d2e

Browse files
authored
feat: allow to fetch entry from collection / support overlap (#41)
* feat: allow to fetch entry from collection Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Add tests Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Attach to webui Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Add overlap support, return content from file Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
1 parent 947a2fa commit a1a5d2e

File tree

14 files changed

+603
-86
lines changed

14 files changed

+603
-86
lines changed

README.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,7 @@ LocalRecall uses environment variables to configure its behavior. These variable
202202
| `LISTENING_ADDRESS` | Address the server listens on (default: `:8080`). Useful for deployments on custom ports or network interfaces. |
203203
| `VECTOR_ENGINE` | Vector database engine to use (`chromem` by default, `postgres` for PostgreSQL). |
204204
| `MAX_CHUNKING_SIZE` | Maximum size (in characters) for breaking down documents into chunks. Affects performance and accuracy. |
205+
| `CHUNK_OVERLAP` | Overlap in characters between consecutive chunks (word-aligned). Default: 0. Use to improve context across chunk boundaries. |
205206
| `HYBRID_SEARCH_BM25_WEIGHT` | Weight for BM25 keyword search in hybrid search (default: 0.5, PostgreSQL only). |
206207
| `HYBRID_SEARCH_VECTOR_WEIGHT` | Weight for vector similarity search in hybrid search (default: 0.5, PostgreSQL only). |
207208
| `API_KEYS` | Comma-separated list of API keys for securing access to the REST API (optional). |
@@ -246,6 +247,14 @@ curl -X GET $BASE_URL/collections
246247
curl -X GET $BASE_URL/collections/myCollection/entries
247248
```
248249

250+
- **Get Entry Content**:
251+
252+
```sh
253+
curl -X GET $BASE_URL/collections/myCollection/entries/file.txt
254+
```
255+
256+
Returns `collection`, `entry`, `chunks` (array of `id`, `content`, `metadata`), and `count`.
257+
249258
- **Search Collection**:
250259

251260
```sh

main.go

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ var (
2121
listeningAddress = os.Getenv("LISTENING_ADDRESS")
2222
vectorEngine = os.Getenv("VECTOR_ENGINE")
2323
maxChunkingSize = os.Getenv("MAX_CHUNKING_SIZE")
24+
chunkOverlap = os.Getenv("CHUNK_OVERLAP")
2425
apiKeys = os.Getenv("API_KEYS")
2526
gitPrivateKey = os.Getenv("GIT_PRIVATE_KEY")
2627
sourceManager = rag.NewSourceManager(&sources.Config{
@@ -77,7 +78,16 @@ func startAPI(listenAddress string) {
7778
}
7879
}
7980

80-
registerAPIRoutes(e, openAIClient, chunkingSize, keys)
81+
overlap := 0
82+
if chunkOverlap != "" {
83+
var err error
84+
overlap, err = strconv.Atoi(chunkOverlap)
85+
if err != nil {
86+
e.Logger.Fatal("Failed to convert CHUNK_OVERLAP to integer")
87+
}
88+
}
89+
90+
registerAPIRoutes(e, openAIClient, chunkingSize, overlap, keys)
8191

8292
e.Logger.Fatal(e.Start(listenAddress))
8393
}

pkg/chunk/chunk_test.go

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
package chunk_test
22

33
import (
4+
"strings"
5+
46
. "github.com/mudler/localrecall/pkg/chunk"
57
. "github.com/onsi/ginkgo/v2"
68
. "github.com/onsi/gomega"
@@ -45,5 +47,83 @@ var _ = Describe("Chunk", func() {
4547
chunks := SplitParagraphIntoChunks(text, 30)
4648
Expect(chunks).ToNot(BeEmpty())
4749
})
50+
51+
It("should split words longer than maxChunkSize into chunks each <= maxChunkSize", func() {
52+
text := "normal verylongwordhere end"
53+
chunks := SplitParagraphIntoChunks(text, 5)
54+
Expect(chunks).ToNot(BeEmpty())
55+
for _, c := range chunks {
56+
Expect(len(c)).To(BeNumerically("<=", 5))
57+
}
58+
// "verylongwordhere" (16 chars) with max 5 -> 4 chunks of 5,5,5,1
59+
Expect(chunks).To(ContainElement("veryl"))
60+
Expect(chunks).To(ContainElement("ongwo"))
61+
Expect(chunks).To(ContainElement("rdher"))
62+
Expect(chunks).To(ContainElement("e"))
63+
})
64+
65+
It("backward compatibility: SplitParagraphIntoChunks matches Options with Overlap 0", func() {
66+
text := "This is a very long text that should be split into multiple chunks."
67+
chunksLegacy := SplitParagraphIntoChunks(text, 20)
68+
chunksOpts := SplitParagraphIntoChunksWithOptions(text, Options{MaxSize: 20, Overlap: 0, SplitLongWords: true})
69+
Expect(chunksLegacy).To(Equal(chunksOpts))
70+
})
71+
})
72+
73+
Describe("SplitParagraphIntoChunksWithOptions", func() {
74+
It("should apply overlap between consecutive chunks", func() {
75+
text := "one two three four five six seven eight nine ten"
76+
chunks := SplitParagraphIntoChunksWithOptions(text, Options{MaxSize: 20, Overlap: 10})
77+
Expect(chunks).ToNot(BeEmpty())
78+
for _, c := range chunks {
79+
Expect(len(c)).To(BeNumerically("<=", 20))
80+
}
81+
// Consecutive chunks should share a suffix/prefix
82+
for i := 0; i < len(chunks)-1; i++ {
83+
tail := chunks[i]
84+
head := chunks[i+1]
85+
// Some overlap: tail of chunk i should appear at start of chunk i+1
86+
found := false
87+
for n := 1; n <= len(tail) && n <= len(head); n++ {
88+
if tail[len(tail)-n:] == head[:n] {
89+
found = true
90+
break
91+
}
92+
}
93+
// Or head starts with last words of tail (word-aligned)
94+
wordsTail := strings.Fields(tail)
95+
if len(wordsTail) > 0 {
96+
lastWord := wordsTail[len(wordsTail)-1]
97+
if strings.HasPrefix(head, lastWord) || head == lastWord {
98+
found = true
99+
}
100+
}
101+
Expect(found).To(BeTrue(), "chunk %d and %d should share overlap", i, i+1)
102+
}
103+
})
104+
105+
It("Overlap 0 matches no overlap", func() {
106+
text := "a b c d e f g h i j k l m n o p"
107+
chunksOverlap := SplitParagraphIntoChunksWithOptions(text, Options{MaxSize: 10, Overlap: 0})
108+
chunksLegacy := SplitParagraphIntoChunks(text, 10)
109+
Expect(chunksOverlap).To(Equal(chunksLegacy))
110+
})
111+
112+
It("SplitLongWords false allows a single word to exceed MaxSize as one chunk", func() {
113+
text := "short verylongwordhere end"
114+
chunks := SplitParagraphIntoChunksWithOptions(text, Options{MaxSize: 5, Overlap: 0, SplitLongWords: false})
115+
Expect(chunks).To(ContainElement("verylongwordhere"))
116+
Expect(chunks).To(ContainElement("short"))
117+
Expect(chunks).To(ContainElement("end"))
118+
})
119+
120+
It("clamps Overlap >= MaxSize to MaxSize-1", func() {
121+
text := "one two three four five"
122+
chunks := SplitParagraphIntoChunksWithOptions(text, Options{MaxSize: 10, Overlap: 99})
123+
Expect(chunks).ToNot(BeEmpty())
124+
for _, c := range chunks {
125+
Expect(len(c)).To(BeNumerically("<=", 10))
126+
}
127+
})
48128
})
49129
})

pkg/chunk/chunking.go

Lines changed: 165 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -4,43 +4,188 @@ import (
44
"strings"
55
)
66

7-
// SplitParagraphIntoChunks takes a paragraph and a maxChunkSize as input,
8-
// and returns a slice of strings where each string is a chunk of the paragraph
9-
// that is at most maxChunkSize long, ensuring that words are not split.
10-
func SplitParagraphIntoChunks(paragraph string, maxChunkSize int) []string {
11-
if len(paragraph) <= maxChunkSize {
12-
return []string{paragraph}
7+
// Options configures how SplitParagraphIntoChunksWithOptions splits a paragraph.
8+
type Options struct {
9+
// MaxSize is the maximum chunk length, measured with len (bytes). It should be > 0; values <= 0 are treated as 1.
10+
MaxSize int
11+
// Overlap is the overlap in characters between consecutive chunks, word-aligned (0 = no overlap).
12+
// It should be < MaxSize: negative values are treated as 0 and values >= MaxSize are clamped to MaxSize-1.
13+
Overlap int
14+
// SplitLongWords, when true, splits words longer than MaxSize into smaller chunks so no chunk exceeds MaxSize. The zero value is false, so callers building Options directly must set it explicitly; SplitParagraphIntoChunks sets it to true.
15+
SplitLongWords bool
16+
}
17+
18+
// splitLongString splits s into consecutive pieces of at most maxSize bytes.
//
// If maxSize <= 0, or s already fits within maxSize, s is returned unchanged
// as the only piece. Concatenating the returned pieces always yields s.
//
// Cut points are moved back to the nearest UTF-8 rune boundary so that a
// multi-byte character is not torn into two invalid halves. When a single
// rune is wider than maxSize no such boundary exists; in that case the cut
// falls back to a plain byte offset so that every piece still respects
// maxSize and the loop always makes progress.
func splitLongString(s string, maxSize int) []string {
	if maxSize <= 0 || len(s) <= maxSize {
		return []string{s}
	}

	pieces := make([]string, 0, (len(s)+maxSize-1)/maxSize)
	for len(s) > maxSize {
		cut := maxSize
		// Bytes of the form 10xxxxxx are UTF-8 continuation bytes: cutting
		// right before one would split a character, so step back.
		for cut > 0 && s[cut]&0xC0 == 0x80 {
			cut--
		}
		if cut == 0 {
			// The leading rune alone exceeds maxSize (or the data is not
			// valid UTF-8): keep the size guarantee with a raw byte cut.
			cut = maxSize
		}
		pieces = append(pieces, s[:cut])
		s = s[cut:]
	}
	// The remainder is non-empty here because every cut is shorter than s.
	return append(pieces, s)
}
1435

36+
// overlapTail returns the longest word-aligned suffix of chunk whose length
// (words joined by single spaces, measured with len) does not exceed overlap.
//
// The chunk is tokenized with strings.Fields, so runs of whitespace collapse
// to one space in the result. It returns "" when overlap <= 0, when chunk is
// empty or contains only whitespace, or when even the last word alone is
// longer than overlap.
func overlapTail(chunk string, overlap int) string {
	if overlap <= 0 || chunk == "" {
		return ""
	}

	words := strings.Fields(chunk)

	// Walk backwards and only remember where the tail starts; this avoids
	// re-allocating a slice to prepend each accepted word.
	start := len(words)
	length := 0
	for start > 0 {
		addLen := len(words[start-1])
		if start < len(words) {
			addLen++ // space separating this word from the tail collected so far
		}
		if length+addLen > overlap {
			break
		}
		length += addLen
		start--
	}
	return strings.Join(words[start:], " ")
}
63+
64+
// SplitParagraphIntoChunksWithOptions splits a paragraph into chunks according to opts.
65+
// Chunks are word-boundary aligned; consecutive chunks may overlap by opts.Overlap characters (word-aligned).
66+
// Words longer than opts.MaxSize are split into smaller chunks when opts.SplitLongWords is true.
67+
func SplitParagraphIntoChunksWithOptions(paragraph string, opts Options) []string {
68+
maxSize := opts.MaxSize
69+
if maxSize <= 0 {
70+
maxSize = 1
71+
}
72+
overlap := opts.Overlap
73+
if overlap >= maxSize {
74+
overlap = maxSize - 1
75+
}
76+
if overlap < 0 {
77+
overlap = 0
78+
}
79+
splitLongWords := opts.SplitLongWords
80+
81+
// Empty or single-chunk within limit (no overlap needed)
82+
if paragraph == "" {
83+
return []string{""}
84+
}
85+
if len(paragraph) <= maxSize && overlap == 0 {
86+
words := strings.Fields(paragraph)
87+
needSplit := false
88+
for _, w := range words {
89+
if len(w) > maxSize && splitLongWords {
90+
needSplit = true
91+
break
92+
}
93+
}
94+
if !needSplit {
95+
return []string{paragraph}
96+
}
97+
}
98+
99+
words := strings.Fields(paragraph)
15100
var chunks []string
16101
var currentChunk strings.Builder
17-
18-
words := strings.Fields(paragraph) // Splits the paragraph into words.
102+
var overlapPrefix string // word-aligned prefix for next chunk (from previous chunk's tail)
19103

20104
for _, word := range words {
21-
// If adding the next word would exceed maxChunkSize (considering a space if not the first word in a chunk),
22-
// add the currentChunk to chunks, and reset currentChunk.
23-
if currentChunk.Len() > 0 && currentChunk.Len()+len(word)+1 > maxChunkSize { // +1 for the space if not the first word
24-
chunks = append(chunks, currentChunk.String())
25-
currentChunk.Reset()
26-
} else if currentChunk.Len() == 0 && len(word) > maxChunkSize { // Word itself exceeds maxChunkSize, split the word
27-
chunks = append(chunks, word)
105+
// Long word: split into pieces when SplitLongWords is true
106+
if len(word) > maxSize && splitLongWords {
107+
// Flush current chunk first
108+
if currentChunk.Len() > 0 {
109+
chunks = append(chunks, currentChunk.String())
110+
if overlap > 0 {
111+
overlapPrefix = overlapTail(currentChunk.String(), overlap)
112+
} else {
113+
overlapPrefix = ""
114+
}
115+
currentChunk.Reset()
116+
}
117+
pieces := splitLongString(word, maxSize)
118+
for _, p := range pieces {
119+
chunks = append(chunks, p)
120+
if overlap > 0 {
121+
overlapPrefix = overlapTail(p, overlap)
122+
}
123+
}
28124
continue
29125
}
30126

31-
// Add a space before the word if it's not the beginning of a new chunk.
127+
// Normal word: compute length if we add this word
128+
var nextLen int
32129
if currentChunk.Len() > 0 {
33-
currentChunk.WriteString(" ")
130+
nextLen = currentChunk.Len() + 1 + len(word)
131+
} else if overlapPrefix != "" {
132+
nextLen = len(overlapPrefix) + 1 + len(word)
133+
} else {
134+
nextLen = len(word)
34135
}
35136

36-
// Add the word to the current chunk.
37-
currentChunk.WriteString(word)
137+
if nextLen > maxSize {
138+
// Flush current chunk
139+
if currentChunk.Len() > 0 {
140+
chunks = append(chunks, currentChunk.String())
141+
if overlap > 0 {
142+
overlapPrefix = overlapTail(currentChunk.String(), overlap)
143+
} else {
144+
overlapPrefix = ""
145+
}
146+
currentChunk.Reset()
147+
}
148+
// Start new chunk with overlap prefix only if it fits with the word
149+
if overlapPrefix != "" && len(overlapPrefix)+1+len(word) <= maxSize {
150+
currentChunk.WriteString(overlapPrefix)
151+
currentChunk.WriteString(" ")
152+
currentChunk.WriteString(word)
153+
overlapPrefix = ""
154+
} else {
155+
currentChunk.WriteString(word)
156+
overlapPrefix = ""
157+
}
158+
} else {
159+
if currentChunk.Len() == 0 && overlapPrefix != "" {
160+
currentChunk.WriteString(overlapPrefix)
161+
currentChunk.WriteString(" ")
162+
currentChunk.WriteString(word)
163+
overlapPrefix = ""
164+
} else if currentChunk.Len() > 0 {
165+
currentChunk.WriteString(" ")
166+
currentChunk.WriteString(word)
167+
} else {
168+
currentChunk.WriteString(word)
169+
}
170+
}
38171
}
39172

40-
// After the loop, add any remaining content in currentChunk to chunks.
41173
if currentChunk.Len() > 0 {
42174
chunks = append(chunks, currentChunk.String())
43175
}
44176

45177
return chunks
46178
}
179+
180+
// SplitParagraphIntoChunks takes a paragraph and a maxChunkSize as input,
181+
// and returns a slice of strings where each string is a chunk of the paragraph
182+
// that is at most maxChunkSize long, ensuring that words are not split.
183+
// Words longer than maxChunkSize are split into smaller chunks.
184+
// For overlap and other options, use SplitParagraphIntoChunksWithOptions.
185+
func SplitParagraphIntoChunks(paragraph string, maxChunkSize int) []string {
186+
return SplitParagraphIntoChunksWithOptions(paragraph, Options{
187+
MaxSize: maxChunkSize,
188+
Overlap: 0,
189+
SplitLongWords: true,
190+
})
191+
}

0 commit comments

Comments
 (0)