@@ -4,43 +4,188 @@ import (
	"strings"
	"unicode/utf8"
55)
66
7- // SplitParagraphIntoChunks takes a paragraph and a maxChunkSize as input,
8- // and returns a slice of strings where each string is a chunk of the paragraph
9- // that is at most maxChunkSize long, ensuring that words are not split.
10- func SplitParagraphIntoChunks (paragraph string , maxChunkSize int ) []string {
11- if len (paragraph ) <= maxChunkSize {
12- return []string {paragraph }
// Options configures paragraph chunking.
//
// All sizes are measured with len, i.e. in bytes.
type Options struct {
	// MaxSize is the maximum length of a chunk.
	// SplitParagraphIntoChunksWithOptions treats values <= 0 as 1.
	MaxSize int
	// Overlap is the maximum length of the word-aligned tail of one chunk
	// that is repeated at the start of the next chunk; 0 disables overlap.
	// Negative values are treated as 0 and values >= MaxSize are clamped to
	// MaxSize-1.
	Overlap int
	// SplitLongWords, when true, cuts words longer than MaxSize into pieces
	// so that no chunk exceeds MaxSize. The zero value is false, in which
	// case such words are kept whole and their chunk may exceed MaxSize;
	// SplitParagraphIntoChunks sets this field to true.
	SplitLongWords bool
}
17+
// splitLongString cuts s into consecutive pieces of at most maxSize bytes
// whose concatenation equals s.
//
// Cuts are placed on UTF-8 rune boundaries so that a multi-byte character is
// never torn across two pieces. The only piece that may exceed maxSize is a
// single rune that is itself wider than maxSize. If s is not valid UTF-8 and
// no rune boundary is found near the limit, the cut falls back to the plain
// byte offset.
//
// When maxSize <= 0 or s already fits, the result is a one-element slice
// containing s unchanged.
func splitLongString(s string, maxSize int) []string {
	if maxSize <= 0 || len(s) <= maxSize {
		return []string{s}
	}
	pieces := make([]string, 0, (len(s)+maxSize-1)/maxSize)
	for len(s) > maxSize {
		cut := maxSize
		// If the limit lands inside a rune, step back to that rune's first
		// byte. A valid rune has at most UTFMax-1 continuation bytes.
		for back := 0; back < utf8.UTFMax-1 && cut > 0 && !utf8.RuneStart(s[cut]); back++ {
			cut--
		}
		switch {
		case cut == 0:
			// The leading rune is wider than maxSize: keep it whole rather
			// than emit invalid UTF-8. The decoded width is always >= 1, so
			// the loop makes progress.
			_, cut = utf8.DecodeRuneInString(s)
		case !utf8.RuneStart(s[cut]):
			// Malformed input with no nearby boundary: cut at the byte limit.
			cut = maxSize
		}
		pieces = append(pieces, s[:cut])
		s = s[cut:]
	}
	if len(s) > 0 {
		pieces = append(pieces, s)
	}
	return pieces
}
1435
// overlapTail returns the longest run of whole trailing words of chunk whose
// length, joined by single spaces, does not exceed overlap bytes.
//
// Words are delimited as by strings.Fields, so any whitespace run in chunk
// counts as one separator. The result is "" when overlap <= 0, when chunk
// contains no words, or when the last word alone is longer than overlap.
func overlapTail(chunk string, overlap int) string {
	if overlap <= 0 || chunk == "" {
		return ""
	}
	words := strings.Fields(chunk)

	// Walk backwards to find the first word of the tail instead of
	// prepending to a slice, which would copy the tail on every step.
	start := len(words)
	size := 0
	for start > 0 {
		need := len(words[start-1])
		if start < len(words) {
			need++ // space separating this word from the tail collected so far
		}
		if size+need > overlap {
			break
		}
		size += need
		start--
	}
	return strings.Join(words[start:], " ")
}
63+
64+ // SplitParagraphIntoChunksWithOptions splits a paragraph into chunks according to opts.
65+ // Chunks are word-boundary aligned; consecutive chunks may overlap by opts.Overlap characters (word-aligned).
66+ // Words longer than opts.MaxSize are split into smaller chunks when opts.SplitLongWords is true.
67+ func SplitParagraphIntoChunksWithOptions (paragraph string , opts Options ) []string {
68+ maxSize := opts .MaxSize
69+ if maxSize <= 0 {
70+ maxSize = 1
71+ }
72+ overlap := opts .Overlap
73+ if overlap >= maxSize {
74+ overlap = maxSize - 1
75+ }
76+ if overlap < 0 {
77+ overlap = 0
78+ }
79+ splitLongWords := opts .SplitLongWords
80+
81+ // Empty or single-chunk within limit (no overlap needed)
82+ if paragraph == "" {
83+ return []string {"" }
84+ }
85+ if len (paragraph ) <= maxSize && overlap == 0 {
86+ words := strings .Fields (paragraph )
87+ needSplit := false
88+ for _ , w := range words {
89+ if len (w ) > maxSize && splitLongWords {
90+ needSplit = true
91+ break
92+ }
93+ }
94+ if ! needSplit {
95+ return []string {paragraph }
96+ }
97+ }
98+
99+ words := strings .Fields (paragraph )
15100 var chunks []string
16101 var currentChunk strings.Builder
17-
18- words := strings .Fields (paragraph ) // Splits the paragraph into words.
102+ var overlapPrefix string // word-aligned prefix for next chunk (from previous chunk's tail)
19103
20104 for _ , word := range words {
21- // If adding the next word would exceed maxChunkSize (considering a space if not the first word in a chunk),
22- // add the currentChunk to chunks, and reset currentChunk.
23- if currentChunk .Len () > 0 && currentChunk .Len ()+ len (word )+ 1 > maxChunkSize { // +1 for the space if not the first word
24- chunks = append (chunks , currentChunk .String ())
25- currentChunk .Reset ()
26- } else if currentChunk .Len () == 0 && len (word ) > maxChunkSize { // Word itself exceeds maxChunkSize, split the word
27- chunks = append (chunks , word )
105+ // Long word: split into pieces when SplitLongWords is true
106+ if len (word ) > maxSize && splitLongWords {
107+ // Flush current chunk first
108+ if currentChunk .Len () > 0 {
109+ chunks = append (chunks , currentChunk .String ())
110+ if overlap > 0 {
111+ overlapPrefix = overlapTail (currentChunk .String (), overlap )
112+ } else {
113+ overlapPrefix = ""
114+ }
115+ currentChunk .Reset ()
116+ }
117+ pieces := splitLongString (word , maxSize )
118+ for _ , p := range pieces {
119+ chunks = append (chunks , p )
120+ if overlap > 0 {
121+ overlapPrefix = overlapTail (p , overlap )
122+ }
123+ }
28124 continue
29125 }
30126
31- // Add a space before the word if it's not the beginning of a new chunk.
127+ // Normal word: compute length if we add this word
128+ var nextLen int
32129 if currentChunk .Len () > 0 {
33- currentChunk .WriteString (" " )
130+ nextLen = currentChunk .Len () + 1 + len (word )
131+ } else if overlapPrefix != "" {
132+ nextLen = len (overlapPrefix ) + 1 + len (word )
133+ } else {
134+ nextLen = len (word )
34135 }
35136
36- // Add the word to the current chunk.
37- currentChunk .WriteString (word )
137+ if nextLen > maxSize {
138+ // Flush current chunk
139+ if currentChunk .Len () > 0 {
140+ chunks = append (chunks , currentChunk .String ())
141+ if overlap > 0 {
142+ overlapPrefix = overlapTail (currentChunk .String (), overlap )
143+ } else {
144+ overlapPrefix = ""
145+ }
146+ currentChunk .Reset ()
147+ }
148+ // Start new chunk with overlap prefix only if it fits with the word
149+ if overlapPrefix != "" && len (overlapPrefix )+ 1 + len (word ) <= maxSize {
150+ currentChunk .WriteString (overlapPrefix )
151+ currentChunk .WriteString (" " )
152+ currentChunk .WriteString (word )
153+ overlapPrefix = ""
154+ } else {
155+ currentChunk .WriteString (word )
156+ overlapPrefix = ""
157+ }
158+ } else {
159+ if currentChunk .Len () == 0 && overlapPrefix != "" {
160+ currentChunk .WriteString (overlapPrefix )
161+ currentChunk .WriteString (" " )
162+ currentChunk .WriteString (word )
163+ overlapPrefix = ""
164+ } else if currentChunk .Len () > 0 {
165+ currentChunk .WriteString (" " )
166+ currentChunk .WriteString (word )
167+ } else {
168+ currentChunk .WriteString (word )
169+ }
170+ }
38171 }
39172
40- // After the loop, add any remaining content in currentChunk to chunks.
41173 if currentChunk .Len () > 0 {
42174 chunks = append (chunks , currentChunk .String ())
43175 }
44176
45177 return chunks
46178}
179+
180+ // SplitParagraphIntoChunks takes a paragraph and a maxChunkSize as input,
181+ // and returns a slice of strings where each string is a chunk of the paragraph
182+ // that is at most maxChunkSize long, ensuring that words are not split.
183+ // Words longer than maxChunkSize are split into smaller chunks.
184+ // For overlap and other options, use SplitParagraphIntoChunksWithOptions.
185+ func SplitParagraphIntoChunks (paragraph string , maxChunkSize int ) []string {
186+ return SplitParagraphIntoChunksWithOptions (paragraph , Options {
187+ MaxSize : maxChunkSize ,
188+ Overlap : 0 ,
189+ SplitLongWords : true ,
190+ })
191+ }
0 commit comments