Skip to content

Commit 37bec00

Browse files
committed
improvement(kb): improve chunkers, respect user-specified chunk configurations, added tests (#2539)
* improvement(kb): improve chunkers, respect user-specified chunk configurations, added tests * ack PR comments * updated docs * cleanup
1 parent 426c7d6 commit 37bec00

File tree

19 files changed

+540
-177
lines changed

19 files changed

+540
-177
lines changed

apps/docs/content/docs/en/knowledgebase/index.mdx

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,15 @@ Once your documents are processed, you can view and edit the individual chunks.
3434
<Image src="/static/knowledgebase/knowledgebase.png" alt="Document chunks view showing processed content" width={800} height={500} />
3535

3636
### Chunk Configuration
37-
- **Default chunk size**: 1,024 characters
38-
- **Configurable range**: 100-4,000 characters per chunk
39-
- **Smart overlap**: 200 characters by default for context preservation
37+
38+
When creating a knowledge base, you can configure how documents are split into chunks:
39+
40+
| Setting | Unit | Default | Range | Description |
41+
|---------|------|---------|-------|-------------|
42+
| **Max Chunk Size** | tokens | 1,024 | 100-4,000 | Maximum size of each chunk (1 token ≈ 4 characters) |
43+
| **Min Chunk Size** | characters | 1 | 1-2,000 | Minimum chunk size to avoid tiny fragments |
44+
| **Overlap** | tokens | 200 | 0-500 | Context overlap between consecutive chunks (1 token ≈ 4 characters) |
45+
4046
- **Hierarchical splitting**: Respects document structure (sections, paragraphs, sentences)
4147

4248
### Editing Capabilities

apps/sim/app/api/knowledge/[id]/documents/route.ts

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,13 +34,24 @@ const CreateDocumentSchema = z.object({
3434
documentTagsData: z.string().optional(),
3535
})
3636

37+
/**
38+
* Schema for bulk document creation with processing options
39+
*
40+
* Processing options units:
41+
* - chunkSize: tokens (1 token ≈ 4 characters)
42+
* - minCharactersPerChunk: characters
43+
* - chunkOverlap: characters
44+
*/
3745
const BulkCreateDocumentsSchema = z.object({
3846
documents: z.array(CreateDocumentSchema),
3947
processingOptions: z.object({
48+
/** Maximum chunk size in tokens (1 token ≈ 4 characters) */
4049
chunkSize: z.number().min(100).max(4000),
50+
/** Minimum chunk size in characters */
4151
minCharactersPerChunk: z.number().min(1).max(2000),
4252
recipe: z.string(),
4353
lang: z.string(),
54+
/** Overlap between chunks in characters */
4455
chunkOverlap: z.number().min(0).max(500),
4556
}),
4657
bulk: z.literal(true),

apps/sim/app/api/knowledge/[id]/route.ts

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,14 @@ import { checkKnowledgeBaseAccess, checkKnowledgeBaseWriteAccess } from '@/app/a
1212

1313
const logger = createLogger('KnowledgeBaseByIdAPI')
1414

15+
/**
16+
* Schema for updating a knowledge base
17+
*
18+
* Chunking config units:
19+
* - maxSize: tokens (1 token ≈ 4 characters)
20+
* - minSize: characters
21+
* - overlap: tokens (1 token ≈ 4 characters)
22+
*/
1523
const UpdateKnowledgeBaseSchema = z.object({
1624
name: z.string().min(1, 'Name is required').optional(),
1725
description: z.string().optional(),
@@ -20,10 +28,23 @@ const UpdateKnowledgeBaseSchema = z.object({
2028
workspaceId: z.string().nullable().optional(),
2129
chunkingConfig: z
2230
.object({
23-
maxSize: z.number(),
24-
minSize: z.number(),
25-
overlap: z.number(),
31+
/** Maximum chunk size in tokens (1 token ≈ 4 characters) */
32+
maxSize: z.number().min(100).max(4000),
33+
/** Minimum chunk size in characters */
34+
minSize: z.number().min(1).max(2000),
35+
/** Overlap between chunks in tokens (1 token ≈ 4 characters) */
36+
overlap: z.number().min(0).max(500),
2637
})
38+
.refine(
39+
(data) => {
40+
// Convert maxSize from tokens to characters for comparison (1 token ≈ 4 chars)
41+
const maxSizeInChars = data.maxSize * 4
42+
return data.minSize < maxSizeInChars
43+
},
44+
{
45+
message: 'Min chunk size (characters) must be less than max chunk size (tokens × 4)',
46+
}
47+
)
2748
.optional(),
2849
})
2950

apps/sim/app/api/knowledge/route.test.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -139,8 +139,8 @@ describe('Knowledge Base API Route', () => {
139139
const invalidData = {
140140
name: 'Test KB',
141141
chunkingConfig: {
142-
maxSize: 100,
143-
minSize: 200, // Invalid: minSize > maxSize
142+
maxSize: 100, // 100 tokens = 400 characters
143+
minSize: 500, // Invalid: minSize (500 chars) > maxSize (400 chars)
144144
overlap: 50,
145145
},
146146
}
@@ -168,7 +168,7 @@ describe('Knowledge Base API Route', () => {
168168
expect(data.data.embeddingDimension).toBe(1536)
169169
expect(data.data.chunkingConfig).toEqual({
170170
maxSize: 1024,
171-
minSize: 1,
171+
minSize: 100,
172172
overlap: 200,
173173
})
174174
})

apps/sim/app/api/knowledge/route.ts

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,14 @@ import { createLogger } from '@/lib/logs/console/logger'
77

88
const logger = createLogger('KnowledgeBaseAPI')
99

10+
/**
11+
* Schema for creating a knowledge base
12+
*
13+
* Chunking config units:
14+
* - maxSize: tokens (1 token ≈ 4 characters)
15+
* - minSize: characters
16+
* - overlap: tokens (1 token ≈ 4 characters)
17+
*/
1018
const CreateKnowledgeBaseSchema = z.object({
1119
name: z.string().min(1, 'Name is required'),
1220
description: z.string().optional(),
@@ -15,18 +23,28 @@ const CreateKnowledgeBaseSchema = z.object({
1523
embeddingDimension: z.literal(1536).default(1536),
1624
chunkingConfig: z
1725
.object({
26+
/** Maximum chunk size in tokens (1 token ≈ 4 characters) */
1827
maxSize: z.number().min(100).max(4000).default(1024),
19-
minSize: z.number().min(1).max(2000).default(1),
28+
/** Minimum chunk size in characters */
29+
minSize: z.number().min(1).max(2000).default(100),
30+
/** Overlap between chunks in tokens (1 token ≈ 4 characters) */
2031
overlap: z.number().min(0).max(500).default(200),
2132
})
2233
.default({
2334
maxSize: 1024,
24-
minSize: 1,
35+
minSize: 100,
2536
overlap: 200,
2637
})
27-
.refine((data) => data.minSize < data.maxSize, {
28-
message: 'Min chunk size must be less than max chunk size',
29-
}),
38+
.refine(
39+
(data) => {
40+
// Convert maxSize from tokens to characters for comparison (1 token ≈ 4 chars)
41+
const maxSizeInChars = data.maxSize * 4
42+
return data.minSize < maxSizeInChars
43+
},
44+
{
45+
message: 'Min chunk size (characters) must be less than max chunk size (tokens × 4)',
46+
}
47+
),
3048
})
3149

3250
export async function GET(req: NextRequest) {

apps/sim/app/api/knowledge/utils.test.ts

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,12 @@ describe('Knowledge Utils', () => {
183183

184184
describe('processDocumentAsync', () => {
185185
it.concurrent('should insert embeddings before updating document counters', async () => {
186-
kbRows.push({ id: 'kb1', userId: 'user1', workspaceId: null })
186+
kbRows.push({
187+
id: 'kb1',
188+
userId: 'user1',
189+
workspaceId: null,
190+
chunkingConfig: { maxSize: 1024, minSize: 1, overlap: 200 },
191+
})
187192
docRows.push({ id: 'doc1', knowledgeBaseId: 'kb1' })
188193

189194
await processDocumentAsync(

apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx

Lines changed: 29 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -44,23 +44,33 @@ const FormSchema = z
4444
.max(100, 'Name must be less than 100 characters')
4545
.refine((value) => value.trim().length > 0, 'Name cannot be empty'),
4646
description: z.string().max(500, 'Description must be less than 500 characters').optional(),
47+
/** Minimum chunk size in characters */
4748
minChunkSize: z
4849
.number()
49-
.min(1, 'Min chunk size must be at least 1')
50-
.max(2000, 'Min chunk size must be less than 2000'),
50+
.min(1, 'Min chunk size must be at least 1 character')
51+
.max(2000, 'Min chunk size must be less than 2000 characters'),
52+
/** Maximum chunk size in tokens (1 token ≈ 4 characters) */
5153
maxChunkSize: z
5254
.number()
53-
.min(100, 'Max chunk size must be at least 100')
54-
.max(4000, 'Max chunk size must be less than 4000'),
55+
.min(100, 'Max chunk size must be at least 100 tokens')
56+
.max(4000, 'Max chunk size must be less than 4000 tokens'),
57+
/** Overlap between chunks in tokens */
5558
overlapSize: z
5659
.number()
57-
.min(0, 'Overlap size must be non-negative')
58-
.max(500, 'Overlap size must be less than 500'),
59-
})
60-
.refine((data) => data.minChunkSize < data.maxChunkSize, {
61-
message: 'Min chunk size must be less than max chunk size',
62-
path: ['minChunkSize'],
60+
.min(0, 'Overlap must be non-negative')
61+
.max(500, 'Overlap must be less than 500 tokens'),
6362
})
63+
.refine(
64+
(data) => {
65+
// Convert maxChunkSize from tokens to characters for comparison (1 token ≈ 4 chars)
66+
const maxChunkSizeInChars = data.maxChunkSize * 4
67+
return data.minChunkSize < maxChunkSizeInChars
68+
},
69+
{
70+
message: 'Min chunk size (characters) must be less than max chunk size (tokens × 4)',
71+
path: ['minChunkSize'],
72+
}
73+
)
6474

6575
type FormValues = z.infer<typeof FormSchema>
6676

@@ -123,7 +133,7 @@ export function CreateBaseModal({
123133
defaultValues: {
124134
name: '',
125135
description: '',
126-
minChunkSize: 1,
136+
minChunkSize: 100,
127137
maxChunkSize: 1024,
128138
overlapSize: 200,
129139
},
@@ -143,7 +153,7 @@ export function CreateBaseModal({
143153
reset({
144154
name: '',
145155
description: '',
146-
minChunkSize: 1,
156+
minChunkSize: 100,
147157
maxChunkSize: 1024,
148158
overlapSize: 200,
149159
})
@@ -381,10 +391,10 @@ export function CreateBaseModal({
381391
<div className='space-y-[12px] rounded-[6px] bg-[var(--surface-6)] px-[12px] py-[14px]'>
382392
<div className='grid grid-cols-2 gap-[12px]'>
383393
<div className='flex flex-col gap-[8px]'>
384-
<Label htmlFor='minChunkSize'>Min Chunk Size</Label>
394+
<Label htmlFor='minChunkSize'>Min Chunk Size (characters)</Label>
385395
<Input
386396
id='minChunkSize'
387-
placeholder='1'
397+
placeholder='100'
388398
{...register('minChunkSize', { valueAsNumber: true })}
389399
className={cn(errors.minChunkSize && 'border-[var(--text-error)]')}
390400
autoComplete='off'
@@ -394,7 +404,7 @@ export function CreateBaseModal({
394404
</div>
395405

396406
<div className='flex flex-col gap-[8px]'>
397-
<Label htmlFor='maxChunkSize'>Max Chunk Size</Label>
407+
<Label htmlFor='maxChunkSize'>Max Chunk Size (tokens)</Label>
398408
<Input
399409
id='maxChunkSize'
400410
placeholder='1024'
@@ -408,7 +418,7 @@ export function CreateBaseModal({
408418
</div>
409419

410420
<div className='flex flex-col gap-[8px]'>
411-
<Label htmlFor='overlapSize'>Overlap Size</Label>
421+
<Label htmlFor='overlapSize'>Overlap (tokens)</Label>
412422
<Input
413423
id='overlapSize'
414424
placeholder='200'
@@ -419,6 +429,9 @@ export function CreateBaseModal({
419429
name='overlap-size'
420430
/>
421431
</div>
432+
<p className='text-[11px] text-[var(--text-muted)]'>
433+
1 token ≈ 4 characters. Max chunk size and overlap are in tokens.
434+
</p>
422435
</div>
423436

424437
<div className='flex flex-col gap-[8px]'>

apps/sim/lib/chunkers/docs-chunker.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,8 @@ export class DocsChunker {
3232
// Use the existing TextChunker for chunking logic
3333
this.textChunker = new TextChunker({
3434
chunkSize: options.chunkSize ?? 300, // Max 300 tokens per chunk
35-
minChunkSize: options.minChunkSize ?? 1,
36-
overlap: options.overlap ?? 50,
35+
minCharactersPerChunk: options.minCharactersPerChunk ?? 1,
36+
chunkOverlap: options.chunkOverlap ?? 50,
3737
})
3838
// Use localhost docs in development, production docs otherwise
3939
this.baseUrl = options.baseUrl ?? 'https://docs.sim.ai'

apps/sim/lib/chunkers/json-yaml-chunker.ts

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,19 +21,20 @@ function getTokenCount(text: string): number {
2121
* Reduced limits to ensure we stay well under OpenAI's 8,191 token limit per embedding request
2222
*/
2323
const JSON_YAML_CHUNKING_CONFIG = {
24-
TARGET_CHUNK_SIZE: 1000, // Target tokens per chunk
25-
MIN_CHUNK_SIZE: 100, // Minimum tokens per chunk
24+
TARGET_CHUNK_SIZE: 1024, // Target tokens per chunk
25+
MIN_CHARACTERS_PER_CHUNK: 100, // Minimum characters per chunk to filter tiny fragments
2626
MAX_CHUNK_SIZE: 1500, // Maximum tokens per chunk
2727
MAX_DEPTH_FOR_SPLITTING: 5, // Maximum depth to traverse for splitting
2828
}
2929

3030
export class JsonYamlChunker {
31-
private chunkSize: number
32-
private minChunkSize: number
31+
private chunkSize: number // in tokens
32+
private minCharactersPerChunk: number // in characters
3333

3434
constructor(options: ChunkerOptions = {}) {
35-
this.chunkSize = options.chunkSize || JSON_YAML_CHUNKING_CONFIG.TARGET_CHUNK_SIZE
36-
this.minChunkSize = options.minChunkSize || JSON_YAML_CHUNKING_CONFIG.MIN_CHUNK_SIZE
35+
this.chunkSize = options.chunkSize ?? JSON_YAML_CHUNKING_CONFIG.TARGET_CHUNK_SIZE
36+
this.minCharactersPerChunk =
37+
options.minCharactersPerChunk ?? JSON_YAML_CHUNKING_CONFIG.MIN_CHARACTERS_PER_CHUNK
3738
}
3839

3940
/**
@@ -99,7 +100,8 @@ export class JsonYamlChunker {
99100
const content = JSON.stringify(data, null, 2)
100101
const tokenCount = getTokenCount(content)
101102

102-
if (tokenCount >= this.minChunkSize) {
103+
// Filter tiny fragments using character count
104+
if (content.length >= this.minCharactersPerChunk) {
103105
chunks.push({
104106
text: content,
105107
tokenCount,
@@ -318,7 +320,8 @@ export class JsonYamlChunker {
318320
}
319321
}
320322

321-
if (currentChunk && currentTokens >= this.minChunkSize) {
323+
// Filter tiny fragments using character count
324+
if (currentChunk && currentChunk.length >= this.minCharactersPerChunk) {
322325
chunks.push({
323326
text: currentChunk,
324327
tokenCount: currentTokens,

0 commit comments

Comments
 (0)