Skip to content

Commit 2a41e1b

Browse files
committed
chore: fix edge cases in embedding
1 parent feff224 commit 2a41e1b

File tree

6 files changed

+339
-21
lines changed

6 files changed

+339
-21
lines changed

packages/core/src/chunking.ts

Lines changed: 114 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,13 @@ const CHUNK_LEVEL_MAP: Record<Exclude<ChunkingStrategy["chunk_by"], "file">, num
3535
h3: 3
3636
};
3737

38+
/**
39+
* Default maximum chunk size in characters (~6,700 tokens at ~3 chars/token),
40+
* well under OpenAI's 8,191-token embedding limit. Applied when no explicit
41+
* `max_chunk_size` is configured.
42+
*/
43+
export const DEFAULT_MAX_CHUNK_SIZE = 20_000;
44+
3845
// ─── Public API ──────────────────────────────────────────────────
3946

4047
export function buildChunks(input: BuildChunksInput): Chunk[] {
@@ -240,28 +247,23 @@ function slugify(value: string): string {
240247
// ─── AST-safe size rules ─────────────────────────────────────────
241248

242249
function applySizeRules(segments: Segment[], strategy: ChunkingStrategy): Segment[] {
243-
const max = strategy.max_chunk_size;
250+
const max = strategy.max_chunk_size ?? DEFAULT_MAX_CHUNK_SIZE;
244251
const min = strategy.min_chunk_size;
245252

246-
// Phase 1: split oversized segments using AST node boundaries
253+
// Phase 1: split oversized segments — try recursive heading refinement first,
254+
// then fall back to AST node boundary splitting.
247255
const expanded: Segment[] = [];
248256

249257
for (const segment of segments) {
250258
const contentLength = rawMarkdown(segment.nodes, segment.fullMarkdown).length;
251259

252-
if (!max || contentLength <= max) {
260+
if (contentLength <= max) {
253261
expanded.push(segment);
254262
continue;
255263
}
256264

257-
const nodeGroups = splitByNodeSize(segment.nodes, segment.fullMarkdown, max);
258-
nodeGroups.forEach((groupNodes, partIndex) => {
259-
expanded.push({
260-
...segment,
261-
nodes: groupNodes,
262-
part: partIndex + 1
263-
});
264-
});
265+
const refined = refineOversizedSegment(segment, max);
266+
expanded.push(...refined);
265267
}
266268

267269
// Phase 2: merge undersized segments into previous (Opus-style breadcrumb check)
@@ -289,6 +291,107 @@ function applySizeRules(segments: Segment[], strategy: ChunkingStrategy): Segmen
289291
return merged;
290292
}
291293

294+
/**
295+
* Recursively refine an oversized segment by splitting at progressively finer
296+
* heading levels (headingLevel+1, +2, ... up to h6). Falls back to AST node
297+
* boundary splitting when no sub-headings exist.
298+
*/
299+
function refineOversizedSegment(segment: Segment, max: number): Segment[] {
300+
const nextLevel = segment.headingLevel + 1;
301+
if (nextLevel > 6) {
302+
return splitByNodeSizeSegments(segment, max);
303+
}
304+
305+
// Find sub-heading boundaries at nextLevel within this segment's nodes
306+
const subBoundaries: Array<{ nodeIndex: number; heading: string; slug: string }> = [];
307+
const slugCounts = new Map<string, number>();
308+
309+
for (let i = 0; i < segment.nodes.length; i += 1) {
310+
const node = segment.nodes[i]!;
311+
if (node.type === "heading" && node.depth === nextLevel) {
312+
const heading = toString(node).trim() || "section";
313+
const baseSlug = slugify(heading) || "section";
314+
const count = (slugCounts.get(baseSlug) ?? 0) + 1;
315+
slugCounts.set(baseSlug, count);
316+
const slug = count === 1 ? baseSlug : `${baseSlug}-${count}`;
317+
subBoundaries.push({ nodeIndex: i, heading, slug });
318+
}
319+
}
320+
321+
if (subBoundaries.length === 0) {
322+
// No sub-headings at this level — try the next level down
323+
const deeper: Segment = { ...segment, headingLevel: nextLevel };
324+
return refineOversizedSegment(deeper, max);
325+
}
326+
327+
const subSegments: Segment[] = [];
328+
329+
// Preamble: nodes before the first sub-heading (inherits parent heading)
330+
if (subBoundaries[0]!.nodeIndex > 0) {
331+
const preambleNodes = segment.nodes.slice(0, subBoundaries[0]!.nodeIndex);
332+
const preambleContent = rawMarkdown(preambleNodes, segment.fullMarkdown);
333+
if (preambleContent.trim()) {
334+
subSegments.push({
335+
...segment,
336+
nodes: preambleNodes,
337+
part: 1
338+
});
339+
}
340+
}
341+
342+
// Create sub-segments for each sub-heading
343+
for (let i = 0; i < subBoundaries.length; i += 1) {
344+
const boundary = subBoundaries[i]!;
345+
const next = subBoundaries[i + 1];
346+
const startIdx = boundary.nodeIndex;
347+
const endIdx = next ? next.nodeIndex : segment.nodes.length;
348+
const sectionNodes = segment.nodes.slice(startIdx, endIdx);
349+
350+
const content = rawMarkdown(sectionNodes, segment.fullMarkdown);
351+
if (!content.trim()) {
352+
continue;
353+
}
354+
355+
subSegments.push({
356+
kind: "heading",
357+
heading: boundary.heading,
358+
headingLevel: nextLevel,
359+
ancestorTexts: [...segment.ancestorTexts, ...(segment.heading ? [segment.heading] : [])],
360+
ancestorSlugs: [...segment.ancestorSlugs, ...(segment.slug ? [segment.slug] : [])],
361+
slug: boundary.slug,
362+
nodes: sectionNodes,
363+
fullMarkdown: segment.fullMarkdown,
364+
part: 1
365+
});
366+
}
367+
368+
// Recursively refine any sub-segments that are still oversized
369+
const result: Segment[] = [];
370+
for (const sub of subSegments) {
371+
const subLength = rawMarkdown(sub.nodes, sub.fullMarkdown).length;
372+
if (subLength <= max) {
373+
result.push(sub);
374+
} else {
375+
result.push(...refineOversizedSegment(sub, max));
376+
}
377+
}
378+
379+
return result;
380+
}
381+
382+
/**
383+
* Fallback: split an oversized segment at AST node boundaries, producing
384+
* multi-part segments with the same heading metadata.
385+
*/
386+
function splitByNodeSizeSegments(segment: Segment, max: number): Segment[] {
387+
const nodeGroups = splitByNodeSize(segment.nodes, segment.fullMarkdown, max);
388+
return nodeGroups.map((groupNodes, partIndex) => ({
389+
...segment,
390+
nodes: groupNodes,
391+
part: partIndex + 1
392+
}));
393+
}
394+
292395
/**
293396
* AST-safe max-size splitting (from Gemini approach).
294397
*

packages/core/src/embedding.ts

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,13 @@ export class HashEmbeddingProvider implements EmbeddingProvider {
7373
}
7474
}
7575

76+
/**
77+
* Conservative character limit per text input to stay under the 8191-token
78+
* context window of OpenAI embedding models. We use ~3 chars/token as a
79+
 * safety margin, so 8,000 tokens × 3 chars/token = 24,000 characters.
80+
*/
81+
const DEFAULT_MAX_INPUT_CHARS = 24_000;
82+
7683
export class OpenAIEmbeddingProvider implements EmbeddingProvider {
7784
readonly name = "openai";
7885
readonly model: string;
@@ -139,6 +146,16 @@ export class OpenAIEmbeddingProvider implements EmbeddingProvider {
139146

140147
private async embedBatchWithRetry(batch: string[]): Promise<number[][]> {
141148
let attempt = 0;
149+
const truncated = batch.map((text) => {
150+
if (text.length > DEFAULT_MAX_INPUT_CHARS) {
151+
console.warn(
152+
`[docs-mcp] Embedding input truncated from ${text.length} to ${DEFAULT_MAX_INPUT_CHARS} characters. ` +
153+
`Consider lowering max_chunk_size in your chunking strategy to avoid content loss.`
154+
);
155+
return text.slice(0, DEFAULT_MAX_INPUT_CHARS);
156+
}
157+
return text;
158+
});
142159

143160
while (true) {
144161
const response = await fetch(`${this.baseUrl}/embeddings`, {
@@ -149,7 +166,7 @@ export class OpenAIEmbeddingProvider implements EmbeddingProvider {
149166
},
150167
body: JSON.stringify({
151168
model: this.model,
152-
input: batch,
169+
input: truncated,
153170
dimensions: this.dimensions
154171
})
155172
});

packages/core/src/manifest-schema.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ export const ChunkingStrategySchema = z
1313
.positive()
1414
.optional()
1515
.describe(
16-
"Maximum chunk size in characters. Chunks exceeding this limit are split at the next available boundary to prevent oversized results."
16+
"Maximum chunk size in characters (default: 20000). Oversized chunks are first split recursively at finer heading levels (e.g. h2→h3→h4→…→h6), preserving semantic structure and breadcrumbs. Only when no further sub-headings exist does it fall back to AST node boundary splitting."
1717
)
1818
.meta({ examples: [8000] }),
1919
min_chunk_size: z

packages/core/test/chunking.test.ts

Lines changed: 166 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import { describe, expect, it } from "vitest";
2-
import { buildChunks } from "../src/chunking.js";
2+
import { buildChunks, DEFAULT_MAX_CHUNK_SIZE } from "../src/chunking.js";
33

44
describe("buildChunks", () => {
55
it("creates deterministic chunk IDs and resolves duplicates", () => {
@@ -114,4 +114,169 @@ describe("buildChunks", () => {
114114
expect(chunks[0]?.chunk_id).toBe("guides/min-merge-parts.md#one");
115115
expect(chunks[0]?.content).toContain("tiny");
116116
});
117+
118+
// Tests for the recursive-heading-refinement path of applySizeRules:
// oversized sections are split at finer heading levels before falling back
// to AST node boundary splitting.
describe("recursive heading refinement", () => {
  // Helper producing filler content of an exact character length, so each
  // section's size relative to max_chunk_size is deterministic.
  const bigBody = (chars: number) => "x".repeat(chars);

  it("splits oversized h2 at h3 sub-heading boundaries", () => {
    const markdown = [
      "## Authentication",
      bigBody(50),
      "",
      "### OAuth",
      bigBody(50),
      "",
      "### JWT",
      bigBody(50),
      "",
      "### API Keys",
      bigBody(50)
    ].join("\n");

    const chunks = buildChunks({
      filepath: "docs/auth.md",
      markdown,
      strategy: { chunk_by: "h2", max_chunk_size: 100 }
    });

    const ids = chunks.map((c) => c.chunk_id);
    // Preamble content before the first h3 inherits the parent heading
    expect(ids).toContain("docs/auth.md#authentication");
    // Sub-headings get proper nested IDs
    expect(ids).toContain("docs/auth.md#authentication/oauth");
    expect(ids).toContain("docs/auth.md#authentication/jwt");
    expect(ids).toContain("docs/auth.md#authentication/api-keys");

    // Sub-chunks have correct breadcrumbs
    const oauthChunk = chunks.find((c) => c.chunk_id === "docs/auth.md#authentication/oauth");
    expect(oauthChunk?.breadcrumb).toBe("docs/auth.md > Authentication > OAuth");
    expect(oauthChunk?.heading).toBe("OAuth");
    expect(oauthChunk?.heading_level).toBe(3);
  });

  it("recursively refines h2 > h3 > h4 when multiple levels are oversized", () => {
    // Both the h2 and its h3 child exceed max, forcing two refinement passes.
    const markdown = [
      "## Config",
      bigBody(50),
      "",
      "### Advanced",
      bigBody(50),
      "",
      "#### Timeouts",
      bigBody(50),
      "",
      "#### Retries",
      bigBody(50)
    ].join("\n");

    const chunks = buildChunks({
      filepath: "docs/config.md",
      markdown,
      strategy: { chunk_by: "h2", max_chunk_size: 100 }
    });

    const ids = chunks.map((c) => c.chunk_id);
    expect(ids).toContain("docs/config.md#config");
    expect(ids).toContain("docs/config.md#config/advanced");
    expect(ids).toContain("docs/config.md#config/advanced/timeouts");
    expect(ids).toContain("docs/config.md#config/advanced/retries");

    // Verify deep breadcrumbs
    const timeouts = chunks.find((c) => c.chunk_id === "docs/config.md#config/advanced/timeouts");
    expect(timeouts?.breadcrumb).toBe("docs/config.md > Config > Advanced > Timeouts");
  });

  it("falls back to AST node splitting when no sub-headings exist", () => {
    // Two oversized paragraphs under one h2, no h3-h6 headings anywhere.
    const markdown = [
      "## Huge Section",
      "Paragraph one. " + bigBody(80),
      "",
      "Paragraph two. " + bigBody(80)
    ].join("\n");

    const chunks = buildChunks({
      filepath: "docs/huge.md",
      markdown,
      strategy: { chunk_by: "h2", max_chunk_size: 100 }
    });

    // Should produce multiple parts since there are no sub-headings
    expect(chunks.length).toBeGreaterThan(1);
    // All parts share the same base slug with part suffixes
    expect(chunks[0]?.chunk_id).toBe("docs/huge.md#huge-section");
    expect(chunks[1]?.chunk_id).toBe("docs/huge.md#huge-section-part-2");
  });

  it("applies DEFAULT_MAX_CHUNK_SIZE when no explicit max_chunk_size is set", () => {
    // Create a chunk that exceeds DEFAULT_MAX_CHUNK_SIZE
    const markdown = [
      "## Giant",
      bigBody(DEFAULT_MAX_CHUNK_SIZE + 1000),
      "",
      "## Small",
      "tiny"
    ].join("\n");

    const chunks = buildChunks({
      filepath: "docs/giant.md",
      markdown,
      strategy: { chunk_by: "h2" }
    });

    // The giant section should be split even without explicit max_chunk_size
    const giantChunks = chunks.filter((c) => c.chunk_id.startsWith("docs/giant.md#giant"));
    expect(giantChunks.length).toBeGreaterThan(1);
  });

  it("preserves preamble content within a refined section", () => {
    // Small preamble text sits before the first h3 inside an oversized h2.
    const markdown = [
      "## Parent",
      "This is the preamble before any h3.",
      "",
      "### Child One",
      bigBody(80),
      "",
      "### Child Two",
      bigBody(80)
    ].join("\n");

    const chunks = buildChunks({
      filepath: "docs/preamble-refine.md",
      markdown,
      strategy: { chunk_by: "h2", max_chunk_size: 100 }
    });

    // Preamble content should be preserved in a chunk with the parent heading
    const parentChunk = chunks.find((c) => c.chunk_id === "docs/preamble-refine.md#parent");
    expect(parentChunk).toBeDefined();
    expect(parentChunk?.content).toContain("preamble before any h3");

    // Sub-heading chunks should also exist
    const ids = chunks.map((c) => c.chunk_id);
    expect(ids).toContain("docs/preamble-refine.md#parent/child-one");
    expect(ids).toContain("docs/preamble-refine.md#parent/child-two");
  });

  it("deduplicates slugs within a refined section", () => {
    // Two identical h3 headings must yield distinct "-2"-suffixed slugs.
    const markdown = [
      "## Parent",
      "",
      "### Example",
      bigBody(80),
      "",
      "### Example",
      bigBody(80)
    ].join("\n");

    const chunks = buildChunks({
      filepath: "docs/dedup.md",
      markdown,
      strategy: { chunk_by: "h2", max_chunk_size: 100 }
    });

    const ids = chunks.map((c) => c.chunk_id);
    expect(ids).toContain("docs/dedup.md#parent/example");
    expect(ids).toContain("docs/dedup.md#parent/example-2");
  });
});
117282
});

0 commit comments

Comments
 (0)