Skip to content

Commit 2a41e1b

Browse files
committed
chore: fix edge cases in embedding
1 parent feff224 commit 2a41e1b

File tree

6 files changed

+339
-21
lines changed

6 files changed

+339
-21
lines changed

packages/core/src/chunking.ts

Lines changed: 114 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,13 @@ const CHUNK_LEVEL_MAP: Record<Exclude<ChunkingStrategy["chunk_by"], "file">, num
3535
h3: 3
3636
};
3737

38+
/**
39+
* Default maximum chunk size in characters (~6,700 tokens at ~3 chars/token),
40+
* well under OpenAI's 8,191-token embedding limit. Applied when no explicit
41+
* `max_chunk_size` is configured.
42+
*/
43+
export const DEFAULT_MAX_CHUNK_SIZE = 20_000;
44+
3845
// ─── Public API ──────────────────────────────────────────────────
3946

4047
export function buildChunks(input: BuildChunksInput): Chunk[] {
@@ -240,28 +247,23 @@ function slugify(value: string): string {
240247
// ─── AST-safe size rules ─────────────────────────────────────────
241248

242249
function applySizeRules(segments: Segment[], strategy: ChunkingStrategy): Segment[] {
243-
const max = strategy.max_chunk_size;
250+
const max = strategy.max_chunk_size ?? DEFAULT_MAX_CHUNK_SIZE;
244251
const min = strategy.min_chunk_size;
245252

246-
// Phase 1: split oversized segments using AST node boundaries
253+
// Phase 1: split oversized segments — try recursive heading refinement first,
254+
// then fall back to AST node boundary splitting.
247255
const expanded: Segment[] = [];
248256

249257
for (const segment of segments) {
250258
const contentLength = rawMarkdown(segment.nodes, segment.fullMarkdown).length;
251259

252-
if (!max || contentLength <= max) {
260+
if (contentLength <= max) {
253261
expanded.push(segment);
254262
continue;
255263
}
256264

257-
const nodeGroups = splitByNodeSize(segment.nodes, segment.fullMarkdown, max);
258-
nodeGroups.forEach((groupNodes, partIndex) => {
259-
expanded.push({
260-
...segment,
261-
nodes: groupNodes,
262-
part: partIndex + 1
263-
});
264-
});
265+
const refined = refineOversizedSegment(segment, max);
266+
expanded.push(...refined);
265267
}
266268

267269
// Phase 2: merge undersized segments into previous (Opus-style breadcrumb check)
@@ -289,6 +291,107 @@ function applySizeRules(segments: Segment[], strategy: ChunkingStrategy): Segmen
289291
return merged;
290292
}
291293

294+
/**
295+
* Recursively refine an oversized segment by splitting at progressively finer
296+
* heading levels (headingLevel+1, +2, ... up to h6). Falls back to AST node
297+
* boundary splitting when no sub-headings exist.
298+
*/
299+
function refineOversizedSegment(segment: Segment, max: number): Segment[] {
300+
const nextLevel = segment.headingLevel + 1;
301+
if (nextLevel > 6) {
302+
return splitByNodeSizeSegments(segment, max);
303+
}
304+
305+
// Find sub-heading boundaries at nextLevel within this segment's nodes
306+
const subBoundaries: Array<{ nodeIndex: number; heading: string; slug: string }> = [];
307+
const slugCounts = new Map<string, number>();
308+
309+
for (let i = 0; i < segment.nodes.length; i += 1) {
310+
const node = segment.nodes[i]!;
311+
if (node.type === "heading" && node.depth === nextLevel) {
312+
const heading = toString(node).trim() || "section";
313+
const baseSlug = slugify(heading) || "section";
314+
const count = (slugCounts.get(baseSlug) ?? 0) + 1;
315+
slugCounts.set(baseSlug, count);
316+
const slug = count === 1 ? baseSlug : `${baseSlug}-${count}`;
317+
subBoundaries.push({ nodeIndex: i, heading, slug });
318+
}
319+
}
320+
321+
if (subBoundaries.length === 0) {
322+
// No sub-headings at this level — try the next level down
323+
const deeper: Segment = { ...segment, headingLevel: nextLevel };
324+
return refineOversizedSegment(deeper, max);
325+
}
326+
327+
const subSegments: Segment[] = [];
328+
329+
// Preamble: nodes before the first sub-heading (inherits parent heading)
330+
if (subBoundaries[0]!.nodeIndex > 0) {
331+
const preambleNodes = segment.nodes.slice(0, subBoundaries[0]!.nodeIndex);
332+
const preambleContent = rawMarkdown(preambleNodes, segment.fullMarkdown);
333+
if (preambleContent.trim()) {
334+
subSegments.push({
335+
...segment,
336+
nodes: preambleNodes,
337+
part: 1
338+
});
339+
}
340+
}
341+
342+
// Create sub-segments for each sub-heading
343+
for (let i = 0; i < subBoundaries.length; i += 1) {
344+
const boundary = subBoundaries[i]!;
345+
const next = subBoundaries[i + 1];
346+
const startIdx = boundary.nodeIndex;
347+
const endIdx = next ? next.nodeIndex : segment.nodes.length;
348+
const sectionNodes = segment.nodes.slice(startIdx, endIdx);
349+
350+
const content = rawMarkdown(sectionNodes, segment.fullMarkdown);
351+
if (!content.trim()) {
352+
continue;
353+
}
354+
355+
subSegments.push({
356+
kind: "heading",
357+
heading: boundary.heading,
358+
headingLevel: nextLevel,
359+
ancestorTexts: [...segment.ancestorTexts, ...(segment.heading ? [segment.heading] : [])],
360+
ancestorSlugs: [...segment.ancestorSlugs, ...(segment.slug ? [segment.slug] : [])],
361+
slug: boundary.slug,
362+
nodes: sectionNodes,
363+
fullMarkdown: segment.fullMarkdown,
364+
part: 1
365+
});
366+
}
367+
368+
// Recursively refine any sub-segments that are still oversized
369+
const result: Segment[] = [];
370+
for (const sub of subSegments) {
371+
const subLength = rawMarkdown(sub.nodes, sub.fullMarkdown).length;
372+
if (subLength <= max) {
373+
result.push(sub);
374+
} else {
375+
result.push(...refineOversizedSegment(sub, max));
376+
}
377+
}
378+
379+
return result;
380+
}
381+
382+
/**
383+
* Fallback: split an oversized segment at AST node boundaries, producing
384+
* multi-part segments with the same heading metadata.
385+
*/
386+
function splitByNodeSizeSegments(segment: Segment, max: number): Segment[] {
387+
const nodeGroups = splitByNodeSize(segment.nodes, segment.fullMarkdown, max);
388+
return nodeGroups.map((groupNodes, partIndex) => ({
389+
...segment,
390+
nodes: groupNodes,
391+
part: partIndex + 1
392+
}));
393+
}
394+
292395
/**
293396
* AST-safe max-size splitting (from Gemini approach).
294397
*

packages/core/src/embedding.ts

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,13 @@ export class HashEmbeddingProvider implements EmbeddingProvider {
7373
}
7474
}
7575

76+
/**
77+
* Conservative character limit per text input to stay under the 8191-token
78+
* context window of OpenAI embedding models. We use ~3 chars/token as a
79+
 * safety margin, so 8,000 tokens × 3 chars/token = 24,000 characters.
80+
*/
81+
const DEFAULT_MAX_INPUT_CHARS = 24_000;
82+
7683
export class OpenAIEmbeddingProvider implements EmbeddingProvider {
7784
readonly name = "openai";
7885
readonly model: string;
@@ -139,6 +146,16 @@ export class OpenAIEmbeddingProvider implements EmbeddingProvider {
139146

140147
private async embedBatchWithRetry(batch: string[]): Promise<number[][]> {
141148
let attempt = 0;
149+
const truncated = batch.map((text) => {
150+
if (text.length > DEFAULT_MAX_INPUT_CHARS) {
151+
console.warn(
152+
`[docs-mcp] Embedding input truncated from ${text.length} to ${DEFAULT_MAX_INPUT_CHARS} characters. ` +
153+
`Consider lowering max_chunk_size in your chunking strategy to avoid content loss.`
154+
);
155+
return text.slice(0, DEFAULT_MAX_INPUT_CHARS);
156+
}
157+
return text;
158+
});
142159

143160
while (true) {
144161
const response = await fetch(`${this.baseUrl}/embeddings`, {
@@ -149,7 +166,7 @@ export class OpenAIEmbeddingProvider implements EmbeddingProvider {
149166
},
150167
body: JSON.stringify({
151168
model: this.model,
152-
input: batch,
169+
input: truncated,
153170
dimensions: this.dimensions
154171
})
155172
});

packages/core/src/manifest-schema.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ export const ChunkingStrategySchema = z
1313
.positive()
1414
.optional()
1515
.describe(
16-
"Maximum chunk size in characters. Chunks exceeding this limit are split at the next available boundary to prevent oversized results."
16+
"Maximum chunk size in characters (default: 20000). Oversized chunks are first split recursively at finer heading levels (e.g. h2→h3→h4→…→h6), preserving semantic structure and breadcrumbs. Only when no further sub-headings exist does it fall back to AST node boundary splitting."
1717
)
1818
.meta({ examples: [8000] }),
1919
min_chunk_size: z

packages/core/test/chunking.test.ts

Lines changed: 166 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import { describe, expect, it } from "vitest";
2-
import { buildChunks } from "../src/chunking.js";
2+
import { buildChunks, DEFAULT_MAX_CHUNK_SIZE } from "../src/chunking.js";
33

44
describe("buildChunks", () => {
55
it("creates deterministic chunk IDs and resolves duplicates", () => {
@@ -114,4 +114,169 @@ describe("buildChunks", () => {
114114
expect(chunks[0]?.chunk_id).toBe("guides/min-merge-parts.md#one");
115115
expect(chunks[0]?.content).toContain("tiny");
116116
});
117+
118+
// Tests for the recursive-heading-refinement path of applySizeRules:
// oversized sections are split at finer heading levels before falling back
// to AST node boundary splitting.
describe("recursive heading refinement", () => {
  // Helper producing filler content of an exact character length, so each
  // section's size relative to max_chunk_size is deterministic.
  const bigBody = (chars: number) => "x".repeat(chars);

  it("splits oversized h2 at h3 sub-heading boundaries", () => {
    const markdown = [
      "## Authentication",
      bigBody(50),
      "",
      "### OAuth",
      bigBody(50),
      "",
      "### JWT",
      bigBody(50),
      "",
      "### API Keys",
      bigBody(50)
    ].join("\n");

    const chunks = buildChunks({
      filepath: "docs/auth.md",
      markdown,
      strategy: { chunk_by: "h2", max_chunk_size: 100 }
    });

    const ids = chunks.map((c) => c.chunk_id);
    // Preamble content before the first h3 inherits the parent heading
    expect(ids).toContain("docs/auth.md#authentication");
    // Sub-headings get proper nested IDs
    expect(ids).toContain("docs/auth.md#authentication/oauth");
    expect(ids).toContain("docs/auth.md#authentication/jwt");
    expect(ids).toContain("docs/auth.md#authentication/api-keys");

    // Sub-chunks have correct breadcrumbs
    const oauthChunk = chunks.find((c) => c.chunk_id === "docs/auth.md#authentication/oauth");
    expect(oauthChunk?.breadcrumb).toBe("docs/auth.md > Authentication > OAuth");
    expect(oauthChunk?.heading).toBe("OAuth");
    expect(oauthChunk?.heading_level).toBe(3);
  });

  it("recursively refines h2 > h3 > h4 when multiple levels are oversized", () => {
    // Both the h2 and its h3 child exceed max, forcing two refinement passes.
    const markdown = [
      "## Config",
      bigBody(50),
      "",
      "### Advanced",
      bigBody(50),
      "",
      "#### Timeouts",
      bigBody(50),
      "",
      "#### Retries",
      bigBody(50)
    ].join("\n");

    const chunks = buildChunks({
      filepath: "docs/config.md",
      markdown,
      strategy: { chunk_by: "h2", max_chunk_size: 100 }
    });

    const ids = chunks.map((c) => c.chunk_id);
    expect(ids).toContain("docs/config.md#config");
    expect(ids).toContain("docs/config.md#config/advanced");
    expect(ids).toContain("docs/config.md#config/advanced/timeouts");
    expect(ids).toContain("docs/config.md#config/advanced/retries");

    // Verify deep breadcrumbs
    const timeouts = chunks.find((c) => c.chunk_id === "docs/config.md#config/advanced/timeouts");
    expect(timeouts?.breadcrumb).toBe("docs/config.md > Config > Advanced > Timeouts");
  });

  it("falls back to AST node splitting when no sub-headings exist", () => {
    // Two oversized paragraphs under one h2, no h3-h6 headings anywhere.
    const markdown = [
      "## Huge Section",
      "Paragraph one. " + bigBody(80),
      "",
      "Paragraph two. " + bigBody(80)
    ].join("\n");

    const chunks = buildChunks({
      filepath: "docs/huge.md",
      markdown,
      strategy: { chunk_by: "h2", max_chunk_size: 100 }
    });

    // Should produce multiple parts since there are no sub-headings
    expect(chunks.length).toBeGreaterThan(1);
    // All parts share the same base slug with part suffixes
    expect(chunks[0]?.chunk_id).toBe("docs/huge.md#huge-section");
    expect(chunks[1]?.chunk_id).toBe("docs/huge.md#huge-section-part-2");
  });

  it("applies DEFAULT_MAX_CHUNK_SIZE when no explicit max_chunk_size is set", () => {
    // Create a chunk that exceeds DEFAULT_MAX_CHUNK_SIZE
    const markdown = [
      "## Giant",
      bigBody(DEFAULT_MAX_CHUNK_SIZE + 1000),
      "",
      "## Small",
      "tiny"
    ].join("\n");

    const chunks = buildChunks({
      filepath: "docs/giant.md",
      markdown,
      strategy: { chunk_by: "h2" }
    });

    // The giant section should be split even without explicit max_chunk_size
    const giantChunks = chunks.filter((c) => c.chunk_id.startsWith("docs/giant.md#giant"));
    expect(giantChunks.length).toBeGreaterThan(1);
  });

  it("preserves preamble content within a refined section", () => {
    // Small preamble text sits before the first h3 inside an oversized h2.
    const markdown = [
      "## Parent",
      "This is the preamble before any h3.",
      "",
      "### Child One",
      bigBody(80),
      "",
      "### Child Two",
      bigBody(80)
    ].join("\n");

    const chunks = buildChunks({
      filepath: "docs/preamble-refine.md",
      markdown,
      strategy: { chunk_by: "h2", max_chunk_size: 100 }
    });

    // Preamble content should be preserved in a chunk with the parent heading
    const parentChunk = chunks.find((c) => c.chunk_id === "docs/preamble-refine.md#parent");
    expect(parentChunk).toBeDefined();
    expect(parentChunk?.content).toContain("preamble before any h3");

    // Sub-heading chunks should also exist
    const ids = chunks.map((c) => c.chunk_id);
    expect(ids).toContain("docs/preamble-refine.md#parent/child-one");
    expect(ids).toContain("docs/preamble-refine.md#parent/child-two");
  });

  it("deduplicates slugs within a refined section", () => {
    // Two identical h3 headings must yield distinct "-2"-suffixed slugs.
    const markdown = [
      "## Parent",
      "",
      "### Example",
      bigBody(80),
      "",
      "### Example",
      bigBody(80)
    ].join("\n");

    const chunks = buildChunks({
      filepath: "docs/dedup.md",
      markdown,
      strategy: { chunk_by: "h2", max_chunk_size: 100 }
    });

    const ids = chunks.map((c) => c.chunk_id);
    expect(ids).toContain("docs/dedup.md#parent/example");
    expect(ids).toContain("docs/dedup.md#parent/example-2");
  });
});
117282
});

0 commit comments

Comments
 (0)