@@ -35,6 +35,13 @@ const CHUNK_LEVEL_MAP: Record<Exclude<ChunkingStrategy["chunk_by"], "file">, num
3535 h3 : 3
3636} ;
3737
38+ /**
39+ * Default maximum chunk size in characters (~6,700 tokens at ~3 chars/token),
40+ * well under OpenAI's 8,191-token embedding limit. Applied when no explicit
41+ * `max_chunk_size` is configured.
42+ */
43+ export const DEFAULT_MAX_CHUNK_SIZE = 20_000 ;
44+
3845// ─── Public API ──────────────────────────────────────────────────
3946
4047export function buildChunks ( input : BuildChunksInput ) : Chunk [ ] {
@@ -240,28 +247,23 @@ function slugify(value: string): string {
240247// ─── AST-safe size rules ─────────────────────────────────────────
241248
242249function applySizeRules ( segments : Segment [ ] , strategy : ChunkingStrategy ) : Segment [ ] {
243- const max = strategy . max_chunk_size ;
250+ const max = strategy . max_chunk_size ?? DEFAULT_MAX_CHUNK_SIZE ;
244251 const min = strategy . min_chunk_size ;
245252
246- // Phase 1: split oversized segments using AST node boundaries
253+ // Phase 1: split oversized segments — try recursive heading refinement first,
254+ // then fall back to AST node boundary splitting.
247255 const expanded : Segment [ ] = [ ] ;
248256
249257 for ( const segment of segments ) {
250258 const contentLength = rawMarkdown ( segment . nodes , segment . fullMarkdown ) . length ;
251259
252- if ( ! max || contentLength <= max ) {
260+ if ( contentLength <= max ) {
253261 expanded . push ( segment ) ;
254262 continue ;
255263 }
256264
257- const nodeGroups = splitByNodeSize ( segment . nodes , segment . fullMarkdown , max ) ;
258- nodeGroups . forEach ( ( groupNodes , partIndex ) => {
259- expanded . push ( {
260- ...segment ,
261- nodes : groupNodes ,
262- part : partIndex + 1
263- } ) ;
264- } ) ;
265+ const refined = refineOversizedSegment ( segment , max ) ;
266+ expanded . push ( ...refined ) ;
265267 }
266268
267269 // Phase 2: merge undersized segments into previous (Opus-style breadcrumb check)
@@ -289,6 +291,107 @@ function applySizeRules(segments: Segment[], strategy: ChunkingStrategy): Segmen
289291 return merged ;
290292}
291293
294+ /**
295+ * Recursively refine an oversized segment by splitting at progressively finer
296+ * heading levels (headingLevel+1, +2, ... up to h6). Falls back to AST node
297+ * boundary splitting when no sub-headings exist.
298+ */
299+ function refineOversizedSegment ( segment : Segment , max : number ) : Segment [ ] {
300+ const nextLevel = segment . headingLevel + 1 ;
301+ if ( nextLevel > 6 ) {
302+ return splitByNodeSizeSegments ( segment , max ) ;
303+ }
304+
305+ // Find sub-heading boundaries at nextLevel within this segment's nodes
306+ const subBoundaries : Array < { nodeIndex : number ; heading : string ; slug : string } > = [ ] ;
307+ const slugCounts = new Map < string , number > ( ) ;
308+
309+ for ( let i = 0 ; i < segment . nodes . length ; i += 1 ) {
310+ const node = segment . nodes [ i ] ! ;
311+ if ( node . type === "heading" && node . depth === nextLevel ) {
312+ const heading = toString ( node ) . trim ( ) || "section" ;
313+ const baseSlug = slugify ( heading ) || "section" ;
314+ const count = ( slugCounts . get ( baseSlug ) ?? 0 ) + 1 ;
315+ slugCounts . set ( baseSlug , count ) ;
316+ const slug = count === 1 ? baseSlug : `${ baseSlug } -${ count } ` ;
317+ subBoundaries . push ( { nodeIndex : i , heading, slug } ) ;
318+ }
319+ }
320+
321+ if ( subBoundaries . length === 0 ) {
322+ // No sub-headings at this level — recurse to the next deeper heading level
323+ const deeper : Segment = { ...segment , headingLevel : nextLevel } ;
324+ return refineOversizedSegment ( deeper , max ) ;
325+ }
326+
327+ const subSegments : Segment [ ] = [ ] ;
328+
329+ // Preamble: nodes before the first sub-heading (inherits parent heading)
330+ if ( subBoundaries [ 0 ] ! . nodeIndex > 0 ) {
331+ const preambleNodes = segment . nodes . slice ( 0 , subBoundaries [ 0 ] ! . nodeIndex ) ;
332+ const preambleContent = rawMarkdown ( preambleNodes , segment . fullMarkdown ) ;
333+ if ( preambleContent . trim ( ) ) {
334+ subSegments . push ( {
335+ ...segment ,
336+ nodes : preambleNodes ,
337+ part : 1
338+ } ) ;
339+ }
340+ }
341+
342+ // Create sub-segments for each sub-heading
343+ for ( let i = 0 ; i < subBoundaries . length ; i += 1 ) {
344+ const boundary = subBoundaries [ i ] ! ;
345+ const next = subBoundaries [ i + 1 ] ;
346+ const startIdx = boundary . nodeIndex ;
347+ const endIdx = next ? next . nodeIndex : segment . nodes . length ;
348+ const sectionNodes = segment . nodes . slice ( startIdx , endIdx ) ;
349+
350+ const content = rawMarkdown ( sectionNodes , segment . fullMarkdown ) ;
351+ if ( ! content . trim ( ) ) {
352+ continue ;
353+ }
354+
355+ subSegments . push ( {
356+ kind : "heading" ,
357+ heading : boundary . heading ,
358+ headingLevel : nextLevel ,
359+ ancestorTexts : [ ...segment . ancestorTexts , ...( segment . heading ? [ segment . heading ] : [ ] ) ] ,
360+ ancestorSlugs : [ ...segment . ancestorSlugs , ...( segment . slug ? [ segment . slug ] : [ ] ) ] ,
361+ slug : boundary . slug ,
362+ nodes : sectionNodes ,
363+ fullMarkdown : segment . fullMarkdown ,
364+ part : 1
365+ } ) ;
366+ }
367+
368+ // Recursively refine any sub-segments that are still oversized
369+ const result : Segment [ ] = [ ] ;
370+ for ( const sub of subSegments ) {
371+ const subLength = rawMarkdown ( sub . nodes , sub . fullMarkdown ) . length ;
372+ if ( subLength <= max ) {
373+ result . push ( sub ) ;
374+ } else {
375+ result . push ( ...refineOversizedSegment ( sub , max ) ) ;
376+ }
377+ }
378+
379+ return result ;
380+ }
381+
382+ /**
383+ * Fallback: split an oversized segment at AST node boundaries, producing
384+ * multi-part segments with the same heading metadata.
385+ */
386+ function splitByNodeSizeSegments ( segment : Segment , max : number ) : Segment [ ] {
387+ const nodeGroups = splitByNodeSize ( segment . nodes , segment . fullMarkdown , max ) ;
388+ return nodeGroups . map ( ( groupNodes , partIndex ) => ( {
389+ ...segment ,
390+ nodes : groupNodes ,
391+ part : partIndex + 1
392+ } ) ) ;
393+ }
394+
292395/**
293396 * AST-safe max-size splitting (from Gemini approach).
294397 *