+import * as yaml from 'js-yaml'
+import { createLogger } from '@/lib/logs/console/logger'
+import { getAccurateTokenCount } from '@/lib/tokenization'
 import { estimateTokenCount } from '@/lib/tokenization/estimators'
 import type { Chunk, ChunkerOptions } from './types'
 
+const logger = createLogger('JsonYamlChunker')
+
 function getTokenCount(text: string): number {
-  const estimate = estimateTokenCount(text)
-  return estimate.count
+  try {
+    return getAccurateTokenCount(text, 'text-embedding-3-small')
+  } catch (error) {
+    logger.warn('Tiktoken failed, falling back to estimation')
+    const estimate = estimateTokenCount(text)
+    return estimate.count
+  }
 }
 
 /**
  * Configuration for JSON/YAML chunking
+ * Reduced limits to ensure we stay well under OpenAI's 8,191 token limit per embedding request
  */
 const JSON_YAML_CHUNKING_CONFIG = {
-  TARGET_CHUNK_SIZE: 2000, // Target tokens per chunk
+  TARGET_CHUNK_SIZE: 1000, // Target tokens per chunk
   MIN_CHUNK_SIZE: 100, // Minimum tokens per chunk
-  MAX_CHUNK_SIZE: 3000, // Maximum tokens per chunk
+  MAX_CHUNK_SIZE: 1500, // Maximum tokens per chunk
   MAX_DEPTH_FOR_SPLITTING: 5, // Maximum depth to traverse for splitting
 }
 
@@ -34,7 +45,6 @@ export class JsonYamlChunker {
       return true
     } catch {
       try {
-        const yaml = require('js-yaml')
         yaml.load(content)
         return true
       } catch {
@@ -48,9 +58,26 @@ export class JsonYamlChunker {
    */
   async chunk(content: string): Promise<Chunk[]> {
     try {
-      const data = JSON.parse(content)
-      return this.chunkStructuredData(data)
+      let data: any
+      try {
+        data = JSON.parse(content)
+      } catch {
+        data = yaml.load(content)
+      }
+      const chunks = this.chunkStructuredData(data)
+
+      const tokenCounts = chunks.map((c) => c.tokenCount)
+      const totalTokens = tokenCounts.reduce((a, b) => a + b, 0)
+      const maxTokens = Math.max(...tokenCounts)
+      const avgTokens = Math.round(totalTokens / chunks.length)
+
+      logger.info(
+        `JSON/YAML chunking complete: ${chunks.length} chunks, ${totalTokens} total tokens (avg: ${avgTokens}, max: ${maxTokens})`
+      )
+
+      return chunks
     } catch (error) {
+      logger.info('JSON/YAML parsing failed, falling back to text chunking')
       return this.chunkAsText(content)
     }
   }
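A minimal usage sketch of the new parse fallback, assuming only what this diff shows (an exported JsonYamlChunker constructed from ChunkerOptions, whose chunk() resolves to Chunk[] carrying text and tokenCount); the module path and sample inputs are hypothetical:

import { JsonYamlChunker } from './json-yaml-chunker' // hypothetical path to this file

async function demo(): Promise<void> {
  const chunker = new JsonYamlChunker({})

  // Valid JSON goes through JSON.parse.
  const fromJson = await chunker.chunk('{"users": [{"id": 1}, {"id": 2}]}')

  // Invalid JSON that is valid YAML falls through to yaml.load.
  const fromYaml = await chunker.chunk('users:\n  - id: 1\n  - id: 2\n')

  for (const chunk of [...fromJson, ...fromYaml]) {
    console.log(chunk.tokenCount, chunk.text.slice(0, 40))
  }
}

demo().catch(console.error)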
@@ -102,7 +129,6 @@ export class JsonYamlChunker {
       const itemTokens = getTokenCount(itemStr)
 
       if (itemTokens > this.chunkSize) {
-        // Save current batch if it has items
         if (currentBatch.length > 0) {
           const batchContent = contextHeader + JSON.stringify(currentBatch, null, 2)
           chunks.push({
@@ -134,7 +160,7 @@ export class JsonYamlChunker {
         const batchContent = contextHeader + JSON.stringify(currentBatch, null, 2)
         chunks.push({
           text: batchContent,
-          tokenCount: currentTokens,
+          tokenCount: getTokenCount(batchContent),
           metadata: {
             startIndex: i - currentBatch.length,
             endIndex: i - 1,
@@ -152,7 +178,7 @@ export class JsonYamlChunker {
       const batchContent = contextHeader + JSON.stringify(currentBatch, null, 2)
       chunks.push({
         text: batchContent,
-        tokenCount: currentTokens,
+        tokenCount: getTokenCount(batchContent),
         metadata: {
           startIndex: arr.length - currentBatch.length,
           endIndex: arr.length - 1,
@@ -194,12 +220,11 @@ export class JsonYamlChunker {
       const valueTokens = getTokenCount(valueStr)
 
       if (valueTokens > this.chunkSize) {
-        // Save current object if it has properties
         if (Object.keys(currentObj).length > 0) {
           const objContent = JSON.stringify(currentObj, null, 2)
           chunks.push({
             text: objContent,
-            tokenCount: currentTokens,
+            tokenCount: getTokenCount(objContent),
             metadata: {
               startIndex: 0,
               endIndex: objContent.length,
@@ -230,7 +255,7 @@ export class JsonYamlChunker {
         const objContent = JSON.stringify(currentObj, null, 2)
         chunks.push({
           text: objContent,
-          tokenCount: currentTokens,
+          tokenCount: getTokenCount(objContent),
           metadata: {
             startIndex: 0,
             endIndex: objContent.length,
@@ -250,7 +275,7 @@ export class JsonYamlChunker {
       const objContent = JSON.stringify(currentObj, null, 2)
       chunks.push({
         text: objContent,
-        tokenCount: currentTokens,
+        tokenCount: getTokenCount(objContent),
         metadata: {
           startIndex: 0,
           endIndex: objContent.length,
@@ -262,7 +287,7 @@ export class JsonYamlChunker {
   }
 
   /**
-   * Fall back to text chunking if JSON parsing fails.
+   * Fall back to text chunking if JSON parsing fails
    */
   private async chunkAsText(content: string): Promise<Chunk[]> {
     const chunks: Chunk[] = []
@@ -308,7 +333,7 @@ export class JsonYamlChunker {
   }
 
   /**
-   * Static method for chunking JSON/YAML data with default options.
+   * Static method for chunking JSON/YAML data with default options
    */
   static async chunkJsonYaml(content: string, options: ChunkerOptions = {}): Promise<Chunk[]> {
     const chunker = new JsonYamlChunker(options)
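And a sketch of the token-budget rationale behind the reduced limits, again assuming the hypothetical module path above: with MAX_CHUNK_SIZE at 1,500 tokens, every chunk should sit far below the 8,191-token embedding request limit called out in the config comment, which a caller can assert against the reported tokenCount:

import { JsonYamlChunker } from './json-yaml-chunker' // hypothetical path to this file

const EMBEDDING_TOKEN_LIMIT = 8191 // limit referenced in the config comment above

async function chunkAndVerify(content: string): Promise<Chunk[]> {
  // Default options, so sizing is assumed to come from JSON_YAML_CHUNKING_CONFIG.
  const chunks = await JsonYamlChunker.chunkJsonYaml(content)

  for (const chunk of chunks) {
    if (chunk.tokenCount >= EMBEDDING_TOKEN_LIMIT) {
      throw new Error(`Chunk of ${chunk.tokenCount} tokens exceeds the embedding limit`)
    }
  }
  return chunks
}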