1
- import type { FileData } from './loader .ts'
1
+ import { TextData } from './textExtractor .ts'
2
2
3
3
export type Chunk = {
4
4
id : string
@@ -8,7 +8,7 @@ export type Chunk = {
8
8
}
9
9
}
10
10
11
- export const createTitleChunks = ( file : FileData ) : Chunk [ ] => {
11
+ export const createTitleChunks = ( file : TextData ) : Chunk [ ] => {
12
12
const lines = file . content . split ( '\n' )
13
13
14
14
const titleHierarchy = [ file . fileName ]
@@ -27,6 +27,7 @@ export const createTitleChunks = (file: FileData): Chunk[] => {
27
27
metadata : {
28
28
title,
29
29
titleHierarchy : [ ...titleHierarchy ] ,
30
+ type : file . type ,
30
31
} ,
31
32
} )
32
33
@@ -60,14 +61,15 @@ export const createTitleChunks = (file: FileData): Chunk[] => {
60
61
metadata : {
61
62
title,
62
63
titleHierarchy : [ ...titleHierarchy ] ,
64
+ type : file . type ,
63
65
} ,
64
66
} )
65
67
}
66
68
67
69
return chunks
68
70
}
69
71
70
- export const createSplittedTitleChunks = ( file : FileData ) : Chunk [ ] => {
72
+ export const createSplittedTitleChunks = ( file : TextData ) : Chunk [ ] => {
71
73
return createTitleChunks ( file ) . flatMap ( ( chunk ) => {
72
74
const title = chunk . metadata ?. title
73
75
const titleHierarchy = chunk . metadata ?. titleHierarchy
@@ -81,29 +83,36 @@ export const createSplittedTitleChunks = (file: FileData): Chunk[] => {
81
83
metadata : {
82
84
title : `${ title } - ${ index + 1 } ` ,
83
85
titleHierarchy : [ ...titleHierarchy , index + 1 ] ,
86
+ type : file . type ,
84
87
} ,
85
88
} ) )
86
89
} )
87
90
}
88
91
89
/**
 * Splits a file's text into fixed-stride, overlapping character windows.
 *
 * Windows start every `length` characters and span `length + 2 * overlap`
 * characters, so neighbouring windows share `2 * overlap` characters. Windows
 * are emitted until one reaches the end of the text, which means the final
 * window may be shorter than the others and that text shorter than a single
 * window still produces exactly one chunk. Empty text produces no chunks.
 *
 * @param file - Source text; `content` is windowed, `fileName` is used for the
 *   chunk ids and titles, and `type` is copied into each chunk's metadata.
 * @param length - Stride between window starts, in characters. Must be > 0.
 * @param overlap - Extra characters added on each side of the stride. Must be >= 0.
 * @returns One chunk per window, in document order, with each window's text
 *   split into lines.
 * @throws RangeError if `length` is not positive or `overlap` is negative
 *   (including `NaN` for either).
 */
export const createStaticChunks = (file: TextData, length: number = 800, overlap: number = 400): Chunk[] => {
  // A non-positive stride would never advance the loop below.
  if (!(length > 0)) {
    throw new RangeError(`length must be a positive number, got ${length}`)
  }
  if (!(overlap >= 0)) {
    throw new RangeError(`overlap must be a non-negative number, got ${overlap}`)
  }

  const content = file.content
  const windowSize = length + 2 * overlap

  const chunks: Chunk[] = []

  for (let start = 0; start < content.length; start += length) {
    const chunkContent = content.slice(start, start + windowSize)

    chunks.push({
      id: `${file.fileName}-${chunks.length}`,
      content: chunkContent.split('\n'),
      metadata: {
        title: file.fileName,
        type: file.type,
      },
    })

    // Once a window reaches the end of the text, any further window would only
    // repeat a suffix of this one.
    if (start + windowSize >= content.length) break
  }

  return chunks
}
113
+
114
/**
 * Lookup table from a chunking strategy name to the function implementing it.
 *
 * - `static`: {@link createStaticChunks}
 * - `title`: {@link createTitleChunks}
 * - `splittedTitle`: {@link createSplittedTitleChunks}
 */
export const chunkingAlgorithms = {
  static: createStaticChunks,
  title: createTitleChunks,
  splittedTitle: createSplittedTitleChunks,
}
0 commit comments