-
Notifications
You must be signed in to change notification settings - Fork 33
Expand file tree
/
Copy pathsemantic-split.js
More file actions
107 lines (92 loc) · 2.94 KB
/
semantic-split.js
File metadata and controls
107 lines (92 loc) · 2.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
/**
 * Split text into chunks that aim to stay within `maxChunkLength`, preferring
 * natural boundaries: blank-line separated paragraphs first, then sentence
 * boundaries, and finally the comma/word fallback of `splitLongSentence` for
 * any single sentence that is still too long.
 *
 * Paragraphs that are empty or whitespace-only (for example those produced by
 * leading/trailing blank lines, or by an empty input) are skipped, so the
 * result never contains empty strings.
 *
 * @param {string} text - Text to split.
 * @param {number} [maxChunkLength=500] - Target maximum chunk length, measured
 *   with `String.prototype.length` (UTF-16 code units).
 * @returns {string[]} Trimmed, non-empty chunks in document order. A chunk can
 *   only exceed `maxChunkLength` if `splitLongSentence` returns a piece that
 *   is itself longer than the limit.
 */
export function splitTextSmart(text, maxChunkLength = 500) {
  // Zero-width boundary: right after ., ? or ! and right before whitespace
  // followed by a letter or quote. Splitting here keeps the punctuation on the
  // sentence it terminates.
  const sentenceBoundary = /(?<=[.?!])(?=\s+["“”'a-z])/gi;
  const finalChunks = [];

  for (const rawParagraph of text.split(/\n\s*\n/)) {
    // Trim before measuring so surrounding whitespace neither produces empty
    // chunks nor pushes a paragraph that fits into the sentence splitter.
    const paragraph = rawParagraph.trim();
    if (!paragraph) continue;

    if (paragraph.length <= maxChunkLength) {
      finalChunks.push(paragraph);
      continue;
    }

    // Greedily pack sentences (or fragments of over-long sentences) into the
    // current chunk, joining with a single space.
    let chunk = '';
    for (const rawSentence of paragraph.split(sentenceBoundary)) {
      const sentence = rawSentence.trim();
      if (!sentence) continue;

      const pieces =
        sentence.length > maxChunkLength
          ? splitLongSentence(sentence, maxChunkLength)
          : [sentence];

      for (const piece of pieces) {
        if (!piece) continue;
        const candidate = chunk ? `${chunk} ${piece}` : piece;
        if (candidate.length > maxChunkLength) {
          if (chunk) finalChunks.push(chunk);
          chunk = piece;
        } else {
          chunk = candidate;
        }
      }
    }
    if (chunk) finalChunks.push(chunk);
  }

  return finalChunks;
}
/**
 * Break a single over-long sentence into pieces no longer than `maxLen`.
 *
 * Strategy, from coarsest to finest:
 *   1. Split on commas and greedily re-join neighbouring parts with ", ".
 *   2. A comma part that is itself too long is split on whitespace and its
 *      words are greedily re-joined with single spaces.
 *   3. A single word that is still too long (e.g. a URL or unspaced text) is
 *      cut into fixed-size fragments so the length bound is actually honoured.
 *
 * The comma that falls exactly on a chunk boundary is not kept.
 *
 * @param {string} sentence - Sentence to split.
 * @param {number} maxLen - Maximum piece length, measured with
 *   `String.prototype.length` (UTF-16 code units).
 * @returns {string[]} Trimmed, non-empty pieces in order. Empty input yields [].
 */
export function splitLongSentence(sentence, maxLen) {
  const chunks = [];
  let current = '';

  for (const part of sentence.split(/,\s*/)) {
    const candidate = current ? `${current}, ${part}` : part;
    if (candidate.length > maxLen) {
      if (current) chunks.push(current.trim());

      if (part.length > maxLen) {
        // The comma part alone does not fit: fall back to word packing.
        let wordChunk = '';
        for (const word of part.split(/\s+/)) {
          if (!word) continue;
          // Only hard-split for a sane positive limit; otherwise keep the
          // word whole rather than exploding it into single characters.
          const fragments =
            maxLen >= 1 && word.length > maxLen
              ? hardSplitWord(word, maxLen)
              : [word];

          for (const fragment of fragments) {
            const joined = wordChunk ? `${wordChunk} ${fragment}` : fragment;
            if (joined.length > maxLen) {
              if (wordChunk) chunks.push(wordChunk.trim());
              wordChunk = fragment;
            } else {
              wordChunk = joined;
            }
          }
        }
        if (wordChunk) chunks.push(wordChunk.trim());
        current = '';
      } else {
        current = part;
      }
    } else {
      current = candidate;
    }
  }

  if (current) chunks.push(current.trim());
  return chunks;
}

/**
 * Cut a whitespace-free token into consecutive fragments of at most `maxLen`
 * UTF-16 code units. Iterates by code point so a surrogate pair is never split
 * across fragments; a single code point wider than `maxLen` is kept intact.
 *
 * @param {string} word - Token to cut.
 * @param {number} maxLen - Maximum fragment length (expected to be >= 1).
 * @returns {string[]} Non-empty fragments whose concatenation equals `word`.
 */
function hardSplitWord(word, maxLen) {
  const fragments = [];
  let fragment = '';
  for (const codePoint of word) {
    if (fragment && fragment.length + codePoint.length > maxLen) {
      fragments.push(fragment);
      fragment = '';
    }
    fragment += codePoint;
  }
  if (fragment) fragments.push(fragment);
  return fragments;
}
/**
 * Legacy splitter: paragraphs (blank-line separated) that fit within
 * `maxChunkLength` are emitted trimmed as-is; longer paragraphs are split at
 * sentence boundaries and the sentences are greedily packed into chunks.
 * A single sentence longer than the limit is emitted whole.
 *
 * @param {string} text - Text to split.
 * @param {number} [maxChunkLength=500] - Target maximum chunk length.
 * @returns {string[]} Chunks in document order.
 */
function splitTextSmartOld(text, maxChunkLength = 500) {
  // Zero-width split point after ., ? or ! when whitespace plus a letter/quote follows.
  const boundary = /(?<=[.?!])(?=\s+["“”'a-z])/gi;
  const result = [];

  text.split(/\n\s*\n/).forEach((paragraph) => {
    // Short paragraph: keep it as a single chunk.
    if (paragraph.length <= maxChunkLength) {
      result.push(paragraph.trim());
      return;
    }

    // Long paragraph: accumulate sentences until the next one would overflow.
    let pending = '';
    paragraph
      .split(boundary)
      .map((piece) => piece.trim())
      .forEach((sentence) => {
        const joined = `${pending} ${sentence}`;
        if (joined.length > maxChunkLength) {
          if (pending) result.push(pending.trim());
          pending = sentence;
          return;
        }
        pending = pending ? joined : sentence;
      });

    if (pending) result.push(pending.trim());
  });

  return result;
}