|
8 | 8 | // == github repo: https://github.com/jparkerweb/semantic-chunking == |
9 | 9 | // ================================================================== |
10 | 10 |
|
11 | | -import { splitBySentence } from "string-segmenter" |
| 11 | +import sentencize from '@stdlib/nlp-sentencize'; |
12 | 12 | import { DEFAULT_CONFIG } from './config.js'; |
13 | 13 | import { initializeEmbeddingUtils, tokenizer, createEmbedding } from './embeddingUtils.js'; |
14 | 14 | import { computeAdvancedSimilarities, adjustThreshold } from './similarityUtils.js'; |
@@ -77,10 +77,7 @@ export async function chunkit( |
77 | 77 | doc.document_text = normalizedText; |
78 | 78 |
|
79 | 79 | // Split the text into sentences |
80 | | - const sentences = []; |
81 | | - for (const { segment } of splitBySentence(doc.document_text)) { |
82 | | - sentences.push(segment.trim()); |
83 | | - } |
| 80 | + const sentences = sentencize(doc.document_text); |
84 | 81 |
|
85 | 82 | // Compute similarities and create chunks |
86 | 83 | const { similarities, average, variance } = await computeAdvancedSimilarities( |
@@ -220,10 +217,7 @@ export async function cramit( |
220 | 217 | } |
221 | 218 |
|
222 | 219 | // Split the text into sentences |
223 | | - const sentences = []; |
224 | | - for (const { segment } of splitBySentence(doc.document_text)) { |
225 | | - sentences.push(segment.trim()); |
226 | | - } |
| 220 | + const sentences = sentencize(doc.document_text); |
227 | 221 |
|
228 | 222 | // Create chunks without considering similarities |
229 | 223 | const chunks = createChunks(sentences, null, maxTokenSize, 0, logging); |
@@ -331,12 +325,7 @@ export async function sentenceit( |
331 | 325 | } |
332 | 326 |
|
333 | 327 | // Split the text into sentences |
334 | | - const chunks = []; |
335 | | - for (const { segment } of splitBySentence(doc.document_text)) { |
336 | | - if (segment.trim().length > 0) { |
337 | | - chunks.push(segment.trim()); |
338 | | - } |
339 | | - } |
| 328 | + const chunks = sentencize(doc.document_text); |
340 | 329 |
|
341 | 330 | if (logging) { |
342 | 331 | console.log('\nSENTENCEIT'); |
|
0 commit comments