
Commit 000401a

Refactor sentence splitting and replace Map() with lru-cache for embedding cache
1 parent 77b4a07 · commit 000401a

File tree

4 files changed: +516 −36 lines changed

chunkit.js

Lines changed: 4 additions & 15 deletions
@@ -8,7 +8,7 @@
 // == github repo: https://github.com/jparkerweb/semantic-chunking ==
 // ==================================================================

-import { splitBySentence } from "string-segmenter"
+import sentencize from '@stdlib/nlp-sentencize';
 import { DEFAULT_CONFIG } from './config.js';
 import { initializeEmbeddingUtils, tokenizer, createEmbedding } from './embeddingUtils.js';
 import { computeAdvancedSimilarities, adjustThreshold } from './similarityUtils.js';
@@ -77,10 +77,7 @@ export async function chunkit(
     doc.document_text = normalizedText;

     // Split the text into sentences
-    const sentences = [];
-    for (const { segment } of splitBySentence(doc.document_text)) {
-        sentences.push(segment.trim());
-    }
+    const sentences = sentencize(doc.document_text);

     // Compute similarities and create chunks
     const { similarities, average, variance } = await computeAdvancedSimilarities(
@@ -220,10 +217,7 @@ export async function cramit(
     }

     // Split the text into sentences
-    const sentences = [];
-    for (const { segment } of splitBySentence(doc.document_text)) {
-        sentences.push(segment.trim());
-    }
+    const sentences = sentencize(doc.document_text);

     // Create chunks without considering similarities
     const chunks = createChunks(sentences, null, maxTokenSize, 0, logging);
@@ -331,12 +325,7 @@ export async function sentenceit(
     }

     // Split the text into sentences
-    const chunks = [];
-    for (const { segment } of splitBySentence(doc.document_text)) {
-        if (segment.trim().length > 0) {
-            chunks.push(segment.trim());
-        }
-    }
+    const chunks = sentencize(doc.document_text);

     if (logging) {
         console.log('\nSENTENCEIT');
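
The refactor works because @stdlib/nlp-sentencize exposes a single function that takes a string and returns an array of sentence strings, replacing the manual loop over string-segmenter's iterator. A minimal sketch of the call (the sample text is illustrative, not taken from the repo):

    import sentencize from '@stdlib/nlp-sentencize';

    // sentencize() returns an array of sentences in document order.
    const sentences = sentencize('Semantic chunking groups related sentences. It keeps chunks coherent.');
    // e.g. [ 'Semantic chunking groups related sentences.', 'It keeps chunks coherent.' ]

One behavioral note: the removed loops also trimmed each segment (and, in sentenceit, dropped empty ones); the new single call relies on sentencize's own output, so that extra normalization is assumed to be handled by the splitter or no longer needed downstream.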

embeddingUtils.js

Lines changed: 12 additions & 3 deletions
@@ -1,8 +1,16 @@
 import { env, pipeline, AutoTokenizer } from '@huggingface/transformers';
+import { LRUCache } from 'lru-cache';

 let tokenizer;
 let generateEmbedding;
-const embeddingCache = new Map();
+const embeddingCache = new LRUCache({
+    max: 500,
+    maxSize: 50_000_000,
+    sizeCalculation: (value, key) => {
+        return (value.length * 4) + key.length;
+    },
+    ttl: 1000 * 60 * 60,
+});

 // --------------------------------------------
 // -- Initialize embedding model and tokenizer --
@@ -35,8 +43,9 @@ export async function initializeEmbeddingUtils(
 // -- Function to generate embeddings --
 // -------------------------------------
 export async function createEmbedding(text) {
-    if (embeddingCache.has(text)) {
-        return embeddingCache.get(text);
+    const cached = embeddingCache.get(text);
+    if (cached) {
+        return cached;
     }

     const embeddings = await generateEmbedding(text, { pooling: 'mean', normalize: true });
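
Swapping the bare Map for lru-cache bounds the embedding cache in three ways: at most 500 entries (max), a rough memory budget of about 50 MB (maxSize, with sizeCalculation estimating 4 bytes per embedding element plus the key's character count), and a one-hour ttl per entry. The lookup also changes from has()/get() to a single get(); any falsy value counts as a miss, which is equivalent here because embeddings are non-empty arrays, and it avoids a double lookup. A standalone sketch of the cache's behavior (the 384-element vector is a placeholder dimension, not taken from the repo):

    import { LRUCache } from 'lru-cache';

    const cache = new LRUCache({
        max: 500,                 // cap the number of cached embeddings
        maxSize: 50_000_000,      // total "size" budget, per sizeCalculation below
        sizeCalculation: (value, key) => (value.length * 4) + key.length, // ~bytes: 4 per element + key chars
        ttl: 1000 * 60 * 60,      // entries expire after one hour
    });

    cache.set('some input text', new Float32Array(384)); // placeholder embedding
    console.log(cache.get('some input text')?.length);   // 384
    console.log(cache.get('not cached'));                // undefined -> treated as a miss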
