Skip to content

Commit 01ba8d5

Browse files
committed
Update sentence processing to use max sentence blocks of 250 chars
1 parent 039ac1f commit 01ba8d5

File tree

1 file changed

+20
-15
lines changed

1 file changed

+20
-15
lines changed

src/utils/nlp.ts

Lines changed: 20 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -15,31 +15,36 @@ export const preprocessSentenceForAudio = (text: string): string => {
1515
.trim();
1616
};
1717

18-
const isShortSentence = (sentence: string): boolean => {
19-
const words = sentence.trim().split(/\s+/);
20-
return words.length <= 5;
21-
};
18+
const MAX_BLOCK_LENGTH = 250; // Maximum characters per block
2219

2320
export const splitIntoSentences = (text: string): string[] => {
2421
// Preprocess the text before splitting into sentences
2522
const cleanedText = preprocessSentenceForAudio(text);
2623
const doc = nlp(cleanedText);
2724
const rawSentences = doc.sentences().out('array') as string[];
2825

29-
// Combine short sentences with previous ones
30-
const processedSentences: string[] = [];
31-
32-
for (let i = 0; i < rawSentences.length; i++) {
33-
const currentSentence = rawSentences[i].trim();
26+
const blocks: string[] = [];
27+
let currentBlock = '';
28+
29+
for (const sentence of rawSentences) {
30+
const trimmedSentence = sentence.trim();
3431

35-
if (isShortSentence(currentSentence) && processedSentences.length > 0) {
36-
// Combine with previous sentence
37-
const lastIndex = processedSentences.length - 1;
38-
processedSentences[lastIndex] = `${processedSentences[lastIndex]} ${currentSentence}`;
32+
// If adding this sentence would exceed the limit, start a new block
33+
if (currentBlock && (currentBlock.length + trimmedSentence.length + 1) > MAX_BLOCK_LENGTH) {
34+
blocks.push(currentBlock.trim());
35+
currentBlock = trimmedSentence;
3936
} else {
40-
processedSentences.push(currentSentence);
37+
// Add to current block with a space if not empty
38+
currentBlock = currentBlock
39+
? `${currentBlock} ${trimmedSentence}`
40+
: trimmedSentence;
4141
}
4242
}
43+
44+
// Add the last block if not empty
45+
if (currentBlock) {
46+
blocks.push(currentBlock.trim());
47+
}
4348

44-
return processedSentences;
49+
return blocks;
4550
};

0 commit comments

Comments
 (0)