1+ /**
2+ * Natural Language Processing Utilities
3+ *
4+ * This module provides consistent sentence processing functionality across the application.
5+ * It handles text preprocessing, sentence splitting, and block creation for optimal TTS processing.
6+ */
7+
8+ import nlp from 'compromise' ;
9+
/**
 * Block-size threshold, measured in characters (string `.length`).
 * `splitIntoSentences` starts a new block when appending the next sentence
 * would push the current block past this value, and `processTextToSentences`
 * skips sentence splitting entirely for input no longer than this.
 * A single sentence longer than the threshold is still emitted as one block.
 */
10+ const MAX_BLOCK_LENGTH = 300 ;
11+
/**
 * Preprocesses text for audio generation by cleaning up various text artifacts.
 *
 * The steps run in this order:
 *  1. Each URL (starting with `http://`, `https://` or `www.`), together with
 *     any characters glued to its front, is replaced by the spoken placeholder
 *     `- (link to <host>) -`; the path and query are dropped. Sentence
 *     punctuation that trails the URL (`.,;:!?`) is kept after the placeholder
 *     so a following sentence is not merged into this one.
 *  2. Words hyphenated across whitespace ("exam- ple") are joined.
 *  3. Asterisks are removed.
 *  4. Runs of whitespace collapse to a single space and the result is trimmed.
 *
 * @param {string} text - The text to preprocess
 * @returns {string} The cleaned text
 */
export const preprocessSentenceForAudio = (text: string): string => {
  // Not global: only ever used to look at the very end of one string.
  const trailingPunctuation = /[.,;:!?]+$/;

  return text
    .replace(
      /\S*(?:https?:\/\/|www\.)([^\/\s]+)(?:\/\S*)?/gi,
      (match: string, host: string): string => {
        const spokenHost = host.replace(trailingPunctuation, '');
        if (!spokenHost) {
          // Host was nothing but punctuation; fall back to the plain placeholder.
          return `- (link to ${host}) -`;
        }
        // The greedy host/path groups swallow punctuation that ends the
        // sentence; hand it back so sentence boundaries survive.
        const tail = trailingPunctuation.exec(match);
        return `- (link to ${spokenHost}) -${tail ? tail[0] : ''}`;
      }
    )
    .replace(/(\w+)-\s+(\w+)/g, '$1$2') // Remove hyphenation
    // Remove special character *
    .replace(/\*/g, '')
    .replace(/\s+/g, ' ')
    .trim();
};
27+
/**
 * Breaks text into sentence blocks sized for TTS processing.
 *
 * The input is cut into paragraphs at newlines; blank paragraphs are ignored.
 * Each paragraph is cleaned with `preprocessSentenceForAudio`, split into
 * sentences, and the sentences are packed greedily (joined by one space) into
 * blocks. A block is closed as soon as adding the next sentence would take it
 * past `MAX_BLOCK_LENGTH`. Blocks never span two paragraphs, and a lone
 * sentence longer than the limit becomes a block of its own.
 *
 * @param {string} text - The text to split into sentences
 * @returns {string[]} Array of sentence blocks
 */
export const splitIntoSentences = (text: string): string[] => {
  const result: string[] = [];

  text.split(/\n+/).forEach((para) => {
    if (para.trim() === '') {
      return;
    }

    const sentences = nlp(preprocessSentenceForAudio(para))
      .sentences()
      .out('array') as string[];

    let pending = '';

    sentences.forEach((rawSentence) => {
      const sentence = rawSentence.trim();
      // +1 accounts for the joining space.
      const fits =
        !pending || pending.length + sentence.length + 1 <= MAX_BLOCK_LENGTH;

      if (fits) {
        pending = pending ? `${pending} ${sentence}` : sentence;
        return;
      }

      result.push(pending.trim());
      pending = sentence;
    });

    // Flush whatever is left for this paragraph.
    if (pending) {
      result.push(pending.trim());
    }
  });

  return result;
};
67+
/**
 * Main sentence processing function that handles both short and long texts.
 *
 * - Empty input yields an empty array.
 * - Input no longer than `MAX_BLOCK_LENGTH` is cleaned with
 *   `preprocessSentenceForAudio` and returned as a single block. If cleaning
 *   leaves nothing (e.g. the input was only whitespace or asterisks) an empty
 *   array is returned rather than a block containing an empty string.
 * - Longer input is delegated to `splitIntoSentences`.
 *
 * @param {string} text - The text to process
 * @returns {string[]} Array of processed sentences/blocks
 */
export const processTextToSentences = (text: string): string[] => {
  if (!text) {
    return [];
  }

  if (text.length > MAX_BLOCK_LENGTH) {
    // Full text splitting into sentences
    return splitIntoSentences(text);
  }

  // Single sentence preprocessing
  const cleanedText = preprocessSentenceForAudio(text);

  // Never hand an empty block to the caller.
  return cleanedText ? [cleanedText] : [];
};
88+
/**
 * Returns the sentences of `text` exactly as the sentence splitter reports
 * them: no cleaning is applied and no grouping into blocks takes place.
 * Intended for text matching and highlighting.
 *
 * @param {string} text - The text to extract sentences from
 * @returns {string[]} Array of raw sentences (empty for empty input)
 */
export const getRawSentences = (text: string): string[] =>
  text ? (nlp(text).sentences().out('array') as string[]) : [];
103+
/**
 * Enhanced sentence processing that returns both processed sentences and raw sentences.
 * This allows for better mapping between the two for click-to-highlight functionality.
 *
 * `sentenceMapping` has one entry per processed block, in order. Its
 * `rawIndices` lists every raw sentence whose cleaned form
 * (`preprocessSentenceForAudio`) was found, in order, inside that block. When
 * a block is only a fragment of one raw sentence, that raw sentence's index is
 * listed for the block. Raw sentences that clean to an empty string are never
 * mapped. A block for which nothing matches gets an empty `rawIndices`.
 *
 * Matching is textual and in-order: raw sentences are never revisited once a
 * later one has been matched.
 *
 * @param {string} text - The text to process
 * @returns {Object} Object containing processed sentences and raw sentences with mapping
 */
export const processTextWithMapping = (text: string): {
  processedSentences: string[];
  rawSentences: string[];
  sentenceMapping: Array<{ processedIndex: number; rawIndices: number[] }>;
} => {
  const rawSentences = getRawSentences(text);
  const processedSentences = processTextToSentences(text);

  // Clean each raw sentence once so it is comparable with the processed text.
  const cleanedRawSentences = rawSentences.map((sentence) =>
    preprocessSentenceForAudio(sentence)
  );

  // Cursor into the raw sentences; shared across blocks because both lists
  // follow the order of the source text.
  let rawIndex = 0;

  const sentenceMapping = processedSentences.map((processedSentence, processedIndex) => {
    const rawIndices: number[] = [];
    let remainingText = processedSentence;

    while (rawIndex < cleanedRawSentences.length && remainingText.length > 0) {
      const current = cleanedRawSentences[rawIndex];

      // The whole block is just a piece of the current raw sentence. Record it
      // but keep the cursor in place: the next block may be another piece.
      if (
        rawIndices.length === 0 &&
        current.length > remainingText.length &&
        current.includes(remainingText)
      ) {
        rawIndices.push(rawIndex);
        break;
      }

      // Look ahead for the next raw sentence contained in what is left of the
      // block. The cursor only moves if something is found, so an unmatched
      // block cannot exhaust the raw sentences for the blocks after it.
      let matchIndex = -1;
      let matchEnd = -1;
      for (let candidate = rawIndex; candidate < cleanedRawSentences.length; candidate++) {
        const cleaned = cleanedRawSentences[candidate];
        if (cleaned.length === 0) {
          continue; // cleans to nothing, so it cannot identify a position
        }
        const position = remainingText.indexOf(cleaned);
        if (position !== -1) {
          matchIndex = candidate;
          matchEnd = position + cleaned.length;
          break;
        }
      }

      if (matchIndex === -1) {
        break;
      }

      rawIndices.push(matchIndex);
      rawIndex = matchIndex + 1;
      // Consume the matched text so later sentences match further along.
      remainingText = remainingText.slice(matchEnd).trim();
    }

    return { processedIndex, rawIndices };
  });

  return {
    processedSentences,
    rawSentences,
    sentenceMapping
  };
};
0 commit comments