Refactored the NLP logic into a single, shared function for splitting text into sentences.

thepycoder · thepycoder · commit f594ca836bc7 · 2025-06-20T08:25:29.000+02:00
Now both handleTextClick and the global TTS sentence list are built by the exact same function, so the sentenceIndex will always be correct.
diff --git a/src/app/api/nlp/route.ts b/src/app/api/nlp/route.ts
@@ -1,51 +1,5 @@
 import { NextRequest, NextResponse } from 'next/server';
-import nlp from 'compromise';
-
-const MAX_BLOCK_LENGTH = 300;
-
-const preprocessSentenceForAudio = (text: string): string => {
-  return text
-    .replace(/\S*(?:https?:\/\/|www\.)([^\/\s]+)(?:\/\S*)?/gi, '- (link to $1) -')
-    .replace(/(\w+)-\s+(\w+)/g, '$1$2') // Remove hyphenation
-    // Remove special character *
-    .replace(/\*/g, '')
-    .replace(/\s+/g, ' ')
-    .trim();
-};
-
-const splitIntoSentences = (text: string): string[] => {
-  const paragraphs = text.split(/\n+/);
-  const blocks: string[] = [];
-
-  for (const paragraph of paragraphs) {
-    if (!paragraph.trim()) continue;
-
-    const cleanedText = preprocessSentenceForAudio(paragraph);
-    const doc = nlp(cleanedText);
-    const rawSentences = doc.sentences().out('array') as string[];
-    
-    let currentBlock = '';
-
-    for (const sentence of rawSentences) {
-      const trimmedSentence = sentence.trim();
-      
-      if (currentBlock && (currentBlock.length + trimmedSentence.length + 1) > MAX_BLOCK_LENGTH) {
-        blocks.push(currentBlock.trim());
-        currentBlock = trimmedSentence;
-      } else {
-        currentBlock = currentBlock 
-          ? `${currentBlock} ${trimmedSentence}`
-          : trimmedSentence;
-      }
-    }
-
-    if (currentBlock) {
-      blocks.push(currentBlock.trim());
-    }
-  }
-  
-  return blocks;
-};
+import { processTextToSentences } from '@/utils/nlp';
 
 export async function POST(req: NextRequest) {
   // First check if the request body is empty
@@ -104,14 +58,8 @@ export async function POST(req: NextRequest) {
       );
     }
 
-    if (text.length <= MAX_BLOCK_LENGTH) {
-      // Single sentence preprocessing
-      const cleanedText = preprocessSentenceForAudio(text);
-      return NextResponse.json({ sentences: [cleanedText] });
-    }
-
-    // Full text splitting into sentences
-    const sentences = splitIntoSentences(text);
+    // Use the shared utility function for consistent processing
+    const sentences = processTextToSentences(text);
     return NextResponse.json({ sentences });
   } catch (error) {
     console.error('Error processing text:', error);
diff --git a/src/contexts/TTSContext.tsx b/src/contexts/TTSContext.tsx
@@ -37,6 +37,7 @@ import { useAudioContext } from '@/hooks/audio/useAudioContext';
 import { getLastDocumentLocation } from '@/utils/indexedDB';
 import { useBackgroundState } from '@/hooks/audio/useBackgroundState';
 import { withRetry } from '@/utils/audio';
+import { processTextToSentences } from '@/utils/nlp';
 
 // Media globals
 declare global {
@@ -150,28 +151,18 @@ export function TTSProvider({ children }: { children: ReactNode }): ReactElement
   const activeAbortControllers = useRef<Set<AbortController>>(new Set());
 
   /**
-   * Processes text through the NLP API to split it into sentences
+   * Processes text into sentences using the shared NLP utility
    * 
    * @param {string} text - The text to be processed
    * @returns {Promise<string[]>} Array of processed sentences
    */
-  const processTextToSentences = useCallback(async (text: string): Promise<string[]> => {
+  const processTextToSentencesLocal = useCallback(async (text: string): Promise<string[]> => {
     if (text.length < 1) {
       return [];
     }
 
-    const response = await fetch('/api/nlp', {
-      method: 'POST',
-      headers: { 'Content-Type': 'application/json' },
-      body: JSON.stringify({ text }),
-    });
-
-    if (!response.ok) {
-      throw new Error('Failed to process text');
-    }
-
-    const { sentences } = await response.json();
-    return sentences;
+    // Use the shared utility directly instead of making an API call
+    return processTextToSentences(text);
   }, []);
 
   /**
@@ -308,7 +299,7 @@ export function TTSProvider({ children }: { children: ReactNode }): ReactElement
     setIsProcessing(true); // Set processing state before text processing starts
 
     console.log('Setting text:', text);
-    processTextToSentences(text)
+    processTextToSentencesLocal(text)
       .then(newSentences => {
         if (newSentences.length === 0) {
           console.warn('No sentences found in text');
@@ -337,7 +328,7 @@ export function TTSProvider({ children }: { children: ReactNode }): ReactElement
           duration: 3000,
         });
       });
-  }, [isPlaying, handleBlankSection, abortAudio, processTextToSentences]);
+  }, [isPlaying, handleBlankSection, abortAudio, processTextToSentencesLocal]);
 
   /**
    * Toggles the playback state between playing and paused
diff --git a/src/utils/nlp.ts b/src/utils/nlp.ts
@@ -0,0 +1,153 @@
+/**
+ * Natural Language Processing Utilities
+ * 
+ * This module provides consistent sentence processing functionality across the application.
+ * It handles text preprocessing, sentence splitting, and block creation for optimal TTS processing.
+ */
+
+import nlp from 'compromise';
+
+const MAX_BLOCK_LENGTH = 300;
+
+/**
+ * Preprocesses text for audio generation by cleaning up various text artifacts
+ * 
+ * @param {string} text - The text to preprocess
+ * @returns {string} The cleaned text
+ */
+export const preprocessSentenceForAudio = (text: string): string => {
+  return text
+    .replace(/\S*(?:https?:\/\/|www\.)([^\/\s]+)(?:\/\S*)?/gi, '- (link to $1) -')
+    .replace(/(\w+)-\s+(\w+)/g, '$1$2') // Remove hyphenation
+    // Remove special character *
+    .replace(/\*/g, '')
+    .replace(/\s+/g, ' ')
+    .trim();
+};
+
+/**
+ * Splits text into sentences and groups them into blocks suitable for TTS processing
+ * 
+ * @param {string} text - The text to split into sentences
+ * @returns {string[]} Array of sentence blocks
+ */
+export const splitIntoSentences = (text: string): string[] => {
+  const paragraphs = text.split(/\n+/);
+  const blocks: string[] = [];
+
+  for (const paragraph of paragraphs) {
+    if (!paragraph.trim()) continue;
+
+    const cleanedText = preprocessSentenceForAudio(paragraph);
+    const doc = nlp(cleanedText);
+    const rawSentences = doc.sentences().out('array') as string[];
+    
+    let currentBlock = '';
+
+    for (const sentence of rawSentences) {
+      const trimmedSentence = sentence.trim();
+      
+      if (currentBlock && (currentBlock.length + trimmedSentence.length + 1) > MAX_BLOCK_LENGTH) {
+        blocks.push(currentBlock.trim());
+        currentBlock = trimmedSentence;
+      } else {
+        currentBlock = currentBlock 
+          ? `${currentBlock} ${trimmedSentence}`
+          : trimmedSentence;
+      }
+    }
+
+    if (currentBlock) {
+      blocks.push(currentBlock.trim());
+    }
+  }
+  
+  return blocks;
+};
+
+/**
+ * Main sentence processing function that handles both short and long texts
+ * 
+ * @param {string} text - The text to process
+ * @returns {string[]} Array of processed sentences/blocks
+ */
+export const processTextToSentences = (text: string): string[] => {
+  if (!text || text.length < 1) {
+    return [];
+  }
+
+  if (text.length <= MAX_BLOCK_LENGTH) {
+    // Single sentence preprocessing
+    const cleanedText = preprocessSentenceForAudio(text);
+    return [cleanedText];
+  }
+
+  // Full text splitting into sentences
+  return splitIntoSentences(text);
+};
+
+/**
+ * Gets raw sentences from text without preprocessing or grouping
+ * This is useful for text matching and highlighting
+ * 
+ * @param {string} text - The text to extract sentences from
+ * @returns {string[]} Array of raw sentences
+ */
+export const getRawSentences = (text: string): string[] => {
+  if (!text || text.length < 1) {
+    return [];
+  }
+  
+  return nlp(text).sentences().out('array') as string[];
+};
+
+/**
+ * Enhanced sentence processing that returns both processed sentences and raw sentences
+ * This allows for better mapping between the two for click-to-highlight functionality
+ * 
+ * @param {string} text - The text to process
+ * @returns {Object} Object containing processed sentences and raw sentences with mapping
+ */
+export const processTextWithMapping = (text: string): {
+  processedSentences: string[];
+  rawSentences: string[];
+  sentenceMapping: Array<{ processedIndex: number; rawIndices: number[] }>;
+} => {
+  const rawSentences = getRawSentences(text);
+  const processedSentences = processTextToSentences(text);
+  
+  // Create a mapping between processed sentences and raw sentences
+  const sentenceMapping: Array<{ processedIndex: number; rawIndices: number[] }> = [];
+  
+  // For simple mapping, we'll track which raw sentences contributed to each processed sentence
+  let rawIndex = 0;
+  
+  for (let processedIndex = 0; processedIndex < processedSentences.length; processedIndex++) {
+    const processedSentence = processedSentences[processedIndex];
+    const rawIndices: number[] = [];
+    
+    // Find which raw sentences are contained in this processed sentence
+    let remainingText = processedSentence;
+    
+    while (rawIndex < rawSentences.length && remainingText.length > 0) {
+      const rawSentence = rawSentences[rawIndex];
+      const cleanedRawSentence = preprocessSentenceForAudio(rawSentence);
+      
+      if (remainingText.includes(cleanedRawSentence) || cleanedRawSentence.includes(remainingText)) {
+        rawIndices.push(rawIndex);
+        rawIndex++;
+        break;
+      } else {
+        rawIndex++;
+      }
+    }
+    
+    sentenceMapping.push({ processedIndex, rawIndices });
+  }
+  
+  return {
+    processedSentences,
+    rawSentences,
+    sentenceMapping
+  };
+}; 
diff --git a/src/utils/pdf.ts b/src/utils/pdf.ts
@@ -1,9 +1,9 @@
 import { pdfjs } from 'react-pdf';
-import nlp from 'compromise';
 import stringSimilarity from 'string-similarity';
 import type { TextItem } from 'pdfjs-dist/types/src/display/api';
 import type { PDFDocumentProxy } from 'pdfjs-dist';
 import "core-js/proposals/promise-with-resolvers";
+import { processTextToSentences } from '@/utils/nlp';
 
 // Function to detect if we need to use legacy build
 function shouldUseLegacyBuild() {
@@ -325,7 +325,9 @@ export function handleTextClick(
 
   if (bestMatch.rating >= similarityThreshold) {
     const matchText = bestMatch.text;
-    const sentences = nlp(pdfText).sentences().out('array') as string[];
+    // Use the same sentence processing logic as TTSContext for consistency
+    const sentences = processTextToSentences(pdfText);
+    console.log("sentences inside handleTextClick: %d", sentences.length)
     let bestSentenceMatch = { sentence: '', rating: 0 };
 
     for (const sentence of sentences) {