@@ -148,89 +148,89 @@ export function PDFProvider({ children }: { children: ReactNode }) {
148148 }
149149 } , [ ] ) ;
150150
/**
 * Load-success handler for the PDF document: logs the reported page count and
 * stores it through `setCurrDocPages`.
 *
 * `setCurrDocPages` is listed as a dependency, matching how `clearCurrDoc`
 * declares it, so the memoized callback never holds on to an outdated setter.
 *
 * @param numPages - Number of pages reported for the loaded document.
 */
const onDocumentLoadSuccess = useCallback(({ numPages }: { numPages: number }): void => {
  console.log('Document loaded:', numPages);
  setCurrDocPages(numPages);
}, [setCurrDocPages]);
155155
/**
 * Extracts the plain text of a single page from a PDF supplied as a base64 data URL.
 *
 * Text items reported by PDF.js are grouped into visual lines by their vertical
 * offset, ordered left to right within each line, and joined with a space
 * wherever the horizontal gap between neighbouring items suggests a word break.
 *
 * @param pdfURL - Data URL of the PDF; the segment following the first comma is decoded as base64.
 * @param currDocPage - Page number handed to `pdf.getPage` (PDF.js numbers pages from 1).
 * @returns The page text with every whitespace run collapsed to one space and trimmed.
 * @throws Error with the message 'Failed to extract text from PDF' on any failure;
 *         the underlying error is logged to the console first.
 */
const extractTextFromPDF = useCallback(async (pdfURL: string, currDocPage: number): Promise<string> => {
  // Declared outside the try block so the finally clause can release it.
  let loadingTask: ReturnType<typeof pdfjs.getDocument> | undefined;

  try {
    const base64Data = pdfURL.split(',')[1];
    if (!base64Data) {
      // Without this guard atob() fails with an opaque decoding error.
      throw new Error('PDF URL does not contain a base64 payload');
    }

    // atob() yields one character per byte, so each char code is a byte value.
    const binaryData = atob(base64Data);
    const bytes = Uint8Array.from(binaryData, (char) => char.charCodeAt(0));

    loadingTask = pdfjs.getDocument({ data: bytes });
    const pdf = await loadingTask.promise;

    // Get only the specified page
    const page = await pdf.getPage(currDocPage);
    const textContent = await page.getTextContent();

    // Keep only entries that carry both a string and a transform matrix.
    const textItems = textContent.items.filter(
      (item): item is TextItem => 'str' in item && 'transform' in item,
    );

    // Group consecutive items into lines. An item stays on the current line
    // while its y offset (transform[5]) is within `tolerance` of the y offset
    // of the item that opened the line.
    const tolerance = 2;
    const lines: TextItem[][] = [];
    let currentLine: TextItem[] = [];
    let currentY: number | null = null;

    for (const item of textItems) {
      const y = item.transform[5];
      if (currentY === null) {
        currentY = y;
        currentLine.push(item);
      } else if (Math.abs(y - currentY) < tolerance) {
        currentLine.push(item);
      } else {
        lines.push(currentLine);
        currentLine = [item];
        currentY = y;
      }
    }
    lines.push(currentLine);

    // Build the page text line by line.
    let pageText = '';
    for (const line of lines) {
      // Order items left to right by their x offset (transform[4]).
      line.sort((a, b) => a.transform[4] - b.transform[4]);

      let lineText = '';
      let prevItem: TextItem | null = null;

      for (const item of line) {
        if (!prevItem) {
          lineText = item.str;
        } else {
          const prevEndX = prevItem.transform[4] + (prevItem.width ?? 0);
          const gap = item.transform[4] - prevEndX;

          // A gap wider than 30% of the current item's width is treated as a
          // word break; anything smaller is concatenated directly.
          lineText += gap > (item.width ?? 0) * 0.3 ? ' ' + item.str : item.str;
        }
        prevItem = item;
      }
      pageText += lineText + ' ';
    }

    return pageText.replace(/\s+/g, ' ').trim();
  } catch (error) {
    console.error('Error extracting text from PDF:', error);
    throw new Error('Failed to extract text from PDF');
  } finally {
    // Each call parses the document afresh, so release the PDF.js resources
    // here; otherwise they accumulate across page changes.
    try {
      await loadingTask?.destroy();
    } catch (cleanupError) {
      console.warn('Failed to release PDF resources:', cleanupError);
    }
  }
}, []);
234234
235235 // Load curr doc text
236236 const loadCurrDocText = useCallback ( async ( ) => {
@@ -267,18 +267,18 @@ export function PDFProvider({ children }: { children: ReactNode }) {
267267 console . error ( 'Failed to get document URL:' , error ) ;
268268 setError ( 'Failed to retrieve the document. Please try again.' ) ;
269269 }
270- } , [ getDocument , loadCurrDocText ] ) ;
270+ } , [ getDocument ] ) ;
271271
/**
 * Resets everything tracked for the currently open document: its name, URL and
 * extracted text, followed by the page count and the TTS text.
 */
const clearCurrDoc = useCallback(() => {
  // Document state
  setCurrDocName(undefined);
  setCurrDocURL(undefined);
  setCurrDocText(undefined);

  // TTS-related state (the original notes that the page count goes to the TTS context)
  setCurrDocPages(undefined);
  setTTSText('');
}, [setCurrDocPages, setTTSText]);
282282
283283 // Clear all highlights in the PDF viewer
284284 const clearHighlights = useCallback ( ( ) => {
0 commit comments