Skip to content

Commit f948601

Browse files
committed
Add configurable text extraction margin for PDF processing
1 parent b87b833 commit f948601

File tree

4 files changed

+114
-16
lines changed

4 files changed

+114
-16
lines changed

src/components/DocumentSettings.tsx

Lines changed: 57 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
'use client';
22

3-
import { Fragment, useState, useRef } from 'react';
3+
import { Fragment, useState, useRef, useCallback, useEffect } from 'react';
44
import { Dialog, DialogPanel, Transition, TransitionChild, Listbox, ListboxButton, ListboxOptions, ListboxOption, Button } from '@headlessui/react';
55
import { useConfig, ViewType } from '@/contexts/ConfigContext';
66
import { ChevronUpDownIcon, CheckIcon } from '@/components/icons/Icons';
@@ -21,13 +21,33 @@ const viewTypes = [
2121
];
2222

2323
export function DocumentSettings({ isOpen, setIsOpen, epub }: DocViewSettingsProps) {
24-
const { viewType, skipBlank, epubTheme, updateConfigKey } = useConfig();
24+
const { viewType, skipBlank, epubTheme, textExtractionMargin, updateConfigKey } = useConfig();
2525
const { createFullAudioBook } = useEPUB();
2626
const [progress, setProgress] = useState(0);
2727
const [isGenerating, setIsGenerating] = useState(false);
28+
const [localMargin, setLocalMargin] = useState(textExtractionMargin);
2829
const abortControllerRef = useRef<AbortController | null>(null);
2930
const selectedView = viewTypes.find(v => v.id === viewType) || viewTypes[0];
3031

32+
//console.log(localMargin, textExtractionMargin);
33+
34+
// Sync local margin with global state
35+
useEffect(() => {
36+
setLocalMargin(textExtractionMargin);
37+
}, [textExtractionMargin]);
38+
39+
// Handler for slider change (updates local state only)
40+
const handleMarginChange = useCallback((event: React.ChangeEvent<HTMLInputElement>) => {
41+
setLocalMargin(Number(event.target.value));
42+
}, []);
43+
44+
// Handler for slider release (commits the local value to the global config only if it changed)
45+
const handleMarginChangeComplete = useCallback(() => {
46+
if (localMargin !== textExtractionMargin) {
47+
updateConfigKey('textExtractionMargin', localMargin);
48+
}
49+
}, [localMargin, textExtractionMargin, updateConfigKey]);
50+
3151
const handleStartGeneration = async () => {
3252
setIsGenerating(true);
3353
setProgress(0);
@@ -132,13 +152,38 @@ export function DocumentSettings({ isOpen, setIsOpen, epub }: DocViewSettingsPro
132152
</div>
133153
)}
134154
</div>}
135-
{!epub && <div className="space-y-2">
136-
<label className="block text-sm font-medium text-foreground">Mode</label>
155+
{!epub && <div className="space-y-6">
156+
<div className="mt-4 space-y-2">
157+
<label className="block text-sm font-medium text-foreground">
158+
Text Extraction Margin
159+
</label>
160+
<div className="flex justify-between">
161+
<span className="text-xs">0%</span>
162+
<span className="text-xs font-bold">{Math.round(localMargin * 100)}%</span>
163+
<span className="text-xs">20%</span>
164+
</div>
165+
<input
166+
type="range"
167+
min="0"
168+
max="0.2"
169+
step="0.01"
170+
value={localMargin}
171+
onChange={handleMarginChange}
172+
onMouseUp={handleMarginChangeComplete}
173+
onKeyUp={handleMarginChangeComplete}
174+
onTouchEnd={handleMarginChangeComplete}
175+
className="w-full bg-offbase rounded-lg appearance-none cursor-pointer accent-accent [&::-webkit-slider-runnable-track]:bg-offbase [&::-webkit-slider-runnable-track]:rounded-lg [&::-webkit-slider-thumb]:appearance-none [&::-webkit-slider-thumb]:h-4 [&::-webkit-slider-thumb]:w-4 [&::-webkit-slider-thumb]:rounded-full [&::-webkit-slider-thumb]:bg-accent [&::-moz-range-track]:bg-offbase [&::-moz-range-track]:rounded-lg [&::-moz-range-thumb]:appearance-none [&::-moz-range-thumb]:h-4 [&::-moz-range-thumb]:w-4 [&::-moz-range-thumb]:rounded-full [&::-moz-range-thumb]:bg-accent"
176+
/>
177+
<p className="text-xs text-muted">
178+
{"Don't"} include content from outer rim of the page during text extraction (experimental)
179+
</p>
180+
</div>
137181
<Listbox
138182
value={selectedView}
139183
onChange={(newView) => updateConfigKey('viewType', newView.id as ViewType)}
140184
>
141-
<div className="relative z-10">
185+
<div className="relative z-10 space-y-2">
186+
<label className="block text-sm font-medium text-foreground">Mode</label>
142187
<ListboxButton className="relative w-full cursor-pointer rounded-lg bg-background py-2 pl-3 pr-10 text-left text-foreground shadow-sm focus:outline-none focus:ring-2 focus:ring-accent transform transition-transform duration-200 ease-in-out hover:scale-[1.01] hover:text-accent">
143188
<span className="block truncate">{selectedView.name}</span>
144189
<span className="pointer-events-none absolute inset-y-0 right-0 flex items-center pr-2">
@@ -177,14 +222,16 @@ export function DocumentSettings({ isOpen, setIsOpen, epub }: DocViewSettingsPro
177222
))}
178223
</ListboxOptions>
179224
</Transition>
225+
{selectedView.id === 'scroll' && (
226+
<p className="text-sm text-warning pt-2">
227+
Note: Continuous scroll may perform poorly for larger documents.
228+
</p>
229+
)}
180230
</div>
181231
</Listbox>
182-
{selectedView.id === 'scroll' && (
183-
<p className="text-sm text-warning pt-2">
184-
Note: Continuous scroll may perform poorly for larger documents.
185-
</p>
186-
)}
232+
187233
</div>}
234+
188235
<div className="space-y-2">
189236
<label className="flex items-center space-x-2">
190237
<input

src/contexts/ConfigContext.tsx

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ type ConfigValues = {
1515
voice: string;
1616
skipBlank: boolean;
1717
epubTheme: boolean;
18+
textExtractionMargin: number;
1819
};
1920

2021
/** Interface defining the configuration context shape and functionality */
@@ -26,6 +27,7 @@ interface ConfigContextType {
2627
voice: string;
2728
skipBlank: boolean;
2829
epubTheme: boolean;
30+
textExtractionMargin: number;
2931
updateConfig: (newConfig: Partial<{ apiKey: string; baseUrl: string; viewType: ViewType }>) => Promise<void>;
3032
updateConfigKey: <K extends keyof ConfigValues>(key: K, value: ConfigValues[K]) => Promise<void>;
3133
isLoading: boolean;
@@ -49,6 +51,7 @@ export function ConfigProvider({ children }: { children: ReactNode }) {
4951
const [voice, setVoice] = useState<string>('af_sarah');
5052
const [skipBlank, setSkipBlank] = useState<boolean>(true);
5153
const [epubTheme, setEpubTheme] = useState<boolean>(false);
54+
const [textExtractionMargin, setTextExtractionMargin] = useState<number>(0.07);
5255

5356
const [isLoading, setIsLoading] = useState(true);
5457
const [isDBReady, setIsDBReady] = useState(false);
@@ -68,6 +71,7 @@ export function ConfigProvider({ children }: { children: ReactNode }) {
6871
const cachedVoice = await getItem('voice');
6972
const cachedSkipBlank = await getItem('skipBlank');
7073
const cachedEpubTheme = await getItem('epubTheme');
74+
const cachedMargin = await getItem('textExtractionMargin');
7175

7276
// Only set API key and base URL if they were explicitly saved by the user
7377
if (cachedApiKey) {
@@ -85,6 +89,7 @@ export function ConfigProvider({ children }: { children: ReactNode }) {
8589
setVoice(cachedVoice || 'af_sarah');
8690
setSkipBlank(cachedSkipBlank === 'false' ? false : true);
8791
setEpubTheme(cachedEpubTheme === 'true');
92+
setTextExtractionMargin(parseFloat(cachedMargin || '0.07'));
8893

8994
// Only save non-sensitive settings by default
9095
if (!cachedViewType) {
@@ -96,6 +101,9 @@ export function ConfigProvider({ children }: { children: ReactNode }) {
96101
if (cachedEpubTheme === null) {
97102
await setItem('epubTheme', 'false');
98103
}
104+
if (cachedMargin === null) {
105+
await setItem('textExtractionMargin', '0.07');
106+
}
99107

100108
} catch (error) {
101109
console.error('Error initializing:', error);
@@ -170,6 +178,9 @@ export function ConfigProvider({ children }: { children: ReactNode }) {
170178
case 'epubTheme':
171179
setEpubTheme(value as boolean);
172180
break;
181+
case 'textExtractionMargin':
182+
setTextExtractionMargin(value as number);
183+
break;
173184
}
174185
} catch (error) {
175186
console.error(`Error updating config key ${key}:`, error);
@@ -186,6 +197,7 @@ export function ConfigProvider({ children }: { children: ReactNode }) {
186197
voice,
187198
skipBlank,
188199
epubTheme,
200+
textExtractionMargin,
189201
updateConfig,
190202
updateConfigKey,
191203
isLoading,

src/contexts/PDFContext.tsx

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import {
2626

2727
import { indexedDBService } from '@/utils/indexedDB';
2828
import { useTTS } from '@/contexts/TTSContext';
29+
import { useConfig } from '@/contexts/ConfigContext';
2930
import {
3031
extractTextFromPDF,
3132
convertPDFDataToURL,
@@ -77,6 +78,7 @@ const PDFContext = createContext<PDFContextType | undefined>(undefined);
7778
*/
7879
export function PDFProvider({ children }: { children: ReactNode }) {
7980
const { setText: setTTSText, stop, currDocPageNumber: currDocPage, currDocPages, setCurrDocPages } = useTTS();
81+
const { textExtractionMargin } = useConfig();
8082

8183
// Current document state
8284
const [currDocURL, setCurrDocURL] = useState<string>();
@@ -104,7 +106,7 @@ export function PDFProvider({ children }: { children: ReactNode }) {
104106
const loadCurrDocText = useCallback(async () => {
105107
try {
106108
if (!pdfDocument) return;
107-
const text = await extractTextFromPDF(pdfDocument, currDocPage);
109+
const text = await extractTextFromPDF(pdfDocument, currDocPage, textExtractionMargin);
108110
// Only update TTS text if the content has actually changed
109111
// This prevents unnecessary resets of the sentence index
110112
if (text !== currDocText || text === '') {
@@ -114,7 +116,7 @@ export function PDFProvider({ children }: { children: ReactNode }) {
114116
} catch (error) {
115117
console.error('Error loading PDF text:', error);
116118
}
117-
}, [pdfDocument, currDocPage, setTTSText, currDocText]);
119+
}, [pdfDocument, currDocPage, setTTSText, currDocText, textExtractionMargin]);
118120

119121
/**
120122
* Effect hook to update document text when the page changes

src/utils/pdf.ts

Lines changed: 41 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,14 +26,51 @@ export function convertPDFDataToURL(pdfData: Blob): Promise<string> {
2626
}
2727

2828
// Text Processing functions
29-
export async function extractTextFromPDF(pdf: PDFDocumentProxy, pageNumber: number): Promise<string> {
29+
export async function extractTextFromPDF(pdf: PDFDocumentProxy, pageNumber: number, margin = 0.07): Promise<string> {
3030
try {
3131
const page = await pdf.getPage(pageNumber);
3232
const textContent = await page.getTextContent();
33+
34+
// Get page viewport to help with positioning
35+
const viewport = page.getViewport({ scale: 1.0 });
36+
const pageHeight = viewport.height;
37+
const pageWidth = viewport.width;
38+
39+
const textItems = textContent.items.filter((item): item is TextItem => {
40+
if (!('str' in item && 'transform' in item)) return false;
41+
42+
// Get all transform matrix values
43+
const [scaleX, skewX, skewY, scaleY, x, y] = item.transform;
44+
45+
// Check for reasonable scale values (absolute value must be between 1 and 20 inclusive)
46+
if (Math.abs(scaleX) < 1 || Math.abs(scaleX) > 20) return false;
47+
if (Math.abs(scaleY) < 1 || Math.abs(scaleY) > 20) return false;
48+
49+
// Check for reasonable skew values (should be close to 0 for normal text)
50+
if (Math.abs(skewX) > 0.5 || Math.abs(skewY) > 0.5) return false;
51+
52+
// Filter out positions in header/footer areas using configurable margin
53+
const topMargin = pageHeight * margin;
54+
const bottomMargin = pageHeight * (1 - margin);
55+
if (y < topMargin || y > bottomMargin) {
56+
return false;
57+
}
58+
59+
// Filter out positions in left/right margin areas
60+
const leftMargin = pageWidth * margin;
61+
const rightMargin = pageWidth * (1 - margin);
62+
if (x < leftMargin || x > rightMargin) {
63+
return false;
64+
}
65+
66+
// Check for reasonable x position values (for any non-negative margin the left/right margin check above already rejects these)
67+
if (x < 0 || x > pageWidth) return false;
68+
69+
// Filter out empty strings or strings with only whitespace
70+
return item.str.trim().length > 0;
71+
});
3372

34-
const textItems = textContent.items.filter((item): item is TextItem =>
35-
'str' in item && 'transform' in item
36-
);
73+
console.log('Filtered text items:', textItems);
3774

3875
const tolerance = 2;
3976
const lines: TextItem[][] = [];

0 commit comments

Comments
 (0)