Skip to content

Commit 90a16a5

Browse files
committed
feat: Enhance PageIndex chat and search with OCR content and link RAG documents to PageIndex IDs.
1 parent 7f0b7e2 commit 90a16a5

File tree

5 files changed

+130
-6
lines changed

5 files changed

+130
-6
lines changed

src/components/DashboardViews/DocumentsView.tsx

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,7 @@ export default function DocumentsView({ isDark = true }: { isDark?: boolean }) {
195195
const [pageIndexDocIdByDocId, setPageIndexDocIdByDocId] = useState<Record<string, string>>({});
196196
const [pageIndexLoadingByDocId, setPageIndexLoadingByDocId] = useState<Record<string, boolean>>({});
197197
const [pageIndexErrorByDocId, setPageIndexErrorByDocId] = useState<Record<string, string>>({});
198+
const [currentPageIndexDocId, setCurrentPageIndexDocId] = useState<string | null>(null);
198199
const [chatMessages, setChatMessages] = useState<{ role: string; content: string }[]>([]);
199200
const [chatInput, setChatInput] = useState('');
200201
const [isChatting, setIsChatting] = useState(false);
@@ -299,6 +300,7 @@ export default function DocumentsView({ isDark = true }: { isDark?: boolean }) {
299300
);
300301

301302
setPageIndexTree(result);
303+
setCurrentPageIndexDocId(docId);
302304
setIntelligenceTab('tree');
303305

304306
// Initial AI greeting
@@ -321,7 +323,8 @@ export default function DocumentsView({ isDark = true }: { isDark?: boolean }) {
321323
selectedFile,
322324
kbId,
323325
user.id,
324-
() => { } // Silent progress for standard RAG
326+
() => { }, // Silent progress for standard RAG
327+
docId
325328
).then(async (ragDoc) => {
326329
if (ragDoc) {
327330
await ragService.setPageIndexDocId(ragDoc.id, docId);
@@ -356,7 +359,12 @@ export default function DocumentsView({ isDark = true }: { isDark?: boolean }) {
356359

357360
try {
358361
const history = chatMessages.map(m => ({ role: m.role, content: m.content }));
359-
const response = await pageIndexService.chatWithDocument(userMsg, pageIndexTree, history);
362+
const response = await pageIndexService.chatWithDocument(
363+
userMsg,
364+
pageIndexTree,
365+
history,
366+
{ docId: currentPageIndexDocId || undefined }
367+
);
360368
setChatMessages(prev => [...prev, { role: 'assistant', content: response }]);
361369
} catch (err) {
362370
setChatMessages(prev => [...prev, { role: 'assistant', content: "Sorry, I encountered an error while processing your request." }]);

src/components/PageIndex/PageIndexView.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ export default function PageIndexView() {
166166

167167
try {
168168
const history = chatMessages.map(m => ({ role: m.role, content: m.content }));
169-
const response = await pageIndexService.chatWithDocument(userMsg, tree, history);
169+
const response = await pageIndexService.chatWithDocument(userMsg, tree, history, { docId: docId || undefined });
170170
setChatMessages(prev => [...prev, { role: 'assistant', content: response }]);
171171
} catch (err) {
172172
setChatMessages(prev => [...prev, { role: 'assistant', content: "Sorry, I encountered an error while processing your request." }]);

src/services/localLLMService.ts

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -692,6 +692,21 @@ class GroqLLMService {
692692
});
693693
}
694694

695+
const ocrMatches = await pageIndexService.searchOcrPages(pageIndexDoc.pageindex_doc_id, query, 5);
696+
if (ocrMatches.length > 0) {
697+
return JSON.stringify({
698+
source: 'pageindex_ocr',
699+
document: {
700+
id: pageIndexDoc.id,
701+
name: pageIndexDoc.file_name,
702+
pageindex_doc_id: pageIndexDoc.pageindex_doc_id,
703+
},
704+
results: ocrMatches.map((m, i) => ({ rank: i + 1, ...m })),
705+
total: ocrMatches.length,
706+
note: 'Tree had no matches; using OCR content.',
707+
});
708+
}
709+
695710
return JSON.stringify({
696711
source: 'pageindex',
697712
document: {
@@ -701,7 +716,7 @@ class GroqLLMService {
701716
},
702717
results: [],
703718
total: 0,
704-
note: 'No matching nodes found in PageIndex.',
719+
note: 'No matching nodes found in PageIndex or OCR.',
705720
});
706721
} else if (documentName) {
707722
pageIndexNote = 'No PageIndex document found for the requested name.';

src/services/pageIndexService.ts

Lines changed: 101 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,15 @@ export interface TreeResponse {
2323
retrieval_ready?: boolean;
2424
}
2525

26+
/**
 * One element of the array that `getOcr` returns for the page format and that
 * `searchOcrPages` scans.
 */
interface OcrPage {
  /** Page identifier echoed back in search results. NOTE(review): 0- vs 1-based is not visible here — confirm. */
  page_index: number;
  /** Page content (presumably Markdown, going by the field name); matched case-insensitively by `searchOcrPages`. */
  markdown: string;
  /** Optional strings associated with the page's images (presumably URLs or encoded data — confirm against the API). */
  images?: Array<string>;
}
31+
2632
class PageIndexService {
2733
private treeCache = new Map<string, PageIndexNode[]>();
34+
private ocrCache = new Map<string, OcrPage[] | string>();
2835

2936
private getPageIndexHeaders() {
3037
if (!PAGEINDEX_API_KEY) {
@@ -141,18 +148,62 @@ class PageIndexService {
141148
throw new Error(`PageIndex status: ${treeResponse.status}`);
142149
}
143150

151+
/**
 * Fetch OCR output for a PageIndex document, memoised per `docId` + `format`.
 *
 * @param docId - PageIndex document identifier; URL-encoded before being placed in the request path.
 * @param format - OCR layout to request (`'page'`, `'node'` or `'raw'`); defaults to `'page'`.
 * @returns The response body when it is an array; otherwise its `result` field when that is
 *          non-nullish; otherwise the body unchanged.
 * @throws Propagates any error raised by `getPageIndexHeaders()` or by the HTTP request.
 *         Nothing is cached when the call fails.
 */
async getOcr(docId: string, format: 'page' | 'node' | 'raw' = 'page'): Promise<OcrPage[] | string> {
  const cacheKey = `${docId}:${format}`;

  // Compare against undefined instead of relying on truthiness, so a cached
  // empty-string payload is served from the cache rather than refetched.
  const cached = this.ocrCache.get(cacheKey);
  if (cached !== undefined) return cached;

  const headers = this.getPageIndexHeaders();
  // Encode the id so characters such as "/" or "?" cannot alter the request path.
  const response = await axios.get(`${PAGEINDEX_API_URL}/doc/${encodeURIComponent(docId)}/`, {
    headers,
    params: {
      type: 'ocr',
      format,
    },
  });

  const data = response.data;
  const result = (Array.isArray(data) ? data : (data?.result ?? data)) as OcrPage[] | string;

  // The assertion above is unchecked: only memoise payloads that really have the
  // declared shape, so an unexpected body (e.g. an error object) is not pinned in the cache.
  if (Array.isArray(result) || typeof result === 'string') {
    this.ocrCache.set(cacheKey, result);
  }
  return result;
}
173+
144174
/**
145175
* Chat with Groq using the flattened tree as context
146176
*/
147177
async chatWithDocument(
148178
question: string,
149179
treeNodes: PageIndexNode[],
150-
history: { role: string; content: string }[] = []
180+
history: { role: string; content: string }[] = [],
181+
options?: { docId?: string }
151182
): Promise<string> {
152183
if (!GROQ_API_KEY) {
153184
throw new Error('Missing Groq API key');
154185
}
155186
const flattenedTree = this.flattenTree(treeNodes);
187+
const treeTextChars = this.countTextChars(treeNodes);
188+
let ocrContext = '';
189+
190+
if (options?.docId && treeTextChars < 200) {
191+
try {
192+
const matches = await this.searchOcrPages(options.docId, question, 4);
193+
if (matches.length > 0) {
194+
ocrContext = matches
195+
.map(m => `[Page ${m.page_index}] ${m.excerpt}`)
196+
.join('\n');
197+
} else {
198+
const raw = await this.getOcr(options.docId, 'raw');
199+
if (typeof raw === 'string' && raw.trim().length > 0) {
200+
ocrContext = raw.substring(0, 4000);
201+
}
202+
}
203+
} catch {
204+
// Ignore OCR fallback errors, continue with tree only
205+
}
206+
}
156207

157208
const systemPrompt = `You are a helpful document assistant.
158209
You have been given a structured index tree extracted from a PDF document.
@@ -168,7 +219,7 @@ Rules:
168219
{ role: 'system', content: systemPrompt },
169220
{
170221
role: 'user',
171-
content: `Here is the indexed structure of the document:\n\n${flattenedTree}\n\nI will now ask questions about this document.`
222+
content: `Here is the indexed structure of the document:\n\n${flattenedTree}\n\n${ocrContext ? `Additional OCR content (page-based):\n${ocrContext}\n\n` : ''}I will now ask questions about this document.`
172223
},
173224
{ role: 'assistant', content: "Got it! I've read the full index. Ask me anything." },
174225
...history,
@@ -220,6 +271,21 @@ Rules:
220271
return result;
221272
}
222273

274+
/**
 * Sum the length of the text carried by every node in the tree.
 *
 * For each node the `summary` is measured when it is non-empty, otherwise the
 * `text`; children under `nodes` are added recursively.
 *
 * @param nodes - Tree (or subtree) to measure.
 * @returns Total number of characters found.
 */
private countTextChars(nodes: PageIndexNode[]): number {
  return nodes.reduce((total, current) => {
    const ownLength = (current.summary || current.text || '').length;
    const hasChildren = Boolean(current.nodes && current.nodes.length > 0);
    const childLength = hasChildren ? this.countTextChars(current.nodes) : 0;
    return total + ownLength + childLength;
  }, 0);
}
288+
223289
/**
224290
* Search the tree for a query (simple keyword match).
225291
*/
@@ -280,6 +346,39 @@ Rules:
280346
});
281347
}
282348

349+
/**
 * Rank a document's OCR pages by how many distinct query terms they contain.
 *
 * The query is lower-cased and split on whitespace; a page scores one point per
 * distinct term found (case-insensitive substring match) in its `markdown`.
 * Pages scoring zero are dropped and the rest are returned best-first.
 *
 * @param docId - PageIndex document identifier passed to `getOcr(docId, 'page')`.
 * @param query - Free-text query.
 * @param limit - Maximum number of results (default 5). Zero, negative or NaN yields `[]`.
 * @returns Up to `limit` matches. `excerpt` is at most 400 characters of the page,
 *          positioned to include the earliest matched term, with `...` marking
 *          whichever side was cut.
 * @throws Propagates any error raised by `getOcr`.
 */
async searchOcrPages(docId: string, query: string, limit: number = 5): Promise<Array<{
  page_index: number;
  excerpt: string;
  score: number;
}>> {
  const excerptChars = 400;
  // Context kept in front of the first hit when the excerpt window has to move.
  const leadInChars = 80;

  // Deduplicate so a word repeated in the query cannot inflate a page's score.
  const terms = Array.from(new Set(query.toLowerCase().split(/\s+/).filter(Boolean)));
  // `!(limit > 0)` also rejects NaN; a negative limit would otherwise make
  // slice(0, limit) return "all but the last n" instead of nothing.
  if (terms.length === 0 || !(limit > 0)) return [];

  const ocr = await this.getOcr(docId, 'page');
  if (!Array.isArray(ocr)) return [];

  const scored: Array<{ page_index: number; text: string; firstHit: number; score: number }> = [];
  for (const page of ocr) {
    const text = page.markdown || '';
    const hay = text.toLowerCase();
    let score = 0;
    let firstHit = -1;
    for (const term of terms) {
      const at = hay.indexOf(term);
      if (at === -1) continue;
      score += 1;
      if (firstHit === -1 || at < firstHit) firstHit = at;
    }
    if (score > 0) {
      scored.push({ page_index: page.page_index, text, firstHit, score });
    }
  }

  return scored
    .sort((a, b) => b.score - a.score)
    .slice(0, limit)
    .map(({ page_index, text, firstHit, score }) => {
      // Short pages are returned whole. Long pages get a window that starts a
      // little before the first hit, clamped so it never runs past either end.
      // A hit near the top of the page therefore still yields the leading
      // 400 characters followed by "...".
      const start = text.length > excerptChars
        ? Math.max(0, Math.min(firstHit - leadInChars, text.length - excerptChars))
        : 0;
      const end = Math.min(text.length, start + excerptChars);
      const prefix = start > 0 ? '...' : '';
      const suffix = end < text.length ? '...' : '';
      return {
        page_index,
        excerpt: `${prefix}${text.substring(start, end)}${suffix}`,
        score,
      };
    });
}
381+
283382
/**
284383
* Count total nodes in the tree
285384
*/

src/services/ragService.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,7 @@ class RAGService {
202202
kbId: string,
203203
userId: string,
204204
onProgress?: (progress: ProcessingProgress) => void,
205+
pageIndexDocId?: string,
205206
): Promise<UploadedDocument | null> {
206207
const fileType = file.name.split('.').pop()?.toLowerCase() || 'txt';
207208
let documentRecord: UploadedDocument | null = null;
@@ -233,6 +234,7 @@ class RAGService {
233234
file_size: file.size,
234235
storage_path: storagePath,
235236
status: 'processing',
237+
...(pageIndexDocId ? { pageindex_doc_id: pageIndexDocId } : {}),
236238
})
237239
.select()
238240
.single();

0 commit comments

Comments
 (0)