Commit b888c5b

committed
support pdfs in rag
1 parent 67e69f3 commit b888c5b

9 files changed: +125 -34 lines changed

package.json

Lines changed: 1 addition & 1 deletion
@@ -42,14 +42,14 @@
   "@types/express": "^5.0.0",
   "@types/lodash": "^4.17.0",
   "@types/morgan": "^1.9.9",
+  "@types/multer": "^1.4.11",
   "@types/node": "^22.0.0",
   "@types/node-cron": "^3.0.11",
   "@types/react": "^18.2.52",
   "@types/react-dom": "^18.2.18",
   "@types/react-router-dom": "^5.3.3",
   "@typescript-eslint/eslint-plugin": "^8.11.0",
   "@typescript-eslint/parser": "^8.11.0",
-  "@types/multer": "^1.4.11",
   "concurrently": "^9.0.0",
   "eslint": "^9.13.0",
   "eslint-config-prettier": "^9.1.0",

src/client/components/Rag.tsx

Lines changed: 18 additions & 2 deletions
@@ -12,6 +12,7 @@ type RagResponse = {
     title: string
     content: string
     score: number
+    metadata: Record<string, any>
   }
 }

@@ -114,7 +115,15 @@ const Rag: React.FC = () => {
       topK,
     })
     console.log('Response from server:', res.data)
-    setResponse(res.data)
+    // Parse metadata: Redis returns it as a JSON string
+    const parsedResponse = res.data.map((doc) => ({
+      ...doc,
+      value: {
+        ...doc.value,
+        metadata: JSON.parse(doc.value.metadata),
+      },
+    }))
+    setResponse(parsedResponse)
     setInputValue('')
   }

@@ -238,7 +247,14 @@ const Rag: React.FC = () => {
       {response.map((doc) => (
         <Paper key={doc.id} sx={{ marginBottom: 2, p: 1 }} elevation={2}>
           <Typography variant="caption">Score: {doc.value.score}</Typography>
-          <Markdown>{doc.value.content}</Markdown>
+          <Typography variant="subtitle1" fontFamily="monospace" mb={2}>{JSON.stringify(doc.value.metadata, null, 2)}</Typography>
+          {doc.value.metadata.type === 'md' ? (
+            <Markdown>{doc.value.content}</Markdown>
+          ) : (
+            <Typography whiteSpace="pre-line" variant="body1">
+              {doc.value.content}
+            </Typography>
+          )}
         </Paper>
       ))}
     </Box>
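
Redis hash fields come back as plain strings, so the metadata field arrives JSON-encoded and is parsed on the client before rendering. A minimal sketch of a parsed document as the component then sees it (all field values hypothetical):

// Hypothetical parsed search hit; values are illustrative only
const doc = {
  id: 'handbook.md-3',
  value: {
    title: 'Installation',
    content: '## Installation\n...',
    score: 0.87,
    metadata: { title: 'Installation', titleHierarchy: ['handbook.md', 'Installation'], type: 'md' },
  },
}
// metadata.type drives rendering: 'md' goes through <Markdown>,
// while 'text' and 'pdf' fall back to a pre-line <Typography>.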

src/server/routes/rag.ts

Lines changed: 1 addition & 1 deletion
@@ -98,7 +98,7 @@ const upload = multer({
     },
   }),
   limits: {
-    fileSize: 10 * 1024 * 1024, // 10 MB
+    fileSize: 50 * 1024 * 1024, // 50 MB
   },
 })
 const uploadMiddleware = upload.array('files')

src/server/services/rag/chunkDb.ts

Lines changed: 3 additions & 3 deletions
@@ -118,7 +118,7 @@ export const vectorSearchKChunks = async (ragIndex: RagIndex, embedding: number[
       vec_param: embeddingBuffer,
     },
     DIALECT: 2,
-    RETURN: ['content', 'title', 'score'], // Specify the fields to return
+    RETURN: ['content', 'title', 'score', 'metadata'], // Specify the fields to return
   })

   return results as {

@@ -136,11 +136,11 @@
 }

 export const fullTextSearchChunks = async (ragIndex: RagIndex, query: string) => {
-  const queryString = `@content:%${query}% | @title:%${query}%`
+  const queryString = `@content:"%${query}%" | @title:"%${query}%"`

   const results = await redisClient.ft.search(ragIndex.metadata.name, queryString, {
     DIALECT: 2,
-    RETURN: ['content', 'title'],
+    RETURN: ['content', 'title', 'metadata'],
     SLOP: 1,
     INORDER: true,
   })
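
For a concrete sense of the query change, this is what the new template passed to redisClient.ft.search produces for a sample two-word query (string construction only; the search options are otherwise unchanged apart from the extra RETURN field):

const query = 'vector search'
const queryString = `@content:"%${query}%" | @title:"%${query}%"`
// before: @content:%vector search% | @title:%vector search%
// after:  @content:"%vector search%" | @title:"%vector search%"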

src/server/services/rag/ingestion/chunker.ts

Lines changed: 6 additions & 4 deletions
@@ -1,8 +1,8 @@
 import { Transform } from 'node:stream'
-import { createSplittedTitleChunks, createStaticChunks, createTitleChunks } from './chunkingAlgorithms.ts'
-import type { FileData } from './loader.ts'
+import { chunkingAlgorithms } from './chunkingAlgorithms.ts'
 import { mkdirSync } from 'node:fs'
 import { writeFile } from 'node:fs/promises'
+import { TextData } from './textExtractor.ts'

 export class Chunker extends Transform {
   private cachePath: string

@@ -16,8 +16,10 @@ export class Chunker extends Transform {
     mkdirSync(this.cachePath, { recursive: true })
   }

-  _transform(data: FileData, _encoding: BufferEncoding, callback: (error?: Error | null) => void) {
-    const chunks = createTitleChunks(data)
+  _transform(data: TextData, _encoding: BufferEncoding, callback: (error?: Error | null) => void) {
+    const chunkingAlgorithm = chunkingAlgorithms[data.chunkingStrategy]
+
+    const chunks = chunkingAlgorithm(data)
     for (const chunk of chunks) {
       this.push(chunk)
     }

src/server/services/rag/ingestion/chunkingAlgorithms.ts

Lines changed: 26 additions & 17 deletions
@@ -1,4 +1,4 @@
-import type { FileData } from './loader.ts'
+import { TextData } from './textExtractor.ts'

 export type Chunk = {
   id: string

@@ -8,7 +8,7 @@ export type Chunk = {
   }
 }

-export const createTitleChunks = (file: FileData): Chunk[] => {
+export const createTitleChunks = (file: TextData): Chunk[] => {
   const lines = file.content.split('\n')

   const titleHierarchy = [file.fileName]

@@ -27,6 +27,7 @@ export const createTitleChunks = (file: FileData): Chunk[] => {
       metadata: {
         title,
         titleHierarchy: [...titleHierarchy],
+        type: file.type,
       },
     })

@@ -60,14 +61,15 @@
       metadata: {
         title,
         titleHierarchy: [...titleHierarchy],
+        type: file.type,
       },
     })
   }

   return chunks
 }

-export const createSplittedTitleChunks = (file: FileData): Chunk[] => {
+export const createSplittedTitleChunks = (file: TextData): Chunk[] => {
   return createTitleChunks(file).flatMap((chunk) => {
     const title = chunk.metadata?.title
     const titleHierarchy = chunk.metadata?.titleHierarchy

@@ -81,29 +83,36 @@ export const createSplittedTitleChunks = (file: FileData): Chunk[] => {
       metadata: {
         title: `${title} - ${index + 1}`,
         titleHierarchy: [...titleHierarchy, index + 1],
+        type: file.type,
       },
     }))
   })
 }

-export const createStaticChunks = (file: FileData): Chunk[] => {
-  const lines = file.content.split('\n').filter((line) => line.trim() !== '')
-
-  if (lines.length <= 2) return []
+export const createStaticChunks = (file: TextData, length: number = 800, overlap: number = 400): Chunk[] => {
+  const content = file.content

   const chunks: Chunk[] = []

-  for (let i = 1; i < lines.length - 1; i++) {
-    const chunkContent = [lines[i - 1].trim(), lines[i].trim(), lines[i + 1].trim()]
-
-    chunks.push({
-      id: `${file.fileName}-${i}`,
-      content: [...chunkContent],
-      metadata: {
-        title: `Chunk ${i}`,
-      },
-    })
+  for (let i = overlap; i < content.length - length - overlap; i += length) {
+    const chunkContent = content.slice(i - overlap, i + length + overlap)
+    if (chunkContent.length > 0) {
+      chunks.push({
+        id: `${file.fileName}-${chunks.length}`,
+        content: chunkContent.split('\n'),
+        metadata: {
+          title: file.fileName,
+          type: file.type,
+        },
+      })
+    }
   }

   return chunks
 }
+
+export const chunkingAlgorithms = {
+  static: createStaticChunks,
+  title: createTitleChunks,
+  splittedTitle: createSplittedTitleChunks,
+}
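
With the defaults (length = 800, overlap = 400), each chunk spans length + 2 * overlap = 1600 characters and the window advances by length, so neighbouring chunks share 800 characters. A minimal usage sketch with a hypothetical 3000-character input:

import { createStaticChunks } from './chunkingAlgorithms.ts'
import type { TextData } from './textExtractor.ts'

// Hypothetical input; in the pipeline this arrives from the TextExtractor stage
const sample: TextData = { fileName: 'report.pdf', content: 'x'.repeat(3000), type: 'pdf', chunkingStrategy: 'static' }

const chunks = createStaticChunks(sample)
// i = 400 -> slice(0, 1600); i = 1200 -> slice(800, 2400); i = 2000 fails i < 1800
console.log(chunks.map((c) => c.id)) // ['report.pdf-0', 'report.pdf-1']

Note that content shorter than length + 2 * overlap (1600 characters by default) produces no chunks at all, because the loop condition i < content.length - length - overlap already fails on the first iteration.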

src/server/services/rag/ingestion/loader.ts

Lines changed: 14 additions & 5 deletions
@@ -1,10 +1,7 @@
 import { readdir, readFile, stat } from 'node:fs/promises'
 import { Readable } from 'node:stream'

-export type FileData = {
-  fileName: string
-  content: string
-}
+export type FileData = { fileName: string; type: 'text' | 'md'; content: string } | { fileName: string; type: 'pdf'; content: Buffer }

 async function* loadFiles(loadpath: string): AsyncGenerator<FileData> {
   // Check if the path is a file

@@ -29,11 +26,23 @@ async function* loadFiles(loadpath: string): AsyncGenerator<FileData> {
 }

 const loadFile = async (filePath: string): Promise<FileData> => {
-  const content = await readFile(filePath, 'utf-8')
+  const extension = filePath.split('.').pop()
   const fileName = filePath.split('/').pop() || 'unknown'
+
+  if (extension === 'pdf') {
+    const content = await readFile(filePath)
+    return {
+      fileName,
+      content,
+      type: 'pdf',
+    }
+  }
+
+  const content = await readFile(filePath, 'utf-8')
   return {
     fileName,
     content,
+    type: extension === 'md' ? 'md' : 'text',
   }
 }
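
FileData is now a discriminated union on type, so downstream code can let TypeScript narrow content to Buffer for PDFs and string for everything else. A minimal narrowing sketch (the describe helper is hypothetical):

import type { FileData } from './loader.ts'

const describe = (file: FileData): string => {
  if (file.type === 'pdf') {
    // Narrowed: content is a Buffer here
    return `${file.fileName}: ${file.content.byteLength} bytes of PDF`
  }
  // Narrowed: content is a string for 'text' and 'md'
  return `${file.fileName}: ${file.content.length} characters of ${file.type}`
}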

src/server/services/rag/ingestion/pipeline.ts

Lines changed: 9 additions & 1 deletion
@@ -6,6 +6,7 @@ import { Embedder } from './embedder.ts'
 import { RedisStorer } from './storer.ts'
 import type OpenAI from 'openai'
 import RagIndex from '../../../db/models/ragIndex.ts'
+import { TextExtractor } from './textExtractor.ts'

 // Pipeline debug cache in pipeline/
 // Check if exists, if not create it.

@@ -24,6 +25,13 @@ const initPipelineCache = async () => {
 export const ingestionPipeline = async (client: OpenAI, loadpath: string, ragIndex: RagIndex) => {
   await initPipelineCache()

-  await pipeline([new FileLoader(loadpath), new Chunker(pipelineCachePath), new Embedder(client, pipelineCachePath, 10), new RedisStorer(ragIndex)])
+  await pipeline([
+    new FileLoader(loadpath),
+    new TextExtractor(pipelineCachePath),
+    new Chunker(pipelineCachePath),
+    new Embedder(client, pipelineCachePath, 10),
+    new RedisStorer(ragIndex),
+  ])
+
   console.log('Ingestion pipeline completed')
 }
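
Each stage is an object-mode stream, so backpressure propagates from the Redis writes all the way back to file loading. A minimal sketch of the same wiring pattern, assuming the node:stream/promises pipeline (the actual import sits outside this hunk):

import { Readable, Transform } from 'node:stream'
import { pipeline } from 'node:stream/promises'

// Toy stage standing in for TextExtractor / Chunker / Embedder
const upperCase = new Transform({
  objectMode: true,
  transform(chunk: string, _encoding, callback) {
    this.push(chunk.toUpperCase())
    callback()
  },
})

await pipeline(Readable.from(['a', 'b', 'c']), upperCase, process.stdout)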

src/server/services/rag/ingestion/textExtractor.ts

Lines changed: 47 additions & 0 deletions

@@ -0,0 +1,47 @@
+import { Transform } from 'node:stream'
+import type { FileData } from './loader.ts'
+import { mkdirSync } from 'node:fs'
+import { writeFile } from 'node:fs/promises'
+import { pdfToText } from '../../../util/pdfToText.ts'
+
+export type TextData = {
+  fileName: string
+  content: string
+  type: 'text' | 'md' | 'pdf'
+  chunkingStrategy: 'static' | 'title' | 'splittedTitle'
+}
+
+export class TextExtractor extends Transform {
+  private cachePath: string
+
+  constructor(cachePath: string) {
+    super({ objectMode: true })
+
+    this.cachePath = cachePath + '/texts'
+
+    // Make sure the cache path exists
+    mkdirSync(this.cachePath, { recursive: true })
+  }
+
+  async _transform(data: FileData, _encoding: BufferEncoding, callback: (error?: Error | null) => void) {
+    let textContent = data.type !== 'pdf' ? data.content : ''
+
+    if (data.type === 'pdf') {
+      textContent = await pdfToText(data.content)
+    }
+
+    const textData: TextData = {
+      fileName: data.fileName,
+      content: textContent,
+      type: data.type,
+      chunkingStrategy: data.type === 'pdf' ? 'static' : 'title',
+    }
+
+    this.push(textData)
+
+    // Save text data to cache
+    const textPath = `${this.cachePath}/${data.fileName}.txt`
+    await writeFile(textPath, textContent, 'utf-8')
+    callback()
+  }
+}
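
The pdfToText helper lives in src/server/util and is not part of this diff. A minimal sketch of what such a helper could look like, assuming the pdf-parse package (an assumption; the actual implementation may differ):

// Hypothetical sketch of src/server/util/pdfToText.ts -- not in this commit
import pdf from 'pdf-parse'

export const pdfToText = async (buffer: Buffer): Promise<string> => {
  const data = await pdf(buffer) // parses the PDF and concatenates the text of all pages
  return data.text
}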
