Commit b888c5b

committed
support pdfs in rag
1 parent 67e69f3 commit b888c5b

9 files changed: +125 -34 lines changed

package.json

Lines changed: 1 addition & 1 deletion
@@ -42,14 +42,14 @@
   "@types/express": "^5.0.0",
   "@types/lodash": "^4.17.0",
   "@types/morgan": "^1.9.9",
+  "@types/multer": "^1.4.11",
   "@types/node": "^22.0.0",
   "@types/node-cron": "^3.0.11",
   "@types/react": "^18.2.52",
   "@types/react-dom": "^18.2.18",
   "@types/react-router-dom": "^5.3.3",
   "@typescript-eslint/eslint-plugin": "^8.11.0",
   "@typescript-eslint/parser": "^8.11.0",
-  "@types/multer": "^1.4.11",
   "concurrently": "^9.0.0",
   "eslint": "^9.13.0",
   "eslint-config-prettier": "^9.1.0",

src/client/components/Rag.tsx

Lines changed: 18 additions & 2 deletions
@@ -12,6 +12,7 @@ type RagResponse = {
     title: string
     content: string
     score: number
+    metadata: Record<string, any>
   }
 }

@@ -114,7 +115,15 @@ const Rag: React.FC = () => {
       topK,
     })
     console.log('Response from server:', res.data)
-    setResponse(res.data)
+    // Parse metadata: Redis returns it as a JSON string
+    const parsedResponse = res.data.map((doc) => ({
+      ...doc,
+      value: {
+        ...doc.value,
+        metadata: JSON.parse(doc.value.metadata),
+      },
+    }))
+    setResponse(parsedResponse)
     setInputValue('')
   }

@@ -238,7 +247,14 @@ const Rag: React.FC = () => {
       {response.map((doc) => (
         <Paper key={doc.id} sx={{ marginBottom: 2, p: 1 }} elevation={2}>
           <Typography variant="caption">Score: {doc.value.score}</Typography>
-          <Markdown>{doc.value.content}</Markdown>
+          <Typography variant="subtitle1" fontFamily="monospace" mb={2}>{JSON.stringify(doc.value.metadata, null, 2)}</Typography>
+          {doc.value.metadata.type === 'md' ? (
+            <Markdown>{doc.value.content}</Markdown>
+          ) : (
+            <Typography whiteSpace="pre-line" variant="body1">
+              {doc.value.content}
+            </Typography>
+          )}
         </Paper>
       ))}
     </Box>
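
Redis hash fields come back as plain strings, so the metadata field arrives JSON-encoded and is parsed on the client before rendering. A minimal sketch of a parsed document as the component then sees it (all field values hypothetical):

// Hypothetical parsed search hit; values are illustrative only
const doc = {
  id: 'handbook.md-3',
  value: {
    title: 'Installation',
    content: '## Installation\n...',
    score: 0.87,
    metadata: { title: 'Installation', titleHierarchy: ['handbook.md', 'Installation'], type: 'md' },
  },
}
// metadata.type drives rendering: 'md' goes through <Markdown>,
// while 'text' and 'pdf' fall back to a pre-line <Typography>.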

src/server/routes/rag.ts

Lines changed: 1 addition & 1 deletion
@@ -98,7 +98,7 @@ const upload = multer({
     },
   }),
   limits: {
-    fileSize: 10 * 1024 * 1024, // 10 MB
+    fileSize: 50 * 1024 * 1024, // 50 MB
   },
 })
 const uploadMiddleware = upload.array('files')

src/server/services/rag/chunkDb.ts

Lines changed: 3 additions & 3 deletions
@@ -118,7 +118,7 @@ export const vectorSearchKChunks = async (ragIndex: RagIndex, embedding: number[
       vec_param: embeddingBuffer,
     },
     DIALECT: 2,
-    RETURN: ['content', 'title', 'score'], // Specify the fields to return
+    RETURN: ['content', 'title', 'score', 'metadata'], // Specify the fields to return
   })

   return results as {

@@ -136,11 +136,11 @@
 }

 export const fullTextSearchChunks = async (ragIndex: RagIndex, query: string) => {
-  const queryString = `@content:%${query}% | @title:%${query}%`
+  const queryString = `@content:"%${query}%" | @title:"%${query}%"`

   const results = await redisClient.ft.search(ragIndex.metadata.name, queryString, {
     DIALECT: 2,
-    RETURN: ['content', 'title'],
+    RETURN: ['content', 'title', 'metadata'],
     SLOP: 1,
     INORDER: true,
   })
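
For a concrete sense of the query change, this is what the new template passed to redisClient.ft.search produces for a sample two-word query (string construction only; the search options are otherwise unchanged apart from the extra RETURN field):

const query = 'vector search'
const queryString = `@content:"%${query}%" | @title:"%${query}%"`
// before: @content:%vector search% | @title:%vector search%
// after:  @content:"%vector search%" | @title:"%vector search%"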

src/server/services/rag/ingestion/chunker.ts

Lines changed: 6 additions & 4 deletions
@@ -1,8 +1,8 @@
 import { Transform } from 'node:stream'
-import { createSplittedTitleChunks, createStaticChunks, createTitleChunks } from './chunkingAlgorithms.ts'
-import type { FileData } from './loader.ts'
+import { chunkingAlgorithms } from './chunkingAlgorithms.ts'
 import { mkdirSync } from 'node:fs'
 import { writeFile } from 'node:fs/promises'
+import { TextData } from './textExtractor.ts'

 export class Chunker extends Transform {
   private cachePath: string

@@ -16,8 +16,10 @@ export class Chunker extends Transform {
     mkdirSync(this.cachePath, { recursive: true })
   }

-  _transform(data: FileData, _encoding: BufferEncoding, callback: (error?: Error | null) => void) {
-    const chunks = createTitleChunks(data)
+  _transform(data: TextData, _encoding: BufferEncoding, callback: (error?: Error | null) => void) {
+    const chunkingAlgorithm = chunkingAlgorithms[data.chunkingStrategy]
+
+    const chunks = chunkingAlgorithm(data)
     for (const chunk of chunks) {
       this.push(chunk)
     }

src/server/services/rag/ingestion/chunkingAlgorithms.ts

Lines changed: 26 additions & 17 deletions
@@ -1,4 +1,4 @@
-import type { FileData } from './loader.ts'
+import { TextData } from './textExtractor.ts'

 export type Chunk = {
   id: string

@@ -8,7 +8,7 @@ export type Chunk = {
   }
 }

-export const createTitleChunks = (file: FileData): Chunk[] => {
+export const createTitleChunks = (file: TextData): Chunk[] => {
   const lines = file.content.split('\n')

   const titleHierarchy = [file.fileName]

@@ -27,6 +27,7 @@ export const createTitleChunks = (file: FileData): Chunk[] => {
       metadata: {
         title,
         titleHierarchy: [...titleHierarchy],
+        type: file.type,
       },
     })

@@ -60,14 +61,15 @@
       metadata: {
         title,
         titleHierarchy: [...titleHierarchy],
+        type: file.type,
       },
     })
   }

   return chunks
 }

-export const createSplittedTitleChunks = (file: FileData): Chunk[] => {
+export const createSplittedTitleChunks = (file: TextData): Chunk[] => {
   return createTitleChunks(file).flatMap((chunk) => {
     const title = chunk.metadata?.title
     const titleHierarchy = chunk.metadata?.titleHierarchy

@@ -81,29 +83,36 @@ export const createSplittedTitleChunks = (file: FileData): Chunk[] => {
       metadata: {
         title: `${title} - ${index + 1}`,
         titleHierarchy: [...titleHierarchy, index + 1],
+        type: file.type,
       },
     }))
   })
 }

-export const createStaticChunks = (file: FileData): Chunk[] => {
-  const lines = file.content.split('\n').filter((line) => line.trim() !== '')
-
-  if (lines.length <= 2) return []
+export const createStaticChunks = (file: TextData, length: number = 800, overlap: number = 400): Chunk[] => {
+  const content = file.content

   const chunks: Chunk[] = []

-  for (let i = 1; i < lines.length - 1; i++) {
-    const chunkContent = [lines[i - 1].trim(), lines[i].trim(), lines[i + 1].trim()]
-
-    chunks.push({
-      id: `${file.fileName}-${i}`,
-      content: [...chunkContent],
-      metadata: {
-        title: `Chunk ${i}`,
-      },
-    })
+  for (let i = overlap; i < content.length - length - overlap; i += length) {
+    const chunkContent = content.slice(i - overlap, i + length + overlap)
+    if (chunkContent.length > 0) {
+      chunks.push({
+        id: `${file.fileName}-${chunks.length}`,
+        content: chunkContent.split('\n'),
+        metadata: {
+          title: file.fileName,
+          type: file.type,
+        },
+      })
+    }
   }

   return chunks
 }
+
+export const chunkingAlgorithms = {
+  static: createStaticChunks,
+  title: createTitleChunks,
+  splittedTitle: createSplittedTitleChunks,
+}
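
With the defaults (length = 800, overlap = 400), each chunk spans length + 2 * overlap = 1600 characters and the window advances by length, so neighbouring chunks share 800 characters. A minimal usage sketch with a hypothetical 3000-character input:

import { createStaticChunks } from './chunkingAlgorithms.ts'
import type { TextData } from './textExtractor.ts'

// Hypothetical input; in the pipeline this arrives from the TextExtractor stage
const sample: TextData = { fileName: 'report.pdf', content: 'x'.repeat(3000), type: 'pdf', chunkingStrategy: 'static' }

const chunks = createStaticChunks(sample)
// i = 400 -> slice(0, 1600); i = 1200 -> slice(800, 2400); i = 2000 fails i < 1800
console.log(chunks.map((c) => c.id)) // ['report.pdf-0', 'report.pdf-1']

Note that content shorter than length + 2 * overlap (1600 characters by default) produces no chunks at all, because the loop condition i < content.length - length - overlap already fails on the first iteration.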

src/server/services/rag/ingestion/loader.ts

Lines changed: 14 additions & 5 deletions
@@ -1,10 +1,7 @@
 import { readdir, readFile, stat } from 'node:fs/promises'
 import { Readable } from 'node:stream'

-export type FileData = {
-  fileName: string
-  content: string
-}
+export type FileData = { fileName: string; type: 'text' | 'md'; content: string } | { fileName: string; type: 'pdf'; content: Buffer }

 async function* loadFiles(loadpath: string): AsyncGenerator<FileData> {
   // Check if the path is a file

@@ -29,11 +26,23 @@ async function* loadFiles(loadpath: string): AsyncGenerator<FileData> {
 }

 const loadFile = async (filePath: string): Promise<FileData> => {
-  const content = await readFile(filePath, 'utf-8')
+  const extension = filePath.split('.').pop()
   const fileName = filePath.split('/').pop() || 'unknown'
+
+  if (extension === 'pdf') {
+    const content = await readFile(filePath)
+    return {
+      fileName,
+      content,
+      type: 'pdf',
+    }
+  }
+
+  const content = await readFile(filePath, 'utf-8')
   return {
     fileName,
     content,
+    type: extension === 'md' ? 'md' : 'text',
   }
 }
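
FileData is now a discriminated union on type, so downstream code can let TypeScript narrow content to Buffer for PDFs and string for everything else. A minimal narrowing sketch (the describe helper is hypothetical):

import type { FileData } from './loader.ts'

const describe = (file: FileData): string => {
  if (file.type === 'pdf') {
    // Narrowed: content is a Buffer here
    return `${file.fileName}: ${file.content.byteLength} bytes of PDF`
  }
  // Narrowed: content is a string for 'text' and 'md'
  return `${file.fileName}: ${file.content.length} characters of ${file.type}`
}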

src/server/services/rag/ingestion/pipeline.ts

Lines changed: 9 additions & 1 deletion
@@ -6,6 +6,7 @@ import { Embedder } from './embedder.ts'
 import { RedisStorer } from './storer.ts'
 import type OpenAI from 'openai'
 import RagIndex from '../../../db/models/ragIndex.ts'
+import { TextExtractor } from './textExtractor.ts'

 // Pipeline debug cache in pipeline/
 // Check if exists, if not create it.

@@ -24,6 +25,13 @@ const initPipelineCache = async () => {
 export const ingestionPipeline = async (client: OpenAI, loadpath: string, ragIndex: RagIndex) => {
   await initPipelineCache()

-  await pipeline([new FileLoader(loadpath), new Chunker(pipelineCachePath), new Embedder(client, pipelineCachePath, 10), new RedisStorer(ragIndex)])
+  await pipeline([
+    new FileLoader(loadpath),
+    new TextExtractor(pipelineCachePath),
+    new Chunker(pipelineCachePath),
+    new Embedder(client, pipelineCachePath, 10),
+    new RedisStorer(ragIndex),
+  ])
+
   console.log('Ingestion pipeline completed')
 }
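
Each stage is an object-mode stream, so backpressure propagates from the Redis writes all the way back to file loading. A minimal sketch of the same wiring pattern, assuming the node:stream/promises pipeline (the actual import sits outside this hunk):

import { Readable, Transform } from 'node:stream'
import { pipeline } from 'node:stream/promises'

// Toy stage standing in for TextExtractor / Chunker / Embedder
const upperCase = new Transform({
  objectMode: true,
  transform(chunk: string, _encoding, callback) {
    this.push(chunk.toUpperCase())
    callback()
  },
})

await pipeline(Readable.from(['a', 'b', 'c']), upperCase, process.stdout)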

src/server/services/rag/ingestion/textExtractor.ts

Lines changed: 47 additions & 0 deletions

@@ -0,0 +1,47 @@
+import { Transform } from 'node:stream'
+import type { FileData } from './loader.ts'
+import { mkdirSync } from 'node:fs'
+import { writeFile } from 'node:fs/promises'
+import { pdfToText } from '../../../util/pdfToText.ts'
+
+export type TextData = {
+  fileName: string
+  content: string
+  type: 'text' | 'md' | 'pdf'
+  chunkingStrategy: 'static' | 'title' | 'splittedTitle'
+}
+
+export class TextExtractor extends Transform {
+  private cachePath: string
+
+  constructor(cachePath: string) {
+    super({ objectMode: true })
+
+    this.cachePath = cachePath + '/texts'
+
+    // Make sure the cache path exists
+    mkdirSync(this.cachePath, { recursive: true })
+  }
+
+  async _transform(data: FileData, _encoding: BufferEncoding, callback: (error?: Error | null) => void) {
+    let textContent = data.type !== 'pdf' ? data.content : ''
+
+    if (data.type === 'pdf') {
+      textContent = await pdfToText(data.content)
+    }
+
+    const textData: TextData = {
+      fileName: data.fileName,
+      content: textContent,
+      type: data.type,
+      chunkingStrategy: data.type === 'pdf' ? 'static' : 'title',
+    }
+
+    this.push(textData)
+
+    // Save text data to cache
+    const textPath = `${this.cachePath}/${data.fileName}.txt`
+    await writeFile(textPath, textContent, 'utf-8')
+    callback()
+  }
+}
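
The pdfToText helper lives in src/server/util and is not part of this diff. A minimal sketch of what such a helper could look like, assuming the pdf-parse package (an assumption; the actual implementation may differ):

// Hypothetical sketch of src/server/util/pdfToText.ts -- not in this commit
import pdf from 'pdf-parse'

export const pdfToText = async (buffer: Buffer): Promise<string> => {
  const data = await pdf(buffer) // parses the PDF and concatenates the text of all pages
  return data.text
}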
