Skip to content

Commit a017ba1

Browse files
committed
rag index mayheming
1 parent 3fb8d2f commit a017ba1

File tree

14 files changed

+214
-264
lines changed

14 files changed

+214
-264
lines changed

src/client/components/Rag/ProgressReporter.tsx

Lines changed: 35 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,21 @@
1-
import { Box, Table, TableBody, TableCell, TableHead, TableRow, Typography } from '@mui/material'
2-
import { orderBy } from 'lodash'
1+
import { LinearProgress, Table, TableBody, TableCell, TableHead, TableRow, Typography } from '@mui/material'
32
import { useEffect, useReducer } from 'react'
43
import { IngestionPipelineStageKey, IngestionPipelineStageKeys, IngestionPipelineStages } from '../../../shared/constants'
54

65
type ProgressEvent = {
76
stage: string
87
items?: string[]
9-
done?: boolean
108
error?: string
119
}
1210

1311
type ProgressState = Record<
1412
IngestionPipelineStageKey,
1513
{
1614
count: number
17-
done: boolean
1815
error: boolean
1916
files: {
2017
[fileName: string]: {
2118
count: number
22-
done: boolean
2319
error: boolean
2420
}
2521
}
@@ -32,7 +28,6 @@ const getInitialProgressState = () =>
3228
stage,
3329
{
3430
count: 0,
35-
done: false,
3631
error: false,
3732
files: {},
3833
},
@@ -44,25 +39,14 @@ type Action = { type: 'UPDATE'; payload: ProgressEvent } | { type: 'RESET' }
4439
const progressReducer = (state: ProgressState, action: Action): ProgressState => {
4540
switch (action.type) {
4641
case 'UPDATE': {
47-
const { stage, items, done, error } = action.payload
48-
49-
if (done) {
50-
return {
51-
...state,
52-
[stage]: {
53-
...state[stage],
54-
done: true,
55-
error: !!error || false,
56-
},
57-
}
58-
}
42+
const { stage, items, error } = action.payload
5943

6044
if (!items) return state
6145

6246
const updatedFiles = items.reduce(
6347
(acc, fileName) => {
6448
if (!acc[fileName]) {
65-
acc[fileName] = { count: 0, done: done || false, error: !!error || false }
49+
acc[fileName] = { count: 0, error: !!error || false }
6650
}
6751
acc[fileName].count += 1
6852
return acc
@@ -74,7 +58,6 @@ const progressReducer = (state: ProgressState, action: Action): ProgressState =>
7458
...state,
7559
[stage]: {
7660
...state[stage],
77-
done: done || state[stage]?.done,
7861
error: !!error || state[stage]?.error,
7962
count: state[stage]?.count + items.length,
8063
files: updatedFiles,
@@ -144,8 +127,6 @@ export const ProgressReporter: React.FC<{ filenames: string[]; stream: ReadableS
144127
}
145128
}, [stream])
146129

147-
console.log('Progress state:', progress)
148-
149130
return (
150131
<Table size="small">
151132
<TableHead>
@@ -155,13 +136,16 @@ export const ProgressReporter: React.FC<{ filenames: string[]; stream: ReadableS
155136
<TableCell key={stage}>
156137
<Typography variant="body2">{IngestionPipelineStages[stage].name}</Typography>
157138
<Typography variant="caption" color="textSecondary">
158-
{progress[stage].count}{' '}
159-
</Typography>
160-
<Typography variant="caption" color="textSecondary">
161-
{progress[stage].done ? 'Done' : progress[stage].error ? 'Error' : 'In Progress'}
139+
{progress[stage].count}/{filenames.length}
162140
</Typography>
141+
{progress[stage].error && (
142+
<Typography variant="caption" color="error">
143+
Error
144+
</Typography>
145+
)}
163146
</TableCell>
164147
))}
148+
<TableCell>Status</TableCell>
165149
</TableRow>
166150
</TableHead>
167151
<TableBody>
@@ -170,30 +154,31 @@ export const ProgressReporter: React.FC<{ filenames: string[]; stream: ReadableS
170154
<TableCell component="th" scope="row">
171155
{filename}
172156
</TableCell>
173-
{IngestionPipelineStageKeys.map((stage) => (
174-
<TableCell
175-
key={stage}
176-
sx={{
177-
transition: 'background-color 0.3s',
178-
backgroundColor: progress[stage]?.error
179-
? 'error.light'
180-
: progress[stage]?.done
181-
? 'success.light'
182-
: progress[stage]?.files[filename]?.error
183-
? 'error.light'
184-
: progress[stage]?.files[filename]?.count
185-
? 'info.light'
186-
: 'inherit',
187-
}}
188-
>
189-
<Box display="flex" gap={2}>
190-
<Typography variant="body2">{progress[stage].files[filename]?.count > 1 || ''}</Typography>
191-
<Typography variant="caption" color="textSecondary">
192-
{progress[stage]?.files[filename]?.done ? 'Done' : progress[stage].files[filename]?.error ? 'Error' : ''}
193-
</Typography>
194-
</Box>
195-
</TableCell>
196-
))}
157+
<TableCell colSpan={IngestionPipelineStageKeys.length}>
158+
<LinearProgress
159+
variant="determinate"
160+
value={(IngestionPipelineStageKeys.reduce((acc, stage) => acc + (progress[stage].files[filename]?.count ? 1 : 0), 0) / IngestionPipelineStageKeys.length) * 100}
161+
/>
162+
</TableCell>
163+
<TableCell>
164+
{IngestionPipelineStageKeys.some((stage) => progress[stage].files[filename]?.error) ? (
165+
<Typography variant="body2" color="error">
166+
Error
167+
</Typography>
168+
) : progress['store']?.files[filename]?.count > 0 ? (
169+
<Typography variant="body2" color="textSecondary">
170+
Completed
171+
</Typography>
172+
) : IngestionPipelineStageKeys.some((stage) => progress[stage].files[filename]?.count) ? (
173+
<Typography variant="body2" color="textSecondary">
174+
In Progress
175+
</Typography>
176+
) : (
177+
<Typography variant="body2" color="textSecondary">
178+
Not Started
179+
</Typography>
180+
)}
181+
</TableCell>
197182
</TableRow>
198183
))}
199184
</TableBody>

src/client/components/Rag/Rag.tsx

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ type RagIndexAttributes = {
2424
dim: number
2525
}
2626
numOfChunks: number
27+
filenames: string[]
2728
}
2829

2930
const useRagIndices = () => {
@@ -135,7 +136,7 @@ const Rag: React.FC = () => {
135136

136137
return (
137138
<Box sx={{ display: 'flex', gap: 2 }}>
138-
<Dialog open={!!selectedIndex && modalOpen} onClose={() => setModalOpen(false)} fullWidth maxWidth="md">
139+
<Dialog open={!!selectedIndex && modalOpen} onClose={() => { setModalOpen(false); refetch(); }} fullWidth maxWidth="md">
139140
<DialogTitle>Edit {selectedIndex?.metadata?.name}</DialogTitle>
140141
<Box sx={{ padding: 2 }}>
141142
<Box sx={{ display: 'flex', gap: 2 }}>
@@ -207,6 +208,7 @@ const Rag: React.FC = () => {
207208
<TableRow>
208209
<TableCell>ID</TableCell>
209210
<TableCell>Name</TableCell>
211+
<TableCell>Files</TableCell>
210212
<TableCell>Dim</TableCell>
211213
<TableCell>Num chunks</TableCell>
212214
</TableRow>
@@ -215,6 +217,17 @@ const Rag: React.FC = () => {
215217
<TableRow>
216218
<TableCell>{index.id}</TableCell>
217219
<TableCell>{index.metadata.name}</TableCell>
220+
<TableCell>
221+
{index.filenames.length ? (
222+
<>
223+
{index.filenames.map((filename) => (
224+
<p key={filename}>{filename}</p>
225+
))}
226+
</>
227+
) : (
228+
'No files'
229+
)}
230+
</TableCell>
218231
<TableCell>{index.metadata.dim}</TableCell>
219232
<TableCell>{index.numOfChunks}</TableCell>
220233
</TableRow>
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
import { QueryInterface, DataTypes } from 'sequelize'
2+
3+
export async function up({ context: queryInterface }) {
4+
await queryInterface.addColumn('rag_indices', 'filenames', {
5+
type: DataTypes.ARRAY(DataTypes.STRING),
6+
allowNull: false,
7+
comment: 'Original filenames of the files uploaded for this index',
8+
})
9+
}
10+
11+
export async function down({ context: queryInterface }) {
12+
await queryInterface.removeColumn('rag_indices', 'filenames')
13+
}

src/server/db/models/ragIndex.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ class RagIndex extends Model<InferAttributes<RagIndex>, InferCreationAttributes<
1111
declare courseId?: string
1212

1313
declare metadata: RagIndexMetadata
14+
15+
declare filenames: string[]
1416
}
1517

1618
RagIndex.init(
@@ -33,6 +35,10 @@ RagIndex.init(
3335
type: DataTypes.JSONB,
3436
allowNull: true,
3537
},
38+
filenames: {
39+
type: DataTypes.ARRAY(DataTypes.STRING),
40+
allowNull: false,
41+
},
3642
},
3743
{
3844
underscored: true,

src/server/routes/rag.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,13 @@
11
import { NextFunction, Request, Response, Router } from 'express'
2-
import { EMBED_DIM, EMBED_MODEL } from '../../config'
2+
import { EMBED_DIM } from '../../config'
33
import { createChunkIndex, deleteChunkIndex, getNumberOfChunks } from '../services/rag/chunkDb'
44
import { RagIndex } from '../db/models'
55
import { RequestWithUser } from '../types'
66
import z from 'zod'
77
import { queryRagIndex } from '../services/rag/query'
88
import { ingestionPipeline } from '../services/rag/ingestion/pipeline'
9-
import { getAzureOpenAIClient } from '../util/azure'
109
import multer from 'multer'
1110
import { mkdir, rm, stat } from 'fs/promises'
12-
import { Readable } from 'stream'
1311
import { getOllamaOpenAIClient } from '../util/ollama'
1412

1513
const router = Router()
@@ -30,7 +28,9 @@ router.post('/indices', async (req, res) => {
3028
metadata: {
3129
name,
3230
dim,
31+
numOfChunks: 0,
3332
},
33+
filenames: [],
3434
})
3535

3636
await createChunkIndex(ragIndex)
Lines changed: 12 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,47 +1,21 @@
1-
import { Transform } from 'node:stream'
21
import { chunkingAlgorithms } from './chunkingAlgorithms.ts'
32
import { mkdirSync } from 'node:fs'
43
import { writeFile } from 'node:fs/promises'
54
import { TextData } from './textExtractor.ts'
6-
import { StageReporter } from './progressReporter.ts'
75

8-
export class Chunker extends Transform {
9-
private cachePath: string
10-
public progressReporter: StageReporter
6+
export async function createChunks(data: TextData, cachePath: string) {
7+
const chunksDir = `${cachePath}/chunks`
8+
mkdirSync(chunksDir, { recursive: true })
119

12-
constructor(cachePath: string) {
13-
super({ objectMode: true })
10+
const chunkingAlgorithm = chunkingAlgorithms[data.chunkingStrategy]
11+
const chunks = chunkingAlgorithm(data)
1412

15-
this.cachePath = cachePath + '/chunks'
13+
await Promise.all(
14+
chunks.map((chunk) => {
15+
const chunkPath = `${chunksDir}/${chunk.id}.json`
16+
return writeFile(chunkPath, JSON.stringify(chunk, null, 2), 'utf-8')
17+
}),
18+
)
1619

17-
// Make sure the cache path exists
18-
mkdirSync(this.cachePath, { recursive: true })
19-
}
20-
21-
async _transform(data: TextData, _encoding: BufferEncoding, callback: (error?: Error | null) => void) {
22-
const chunkingAlgorithm = chunkingAlgorithms[data.chunkingStrategy]
23-
24-
const chunks = chunkingAlgorithm(data)
25-
for (const chunk of chunks) {
26-
this.push(chunk)
27-
}
28-
29-
// Save chunks to cache
30-
31-
await Promise.all(
32-
chunks.map((chunk) => {
33-
const chunkPath = `${this.cachePath}/${chunk.id}.json`
34-
return writeFile(chunkPath, JSON.stringify(chunk, null, 2), 'utf-8')
35-
}),
36-
)
37-
38-
this.progressReporter.reportProgress([data.fileName])
39-
40-
callback()
41-
}
42-
43-
_flush(callback: (error?: Error | null) => void) {
44-
this.progressReporter.reportDone()
45-
callback()
46-
}
20+
return chunks
4721
}

0 commit comments

Comments
 (0)