Skip to content

Commit a245386

Browse files
authored
Merge pull request #584 from UniversityOfHelsinkiCS/feat-advancedParsing-per-file
Feat advanced parsing per file
2 parents 74a11b2 + e210982 commit a245386

File tree

13 files changed

+209
-102
lines changed

13 files changed

+209
-102
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,4 @@ minio-data/*
2020
*.s3cfg
2121
# Sentry Config File
2222
.env.sentry-build-plugin
23+
*/copilot-instructions.md

src/client/components/Rag/RagCreator.tsx

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,9 @@ import {
77
DialogContentText,
88
DialogTitle,
99
FormControl,
10-
FormControlLabel,
1110
InputLabel,
1211
MenuItem,
1312
Select,
14-
Switch,
1513
TextField,
1614
} from '@mui/material'
1715
import { OutlineButtonBlack } from '../ChatV2/general/Buttons'
@@ -26,7 +24,6 @@ export const RagCreator = ({ chatInstance }: { chatInstance: Course }) => {
2624
const createIndexMutation = useCreateRagIndexMutation()
2725
const [indexName, setIndexName] = useState('')
2826
const [language, setLanguage] = useState<'Finnish' | 'English' | 'Swedish'>('English')
29-
const [advancedParsing, setAdvancedParsing] = useState(false)
3027
const [open, setOpen] = useState(false)
3128

3229
return (
@@ -46,7 +43,6 @@ export const RagCreator = ({ chatInstance }: { chatInstance: Course }) => {
4643
chatInstanceId: chatInstance?.id,
4744
name: indexName,
4845
language,
49-
advancedParsing,
5046
})
5147
setIndexName('')
5248
navigate(`?index=${newIndex.id}`)
@@ -87,11 +83,6 @@ export const RagCreator = ({ chatInstance }: { chatInstance: Course }) => {
8783
<MenuItem value={RAG_LANGUAGES[2]}>{t('rag:english')}</MenuItem>
8884
</Select>
8985
</FormControl>
90-
<DialogContentText>{t('rag:advancedParsingGuide')}</DialogContentText>
91-
<FormControlLabel
92-
control={<Switch checked={advancedParsing} onChange={(e) => setAdvancedParsing(e.target.checked)} />}
93-
label={t('rag:advancedParsing')}
94-
/>
9586
</DialogContent>
9687
<DialogActions>
9788
<OutlineButtonBlack color="primary" type="submit" data-testid="ragIndexCreateSubmit">

src/client/components/Rag/RagFileDetails.tsx

Lines changed: 75 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { Box, Divider, Link, Paper, styled, Table, TableBody, TableCell, TableHead, TableRow, Typography } from '@mui/material'
1+
import { Box, Divider, Link, styled, TableBody, TableCell, TableHead, TableRow, Typography } from '@mui/material'
22
import { IngestionJobStatus, IngestionPipelineStageKey } from '@shared/ingestion'
33
import { Link as RouterLink } from 'react-router-dom'
44
import type { RagFileAttributes } from '@shared/types'
@@ -7,6 +7,7 @@ import DownloadDone from '@mui/icons-material/DownloadDone'
77
import ErrorOutline from '@mui/icons-material/ErrorOutline'
88
import PendingOutlined from '@mui/icons-material/PendingOutlined'
99
import SettingsOutlined from '@mui/icons-material/SettingsOutlined'
10+
import AutoFixHigh from '@mui/icons-material/AutoFixHigh'
1011
import { locales } from '../../locales/locales'
1112
import { useTranslation } from 'react-i18next'
1213
import { formatDistanceStrict } from 'date-fns'
@@ -36,13 +37,44 @@ const ProgressIcon: Record<FileStage, React.ReactNode> = {
3637
queued: <PendingOutlined />,
3738
}
3839

40+
const HideOnSmall = styled(TableCell)(({ theme }) => ({
41+
[theme.breakpoints.down('md')]: {
42+
display: 'none',
43+
},
44+
}))
45+
46+
const HideOnSmallHead = styled(TableCell)(({ theme }) => ({
47+
[theme.breakpoints.down('md')]: {
48+
display: 'none',
49+
},
50+
})) as typeof TableCell
51+
52+
export const RagFileTableHead: React.FC = () => {
53+
const { t } = useTranslation()
54+
return (
55+
<TableHead>
56+
<TableRow>
57+
<TableCell><strong>{t('rag:fileName')}</strong></TableCell>
58+
<HideOnSmallHead><strong>{t('common:fileType')}</strong></HideOnSmallHead>
59+
<TableCell><strong>{t('rag:fileSizeKb')}</strong></TableCell>
60+
<TableCell><strong>{t('rag:parsing')}</strong></TableCell>
61+
<HideOnSmallHead><strong>{t('common:added')}</strong></HideOnSmallHead>
62+
<TableCell><strong>{t('rag:fileStatus')}</strong></TableCell>
63+
<TableCell />
64+
</TableRow>
65+
</TableHead>
66+
)
67+
}
68+
3969
export const RagFileInfo: React.FC<{
4070
file: RagFileAttributes
4171
index: number
4272
status?: IngestionJobStatus
4373
uploadProgress?: number
4474
}> = ({ file, index, status, uploadProgress }) => {
4575
const { t, i18n } = useTranslation()
76+
const usedAdvancedParsing = !!(file.metadata as Record<string, unknown> | null)?.advancedParsing
77+
const isPdf = file.fileType === 'application/pdf'
4678

4779
const pipelineStage = status?.pipelineStage ?? file.pipelineStage
4880

@@ -60,55 +92,52 @@ export const RagFileInfo: React.FC<{
6092
const progressIcon = ProgressIcon[fileStage]
6193

6294
return (
63-
<>
64-
<Box display="flex" alignItems="center" gap={2}>
65-
<Box sx={{ flex: 3 }}>
66-
<Box display="flex" width="100%" alignItems="center">
67-
<Link to={`?index=${index}&file=${file.id}`} component={RouterLink}>
68-
<Typography variant="subtitle1">{file.filename}</Typography>
69-
</Link>
70-
<Typography variant="body2" color="text.secondary" sx={{ marginLeft: 'auto' }}>
71-
{t('common:added')} {new Date(file.createdAt).toLocaleString(locales[i18n.language].code)}
95+
<TableRow>
96+
<TableCell sx={{ maxWidth: 200, wordBreak: 'break-word' }}>
97+
<Link to={`?index=${index}&file=${file.id}`} component={RouterLink}>
98+
{file.filename}
99+
</Link>
100+
</TableCell>
101+
<HideOnSmall>{file.fileType}</HideOnSmall>
102+
<TableCell>{(file.fileSize / 1024).toFixed()} kB</TableCell>
103+
<TableCell>
104+
{isPdf && (
105+
<Box display="flex" alignItems="center" gap={0.5}>
106+
{usedAdvancedParsing && <AutoFixHigh sx={{ fontSize: 16, color: 'primary.main' }} />}
107+
<Typography variant="body2" color={usedAdvancedParsing ? 'primary.main' : 'text.secondary'}>
108+
{usedAdvancedParsing ? t('rag:advancedParsing') : t('rag:standardParsing')}
72109
</Typography>
73110
</Box>
74-
<Table size="small">
75-
<TableHead>
76-
<TableRow>
77-
<TableCell>{t('common:fileType')}</TableCell>
78-
<TableCell>{t('rag:fileSize')}</TableCell>
79-
</TableRow>
80-
</TableHead>
81-
<TableBody>
82-
<TableRow>
83-
<TableCell>{file.fileType}</TableCell>
84-
<TableCell>{(file.fileSize / 1024).toFixed()} kB</TableCell>
85-
</TableRow>
86-
</TableBody>
87-
</Table>
88-
</Box>
89-
<Box sx={{ ml: '2rem', flex: 1, display: 'flex', alignItems: 'center', gap: 2 }}>
90-
<Box display="flex" alignItems="center" gap={1} color={`${accentColor}.main`}>
91-
{progressIcon}
92-
<div>
93-
<Typography variant="body2" sx={{ color: 'text.secondary' }}>
94-
{message}
111+
)}
112+
</TableCell>
113+
<HideOnSmall>
114+
<Typography variant="body2" color="text.secondary">
115+
{new Date(file.createdAt).toLocaleString(locales[i18n.language].code)}
116+
</Typography>
117+
</HideOnSmall>
118+
<TableCell>
119+
<Box display="flex" alignItems="center" gap={1} color={`${accentColor}.main`}>
120+
{progressIcon}
121+
<div>
122+
<Typography variant="body2" sx={{ color: 'text.secondary' }}>
123+
{message}
124+
</Typography>
125+
{status?.eta && (
126+
<Typography variant="body2" component="div" sx={{ color: 'text.secondary' }}>
127+
{formatDistanceStrict(0, status?.eta)} left
95128
</Typography>
96-
{status?.eta && (
97-
<Typography variant="body2" component="div" sx={{ color: 'text.secondary' }}>
98-
{formatDistanceStrict(0, status?.eta)} left
99-
</Typography>
100-
)}
101-
{error?.length && (
102-
<Typography variant="body2" color="error">
103-
{t('error:errorMessage')}: {status?.error ?? file.error}
104-
</Typography>
105-
)}
106-
</div>
107-
</Box>
129+
)}
130+
{error?.length && (
131+
<Typography variant="body2" color="error">
132+
{t('error:errorMessage')}: {status?.error ?? file.error}
133+
</Typography>
134+
)}
135+
</div>
108136
</Box>
137+
</TableCell>
138+
<TableCell>
109139
<CircularProgressWithLabel progress={progress} accentColor={accentColor} />
110-
</Box>
111-
<Divider sx={{ my: 2 }} />
112-
</>
140+
</TableCell>
141+
</TableRow>
113142
)
114143
}

src/client/components/Rag/RagIndex.tsx

Lines changed: 83 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -8,19 +8,28 @@ import {
88
Container,
99
DialogTitle,
1010
DialogContent,
11+
DialogActions,
1112
Dialog,
1213
Link,
1314
CircularProgress,
1415
Breadcrumbs,
1516
Divider,
17+
FormControlLabel,
18+
Switch,
19+
List,
20+
ListItem,
21+
ListItemText,
22+
Table,
23+
TableBody,
24+
TableContainer,
1625
} from '@mui/material'
1726
import { useNavigate, useParams, Link as RouterLink, useSearchParams } from 'react-router-dom'
1827
import Autorenew from '@mui/icons-material/Autorenew'
1928
import CloudUpload from '@mui/icons-material/CloudUpload'
2029
import DeleteOutline from '@mui/icons-material/DeleteOutline'
2130
import FindInPage from '@mui/icons-material/FindInPage'
2231
import { orderBy } from 'lodash'
23-
import { RagFileInfo } from './RagFileDetails'
32+
import { RagFileInfo, RagFileTableHead } from './RagFileDetails'
2433
import { RagIndexDetails, useDeleteRagIndexMutation, useRagIndexDetails, useRagIndexJobs, useUploadMutation } from './api'
2534
import { Search } from './Search'
2635
import { useTranslation } from 'react-i18next'
@@ -59,6 +68,8 @@ export const RagIndex: React.FC = () => {
5968
const deleteIndexMutation = useDeleteRagIndexMutation(id)
6069
const [refetchInterval, setRefetchInterval] = React.useState(60 * 1000)
6170
const [uploadProgress, setUploadProgress] = React.useState(0)
71+
const [stagedFiles, setStagedFiles] = React.useState<File[]>([])
72+
const [advancedParsing, setAdvancedParsing] = React.useState<boolean[]>([])
6273
const { data: ragDetails, isSuccess, refetch } = useRagIndexDetails(id)
6374
const { data: ragFileStatuses, refetch: refetchStatuses } = useRagIndexJobs(id, refetchInterval)
6475
const uploadMutation = useUploadMutation({ index: ragDetails, onUploadProgress: setUploadProgress })
@@ -78,7 +89,12 @@ export const RagIndex: React.FC = () => {
7889
return <LinearProgress />
7990
}
8091

81-
const handleUpload = async (files: File[]) => {
92+
const handleStageFiles = (files: File[]) => {
93+
setStagedFiles(files)
94+
setAdvancedParsing(files.map(() => false))
95+
}
96+
97+
const handleUpload = async (files: File[], perFileAdvancedParsing: boolean[]) => {
8298
setUploadProgress(0)
8399
queryClient.setQueryData<RagIndexDetails>(['ragIndex', id], (old) => {
84100
if (!old) return old
@@ -105,7 +121,9 @@ export const RagIndex: React.FC = () => {
105121
],
106122
}
107123
})
108-
await uploadMutation.mutateAsync(Array.from(files))
124+
await uploadMutation.mutateAsync({ files: Array.from(files), advancedParsing: perFileAdvancedParsing })
125+
setStagedFiles([])
126+
setAdvancedParsing([])
109127
refetch()
110128
refetchStatuses()
111129
}
@@ -137,8 +155,9 @@ export const RagIndex: React.FC = () => {
137155
onChange={async (e) => {
138156
const files = e.target.files
139157
if (files && files.length > 0) {
140-
await handleUpload(Array.from(files))
158+
handleStageFiles(Array.from(files))
141159
}
160+
e.target.value = ''
142161
}}
143162
multiple
144163
/>
@@ -190,24 +209,74 @@ export const RagIndex: React.FC = () => {
190209
<OutlineButtonBlack
191210
startIcon={<Autorenew />}
192211
onClick={async () => {
193-
await handleUpload([])
212+
await handleUpload([], [])
194213
}}
195214
>
196215
{t('rag:retryFailedFiles')}
197216
</OutlineButtonBlack>
198217
)}
199218
</Box>
200-
{orderBy(ragDetails.ragFiles, [(f) => Date.parse(f.createdAt as unknown as string)], ['desc']).map((file) => (
201-
<RagFileInfo
202-
key={file.id}
203-
file={file}
204-
index={id}
205-
status={ragFileStatuses?.find((rfs) => rfs.ragFileId === file.id)}
206-
uploadProgress={uploadMutation.isPending ? uploadProgress : undefined}
207-
/>
208-
))}
219+
<TableContainer sx={{ overflowX: 'hidden' }}>
220+
<Table size="small">
221+
<RagFileTableHead />
222+
<TableBody>
223+
{orderBy(ragDetails.ragFiles, [(f) => Date.parse(f.createdAt as unknown as string)], ['desc']).map((file) => (
224+
<RagFileInfo
225+
key={file.id}
226+
file={file}
227+
index={id}
228+
status={ragFileStatuses?.find((rfs) => rfs.ragFileId === file.id)}
229+
uploadProgress={uploadMutation.isPending ? uploadProgress : undefined}
230+
/>
231+
))}
232+
</TableBody>
233+
</Table>
234+
</TableContainer>
209235
</Box>
210236
</Box>
237+
<Dialog open={stagedFiles.length > 0} onClose={() => setStagedFiles([])} fullWidth maxWidth="sm">
238+
<DialogTitle>{t('rag:uploadFiles')}</DialogTitle>
239+
<DialogContent>
240+
<Typography variant="body2" color="text.secondary" mb={1}>
241+
{t('rag:advancedParsingGuide')}
242+
</Typography>
243+
<List dense>
244+
{stagedFiles.map((file, idx) => (
245+
<ListItem key={file.name} disableGutters>
246+
<ListItemText primary={file.name} secondary={`${(file.size / 1024).toFixed(0)} KB`} />
247+
<FormControlLabel
248+
control={
249+
<Switch
250+
checked={advancedParsing[idx] ?? false}
251+
onChange={(e) => {
252+
setAdvancedParsing((prev) => {
253+
const next = [...prev]
254+
next[idx] = e.target.checked
255+
return next
256+
})
257+
}}
258+
/>
259+
}
260+
label={t('rag:advancedParsing')}
261+
labelPlacement="start"
262+
/>
263+
</ListItem>
264+
))}
265+
</List>
266+
</DialogContent>
267+
<DialogActions>
268+
<OutlineButtonBlack onClick={() => setStagedFiles([])}>{t('common:cancel')}</OutlineButtonBlack>
269+
<BlueButton
270+
variant="contained"
271+
onClick={async () => {
272+
await handleUpload(stagedFiles, advancedParsing)
273+
}}
274+
disabled={uploadMutation.isPending}
275+
>
276+
{uploadMutation.isPending ? t('rag:uploading') : t('rag:uploadFiles')}
277+
</BlueButton>
278+
</DialogActions>
279+
</Dialog>
211280
</Box>
212281
)
213282
}

src/client/components/Rag/api.ts

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,11 @@ import queryClient from '../../util/queryClient'
66

77
export const useCreateRagIndexMutation = () => {
88
const mutation = useMutation({
9-
mutationFn: async ({ chatInstanceId, name, language, advancedParsing }: RagIndexMetadata & { chatInstanceId: string }) => {
9+
mutationFn: async ({ chatInstanceId, name, language }: RagIndexMetadata & { chatInstanceId: string }) => {
1010
const response = await apiClient.post('/rag/indices', {
1111
name,
1212
chatInstanceId,
1313
language,
14-
advancedParsing,
1514
})
1615
return response.data
1716
},
@@ -78,14 +77,14 @@ export const useUpdateRagIndexMutation = (indexId: number) => {
7877

7978
export const useUploadMutation = ({ index, onUploadProgress = () => {} }: { index?: RagIndexAttributes; onUploadProgress?: (progress: number) => void }) => {
8079
const mutation = useMutation({
81-
mutationFn: async (files: File[]) => {
80+
mutationFn: async ({ files, advancedParsing }: { files: File[]; advancedParsing: boolean[] }) => {
8281
if (!index) {
8382
throw new Error('Index is required')
8483
}
8584
const formData = new FormData()
86-
// Append each file individually
87-
files.forEach((file) => {
85+
files.forEach((file, idx) => {
8886
formData.append('files', file)
87+
formData.append('advancedParsing', String(advancedParsing[idx] ?? false))
8988
})
9089

9190
const res = await apiClient.post(`/rag/indices/${index.id}/upload`, formData, {

0 commit comments

Comments
 (0)