diff --git a/packages/global/common/system/types/index.d.ts b/packages/global/common/system/types/index.d.ts index 7710ee12fd38..e733dfb2882e 100644 --- a/packages/global/common/system/types/index.d.ts +++ b/packages/global/common/system/types/index.d.ts @@ -139,16 +139,21 @@ export type SystemEnvType = { oneapiUrl?: string; chatApiKey?: string; - customPdfParse?: customPdfParseType; + customPdfParse?: SystemEnvCustomPdfParseType; }; export type customPdfParseType = { + name: string; + desc: string; url?: string; key?: string; doc2xKey?: string; price?: number; + extension?: string; }; +export type SystemEnvCustomPdfParseType = customPdfParseType[]; + export type LicenseDataType = { startTime: string; expiredTime: string; diff --git a/packages/global/core/app/constants.ts b/packages/global/core/app/constants.ts index 7a0cf126b133..1000d99e85f7 100644 --- a/packages/global/core/app/constants.ts +++ b/packages/global/core/app/constants.ts @@ -46,7 +46,8 @@ export const defaultChatInputGuideConfig = { export const defaultAppSelectFileConfig: AppFileSelectConfigType = { canSelectFile: false, canSelectImg: false, - maxFiles: 10 + maxFiles: 10, + customPdfParse: '' }; export enum AppTemplateTypeEnum { diff --git a/packages/global/core/app/type.d.ts b/packages/global/core/app/type.d.ts index a3514d88302f..4c5161f4c9c0 100644 --- a/packages/global/core/app/type.d.ts +++ b/packages/global/core/app/type.d.ts @@ -205,7 +205,7 @@ export type AppAutoExecuteConfigType = { // File export type AppFileSelectConfigType = { canSelectFile: boolean; - customPdfParse?: boolean; + customPdfParse?: string; canSelectImg: boolean; maxFiles: number; }; diff --git a/packages/global/core/dataset/api.d.ts b/packages/global/core/dataset/api.d.ts index 1a3935127018..617aa80c6704 100644 --- a/packages/global/core/dataset/api.d.ts +++ b/packages/global/core/dataset/api.d.ts @@ -44,7 +44,7 @@ type DatasetCollectionStoreDataType = ChunkSettingsType & { parentId?: string; metadata?: Record; - customPdfParse?: boolean; + customPdfParse?: string; }; // create collection params diff --git a/packages/global/core/dataset/type.d.ts b/packages/global/core/dataset/type.d.ts index 26df8a5305ae..c3f0d45f4d96 100644 --- a/packages/global/core/dataset/type.d.ts +++ b/packages/global/core/dataset/type.d.ts @@ -125,7 +125,7 @@ export type DatasetCollectionSchemaType = ChunkSettingsType & { }; // Parse settings - customPdfParse?: boolean; + customPdfParse?: string; trainingType: DatasetCollectionDataProcessModeEnum; }; diff --git a/packages/service/common/file/gridfs/controller.ts b/packages/service/common/file/gridfs/controller.ts index b69d9fcde19e..f75c594c3daf 100644 --- a/packages/service/common/file/gridfs/controller.ts +++ b/packages/service/common/file/gridfs/controller.ts @@ -199,14 +199,14 @@ export const readFileContentFromMongo = async ({ tmbId, bucketName, fileId, - customPdfParse = false, + customPdfParse, getFormatText }: { teamId: string; tmbId: string; bucketName: `${BucketNameEnum}`; fileId: string; - customPdfParse?: boolean; + customPdfParse?: string; getFormatText?: boolean; // 数据类型都尽可能转化成 markdown 格式 }): Promise<{ rawText: string; diff --git a/packages/service/common/file/read/utils.ts b/packages/service/common/file/read/utils.ts index 461da09dc0c4..aabe0acd6175 100644 --- a/packages/service/common/file/read/utils.ts +++ b/packages/service/common/file/read/utils.ts @@ -15,7 +15,7 @@ export type readRawTextByLocalFileParams = { tmbId: string; path: string; encoding: string; - customPdfParse?: boolean; + customPdfParse?: string; getFormatText?: boolean; metadata?: Record; }; @@ -46,7 +46,7 @@ export const readRawContentByFileBuffer = async ({ buffer, encoding, metadata, - customPdfParse = false, + customPdfParse, getFormatText = true }: { teamId: string; @@ -57,7 +57,7 @@ export const readRawContentByFileBuffer = async ({ encoding: string; metadata?: Record; - customPdfParse?: boolean; + customPdfParse?: string; getFormatText?: boolean; }): Promise<{ rawText: string; @@ -68,9 +68,9 @@ export const readRawContentByFileBuffer = async ({ encoding, buffer }); - const parsePdfFromCustomService = async (): Promise => { - const url = global.systemEnv.customPdfParse?.url; - const token = global.systemEnv.customPdfParse?.key; + const parsePdfFromCustomService = async (parser: any): Promise => { + const url = parser.url; + const token = parser.key; if (!url) return systemParse(); const start = Date.now(); @@ -104,7 +104,8 @@ export const readRawContentByFileBuffer = async ({ createPdfParseUsage({ teamId, tmbId, - pages: response.pages + pages: response.pages, + parserName: customPdfParse }); return { @@ -114,8 +115,8 @@ export const readRawContentByFileBuffer = async ({ }; }; // Doc2x api - const parsePdfFromDoc2x = async (): Promise => { - const doc2xKey = global.systemEnv.customPdfParse?.doc2xKey; + const parsePdfFromDoc2x = async (parser: any): Promise => { + const doc2xKey = parser.doc2xKey; if (!doc2xKey) return systemParse(); const { pages, text, imageList } = await useDoc2xServer({ apiKey: doc2xKey }).parsePDF(buffer); @@ -123,7 +124,8 @@ export const readRawContentByFileBuffer = async ({ createPdfParseUsage({ teamId, tmbId, - pages + pages, + parserName: customPdfParse }); return { @@ -135,8 +137,14 @@ export const readRawContentByFileBuffer = async ({ // Custom read file service const pdfParseFn = async (): Promise => { if (!customPdfParse) return systemParse(); - if (global.systemEnv.customPdfParse?.url) return parsePdfFromCustomService(); - if (global.systemEnv.customPdfParse?.doc2xKey) return parsePdfFromDoc2x(); + + const parsers = global.systemEnv.customPdfParse || []; + const selectedParser = parsers.find((parser) => parser.name === customPdfParse); + + if (!selectedParser) return systemParse(); + + if (selectedParser.url) return parsePdfFromCustomService(selectedParser); + if (selectedParser.doc2xKey) return parsePdfFromDoc2x(selectedParser); return systemParse(); }; @@ -145,9 +153,15 @@ export const readRawContentByFileBuffer = async ({ addLog.debug(`Start parse file`, { extension }); let { rawText, formatText, imageList } = await (async () => { - if (extension === 'pdf') { + // Check if any parser supports this extension + const parsers = global.systemEnv.customPdfParse || []; + const selectedParser = parsers.find((parser) => parser.name === customPdfParse); + const ext = selectedParser?.extension?.split(','); + + if (ext?.includes(extension)) { return await pdfParseFn(); } + return await systemParse(); })(); diff --git a/packages/service/common/system/tools.ts b/packages/service/common/system/tools.ts index 0f5cff06f92f..44328cd14abc 100644 --- a/packages/service/common/system/tools.ts +++ b/packages/service/common/system/tools.ts @@ -11,9 +11,8 @@ export const initFastGPTConfig = (config?: FastGPTConfigFileType) => { if (!config) return; // Special config computed - config.feConfigs.showCustomPdfParse = - !!config.systemEnv.customPdfParse?.url || !!config.systemEnv.customPdfParse?.doc2xKey; - config.feConfigs.customPdfParsePrice = config.systemEnv.customPdfParse?.price || 0; + const parsers = config.systemEnv.customPdfParse || []; + config.feConfigs.showCustomPdfParse = parsers.length > 0; global.feConfigs = config.feConfigs; global.systemEnv = config.systemEnv; diff --git a/packages/service/core/dataset/apiDataset/custom/api.ts b/packages/service/core/dataset/apiDataset/custom/api.ts index 277ca4eb93b9..bd917d9eeeaf 100644 --- a/packages/service/core/dataset/apiDataset/custom/api.ts +++ b/packages/service/core/dataset/apiDataset/custom/api.ts @@ -131,7 +131,7 @@ export const useApiDatasetRequest = ({ apiServer }: { apiServer: APIFileServer } teamId: string; tmbId: string; apiFileId: string; - customPdfParse?: boolean; + customPdfParse?: string; }): Promise => { const data = await request< { diff --git a/packages/service/core/dataset/collection/schema.ts b/packages/service/core/dataset/collection/schema.ts index 61ddc48a099b..62f90cfffd1a 100644 --- a/packages/service/core/dataset/collection/schema.ts +++ b/packages/service/core/dataset/collection/schema.ts @@ -80,7 +80,7 @@ const DatasetCollectionSchema = new Schema({ forbid: Boolean, // Parse settings - customPdfParse: Boolean, + customPdfParse: String, apiFileParentId: String, // Chunk settings diff --git a/packages/service/core/dataset/read.ts b/packages/service/core/dataset/read.ts index e6f417cc858b..98c7eaaac760 100644 --- a/packages/service/core/dataset/read.ts +++ b/packages/service/core/dataset/read.ts @@ -29,7 +29,7 @@ export const readFileRawTextByUrl = async ({ teamId: string; tmbId: string; url: string; - customPdfParse?: boolean; + customPdfParse?: string; getFormatText?: boolean; relatedId: string; // externalFileId / apiFileId maxFileSize?: number; @@ -161,7 +161,7 @@ export const readDatasetSourceRawText = async ({ tmbId: string; type: DatasetSourceReadTypeEnum; sourceId: string; - customPdfParse?: boolean; + customPdfParse?: string; getFormatText?: boolean; selector?: string; // link selector @@ -241,7 +241,7 @@ export const readApiServerFileContent = async ({ apiFileId: string; teamId: string; tmbId: string; - customPdfParse?: boolean; + customPdfParse?: string; }): Promise<{ title?: string; rawText: string; diff --git a/packages/service/core/workflow/dispatch/ai/agent/index.ts b/packages/service/core/workflow/dispatch/ai/agent/index.ts index 867f2d2756a7..34ca5af87b36 100644 --- a/packages/service/core/workflow/dispatch/ai/agent/index.ts +++ b/packages/service/core/workflow/dispatch/ai/agent/index.ts @@ -322,7 +322,7 @@ const getMultiInput = async ({ fileLinks?: string[]; requestOrigin?: string; maxFiles: number; - customPdfParse?: boolean; + customPdfParse?: string; inputFiles: UserChatItemValueItemType['file'][]; hasReadFilesTool: boolean; }) => { diff --git a/packages/service/core/workflow/dispatch/ai/chat.ts b/packages/service/core/workflow/dispatch/ai/chat.ts index a9025234444d..a2e3ef4fec66 100644 --- a/packages/service/core/workflow/dispatch/ai/chat.ts +++ b/packages/service/core/workflow/dispatch/ai/chat.ts @@ -416,7 +416,7 @@ async function getMultiInput({ stringQuoteText?: string; // file quote requestOrigin?: string; maxFiles: number; - customPdfParse?: boolean; + customPdfParse?: string; runningUserInfo: ChatDispatchProps['runningUserInfo']; }) { // 旧版本适配====> diff --git a/packages/service/core/workflow/dispatch/tools/readFiles.ts b/packages/service/core/workflow/dispatch/tools/readFiles.ts index 92433796d69d..35896949c29b 100644 --- a/packages/service/core/workflow/dispatch/tools/readFiles.ts +++ b/packages/service/core/workflow/dispatch/tools/readFiles.ts @@ -55,7 +55,7 @@ export const dispatchReadFiles = async (props: Props): Promise => { params: { fileUrlList = [] } } = props; const maxFiles = chatConfig?.fileSelectConfig?.maxFiles || 20; - const customPdfParse = chatConfig?.fileSelectConfig?.customPdfParse || false; + const customPdfParse = chatConfig?.fileSelectConfig?.customPdfParse; // Get files from histories const filesFromHistories = version !== '489' ? [] : getHistoryFileLinks(histories); @@ -126,7 +126,7 @@ export const getFileContentFromLinks = async ({ maxFiles: number; teamId: string; tmbId: string; - customPdfParse?: boolean; + customPdfParse?: string; }) => { const parseUrlList = urls // Remove invalid urls diff --git a/packages/service/support/wallet/usage/controller.ts b/packages/service/support/wallet/usage/controller.ts index c8eed35cd59e..97b854064168 100644 --- a/packages/service/support/wallet/usage/controller.ts +++ b/packages/service/support/wallet/usage/controller.ts @@ -171,13 +171,17 @@ export const createTrainingUsage = async ({ export const createPdfParseUsage = async ({ teamId, tmbId, - pages + pages, + parserName }: { teamId: string; tmbId: string; pages: number; + parserName?: string; }) => { - const unitPrice = global.systemEnv?.customPdfParse?.price || 0; + const parsers = global.systemEnv?.customPdfParse || []; + const selectedParser = parserName ? parsers.find((p) => p.name === parserName) : parsers[0]; + const unitPrice = selectedParser?.price || 0; const totalPoints = pages * unitPrice; createUsage({ diff --git a/packages/web/i18n/en/app.json b/packages/web/i18n/en/app.json index d6d12a739114..5489ed1af035 100644 --- a/packages/web/i18n/en/app.json +++ b/packages/web/i18n/en/app.json @@ -140,6 +140,9 @@ "pdf_enhance_parse": "PDF enhancement analysis", "pdf_enhance_parse_price": "{{price}}Points/page", "pdf_enhance_parse_tips": "Calling PDF recognition model for parsing, you can convert it into Markdown and retain pictures in the document. At the same time, you can also identify scanned documents, which will take a long time to identify them.", + "select_pdf_parser": "Select PDF Parser", + "system_default_parser": "System Default Parser", + "system_default_parser_desc": "Use system built-in PDF parser", "permission.des.manage": "Based on write permissions, you can configure publishing channels, view conversation logs, and assign permissions to the application.", "permission.des.read": "Use the app to have conversations", "permission.des.write": "Can view and edit apps", diff --git a/packages/web/i18n/en/dataset.json b/packages/web/i18n/en/dataset.json index d8c57c4fad05..07df3a676d64 100644 --- a/packages/web/i18n/en/dataset.json +++ b/packages/web/i18n/en/dataset.json @@ -143,6 +143,9 @@ "pdf_enhance_parse": "PDF enhancement analysis", "pdf_enhance_parse_price": "{{price}} points/page", "pdf_enhance_parse_tips": "Calling PDF recognition model for parsing, you can convert it into Markdown and retain pictures in the document. At the same time, you can also identify scanned documents, which will take a long time to identify them.", + "select_pdf_parser": "Select PDF Parser", + "system_default_parser": "System Default Parser", + "system_default_parser_desc": "Use system built-in PDF parser", "permission.des.manage": "Can manage the entire knowledge base data and information", "permission.des.read": "View knowledge base content", "permission.des.write": "Ability to add and change knowledge base content", diff --git a/packages/web/i18n/zh-CN/app.json b/packages/web/i18n/zh-CN/app.json index 6b62f5cf9b23..caa572d96aa4 100644 --- a/packages/web/i18n/zh-CN/app.json +++ b/packages/web/i18n/zh-CN/app.json @@ -140,6 +140,9 @@ "pdf_enhance_parse": "PDF增强解析", "pdf_enhance_parse_price": "{{price}}积分/页", "pdf_enhance_parse_tips": "调用 PDF 识别模型进行解析,可以将其转换成 Markdown 并保留文档中的图片,同时也可以对扫描件进行识别,识别时间较长。", + "select_pdf_parser": "选择PDF解析器", + "system_default_parser": "系统默认解析器", + "system_default_parser_desc": "使用系统内置的PDF解析器", "permission.des.manage": "写权限基础上,可配置发布渠道、查看对话日志、分配该应用权限", "permission.des.read": "可使用该应用进行对话", "permission.des.write": "可查看和编辑应用", diff --git a/packages/web/i18n/zh-CN/dataset.json b/packages/web/i18n/zh-CN/dataset.json index af813c47f329..0f1b936988e1 100644 --- a/packages/web/i18n/zh-CN/dataset.json +++ b/packages/web/i18n/zh-CN/dataset.json @@ -143,6 +143,9 @@ "pdf_enhance_parse": "PDF增强解析", "pdf_enhance_parse_price": "{{price}}积分/页", "pdf_enhance_parse_tips": "调用 PDF 识别模型进行解析,可以将其转换成 Markdown 并保留文档中的图片,同时也可以对扫描件进行识别,识别时间较长。", + "select_pdf_parser": "选择PDF解析器", + "system_default_parser": "系统默认解析器", + "system_default_parser_desc": "使用系统内置的PDF解析器", "permission.des.manage": "可管理整个知识库数据和信息", "permission.des.read": "可查看知识库内容", "permission.des.write": "可增加和变更知识库内容", diff --git a/packages/web/i18n/zh-Hant/app.json b/packages/web/i18n/zh-Hant/app.json index cbe8dcc2ba6a..795fc8cd023f 100644 --- a/packages/web/i18n/zh-Hant/app.json +++ b/packages/web/i18n/zh-Hant/app.json @@ -140,6 +140,9 @@ "pdf_enhance_parse": "PDF 增強解析", "pdf_enhance_parse_price": "{{price}}積分/頁", "pdf_enhance_parse_tips": "呼叫 PDF 識別模型進行解析,可以將其轉換成 Markdown 並保留文件中的圖片,同時也可以對掃描件進行識別,識別時間較長。", + "select_pdf_parser": "選擇PDF解析器", + "system_default_parser": "系統預設解析器", + "system_default_parser_desc": "使用系統內建的PDF解析器", "permission.des.manage": "在寫入權限基礎上,可以設定發布通道、檢視對話紀錄、分配這個應用程式的權限", "permission.des.read": "可以使用這個應用程式進行對話", "permission.des.write": "可以檢視和編輯應用程式", diff --git a/packages/web/i18n/zh-Hant/dataset.json b/packages/web/i18n/zh-Hant/dataset.json index 07bde6b2674d..a708fc6666eb 100644 --- a/packages/web/i18n/zh-Hant/dataset.json +++ b/packages/web/i18n/zh-Hant/dataset.json @@ -143,6 +143,9 @@ "pdf_enhance_parse": "PDF 增強解析", "pdf_enhance_parse_price": "{{price}}積分/頁", "pdf_enhance_parse_tips": "呼叫 PDF 識別模型進行解析,可以將其轉換成 Markdown 並保留文件中的圖片,同時也可以對掃描件進行識別,識別時間較長。", + "select_pdf_parser": "選擇PDF解析器", + "system_default_parser": "系統預設解析器", + "system_default_parser_desc": "使用系統內建的PDF解析器", "permission.des.manage": "可管理整個資料集的資料和資訊", "permission.des.read": "可檢視資料集內容", "permission.des.write": "可新增和變更資料集內容", diff --git a/projects/app/src/components/core/app/FileSelect.tsx b/projects/app/src/components/core/app/FileSelect.tsx index 0b00fa2f1cc0..e497d02c293a 100644 --- a/projects/app/src/components/core/app/FileSelect.tsx +++ b/projects/app/src/components/core/app/FileSelect.tsx @@ -9,8 +9,7 @@ import { HStack, Switch, ModalFooter, - type BoxProps, - Checkbox + type BoxProps } from '@chakra-ui/react'; import React, { useMemo } from 'react'; import { useTranslation } from 'next-i18next'; @@ -25,6 +24,8 @@ import { useSystemStore } from '@/web/common/system/useSystemStore'; import QuestionTip from '@fastgpt/web/components/common/MyTooltip/QuestionTip'; import MyTag from '@fastgpt/web/components/common/Tag/index'; import MyDivider from '@fastgpt/web/components/common/MyDivider'; +import { usePdfParsers } from '@/web/common/system/hooks/usePdfParsers'; +import MySelect from '@fastgpt/web/components/common/MySelect'; const FileSelect = ({ forbidVision = false, @@ -40,6 +41,24 @@ const FileSelect = ({ const { feConfigs } = useSystemStore(); const { isOpen, onOpen, onClose } = useDisclosure(); const maxSelectFiles = Math.min(feConfigs?.uploadFileMaxAmount ?? 20, 30); + const { data: pdfParsers = [] } = usePdfParsers(); + + // 构建选择器列表 + const pdfParserOptions = useMemo( + () => [ + { + label: t('app:system_default_parser'), + value: '', + description: t('app:system_default_parser_desc') + }, + ...pdfParsers.map((parser) => ({ + label: parser.label, + value: parser.value, + description: parser.desc + })) + ], + [pdfParsers, t] + ); const formLabel = useMemo( () => @@ -100,20 +119,24 @@ const FileSelect = ({ {value.canSelectFile && feConfigs.showCustomPdfParse && ( <> - - { + + + {t('app:pdf_enhance_parse')} + + + { onChange({ ...value, - customPdfParse: e.target.checked + customPdfParse: val }); }} - > - {t('app:pdf_enhance_parse')} - - - {feConfigs?.show_pay && ( + size={'sm'} + h={'32px'} + /> + {value.customPdfParse && feConfigs?.show_pay && ( {t('app:pdf_enhance_parse_price', { - price: feConfigs.customPdfParsePrice || 0 + price: pdfParsers.find((p) => p.value === value.customPdfParse)?.price || 0 })} )} - + )} diff --git a/projects/app/src/pageComponents/dataset/detail/Import/Context.tsx b/projects/app/src/pageComponents/dataset/detail/Import/Context.tsx index 79c7cd012b23..a461ffe9825a 100644 --- a/projects/app/src/pageComponents/dataset/detail/Import/Context.tsx +++ b/projects/app/src/pageComponents/dataset/detail/Import/Context.tsx @@ -22,7 +22,7 @@ import { chunkAutoChunkSize, getAutoIndexSize } from '@fastgpt/global/core/datas import { type CollectionChunkFormType } from '../Form/CollectionChunkForm'; export type ImportFormType = { - customPdfParse: boolean; + customPdfParse: string; webSelector: string; } & CollectionChunkFormType; @@ -38,7 +38,7 @@ type DatasetImportContextType = { }; export const defaultFormData: ImportFormType = { - customPdfParse: false, + customPdfParse: '', trainingType: DatasetCollectionDataProcessModeEnum.chunk, diff --git a/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/DataProcess.tsx b/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/DataProcess.tsx index 149795e6557c..91fabe014b8e 100644 --- a/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/DataProcess.tsx +++ b/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/DataProcess.tsx @@ -20,14 +20,31 @@ import FormLabel from '@fastgpt/web/components/common/MyBox/FormLabel'; import QuestionTip from '@fastgpt/web/components/common/MyTooltip/QuestionTip'; import { shadowLight } from '@fastgpt/web/styles/theme'; import CollectionChunkForm from '../../Form/CollectionChunkForm'; +import { usePdfParsers } from '@/web/common/system/hooks/usePdfParsers'; +import MySelect from '@fastgpt/web/components/common/MySelect'; function DataProcess() { const { t } = useTranslation(); const { feConfigs } = useSystemStore(); const { goToNext, processParamsForm } = useContextSelector(DatasetImportContext, (v) => v); - const { register, watch } = processParamsForm; + const { register, watch, setValue } = processParamsForm; const customPdfParseValue = watch('customPdfParse'); + const { data: pdfParsers = [] } = usePdfParsers(); + + // 构建选择器列表 + const pdfParserOptions = [ + { + label: t('dataset:system_default_parser'), + value: '', + description: t('dataset:system_default_parser_desc') + }, + ...pdfParsers.map((parser) => ({ + label: parser.label, + value: parser.value, + description: parser.desc + })) + ]; const Title = useCallback(({ title }: { title: string }) => { return ( @@ -62,12 +79,19 @@ function DataProcess() { p={4} > {feConfigs.showCustomPdfParse && ( - - + + {t('dataset:pdf_enhance_parse')} - - - {feConfigs?.show_pay && ( + + + setValue('customPdfParse', val)} + size={'sm'} + h={'32px'} + /> + {customPdfParseValue && feConfigs?.show_pay && ( {t('dataset:pdf_enhance_parse_price', { - price: feConfigs.customPdfParsePrice || 0 + price: + pdfParsers.find((p) => p.value === customPdfParseValue)?.price || 0 })} )} - + )} diff --git a/projects/app/src/pageComponents/dataset/detail/Import/diffSource/ReTraining.tsx b/projects/app/src/pageComponents/dataset/detail/Import/diffSource/ReTraining.tsx index a4276628642a..3fe9c6a9e5eb 100644 --- a/projects/app/src/pageComponents/dataset/detail/Import/diffSource/ReTraining.tsx +++ b/projects/app/src/pageComponents/dataset/detail/Import/diffSource/ReTraining.tsx @@ -46,7 +46,7 @@ const ReTraining = () => { ]); processParamsForm.reset({ - customPdfParse: collection.customPdfParse || false, + customPdfParse: collection.customPdfParse || '', trainingType: collection.trainingType, chunkTriggerType: collection.chunkTriggerType || defaultFormData.chunkTriggerType, diff --git a/projects/app/src/pageComponents/dataset/detail/MetaDataCard.tsx b/projects/app/src/pageComponents/dataset/detail/MetaDataCard.tsx index 9aa951eb2c7b..299d8a5e28cb 100644 --- a/projects/app/src/pageComponents/dataset/detail/MetaDataCard.tsx +++ b/projects/app/src/pageComponents/dataset/detail/MetaDataCard.tsx @@ -74,11 +74,11 @@ const MetaDataCard = ({ datasetId }: { datasetId: string }) => { label: t('common:core.dataset.collection.metadata.Updatetime'), value: formatTime2YMDHM(collection.updateTime) }, - ...(collection.customPdfParse !== undefined + ...(collection.customPdfParse ? [ { label: t('dataset:collection_metadata_custom_pdf_parse'), - value: collection.customPdfParse ? 'Yes' : 'No' + value: collection.customPdfParse } ] : []), diff --git a/projects/app/src/pages/api/admin/migratePdfParseConfig.ts b/projects/app/src/pages/api/admin/migratePdfParseConfig.ts new file mode 100644 index 000000000000..7ff088dd83a1 --- /dev/null +++ b/projects/app/src/pages/api/admin/migratePdfParseConfig.ts @@ -0,0 +1,165 @@ +import { NextAPI } from '@/service/middleware/entry'; +import { authCert } from '@fastgpt/service/support/permission/auth/common'; +import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema'; +import { MongoApp } from '@fastgpt/service/core/app/schema'; +import { type NextApiRequest, type NextApiResponse } from 'next'; + +/** + * 数据迁移API:将customPdfParse从boolean类型迁移到string类型 + * + * 使用方法: + * POST /api/admin/migratePdfParseConfig + * Body: { defaultParser?: string } + */ + +type MigrationResult = { + datasetCollections: { + trueToParser: number; + falseToEmpty: number; + }; + apps: { + updated: number; + details: Array<{ + appId: string; + oldValue: any; + newValue: string; + }>; + }; +}; + +async function migratePdfParseConfig(defaultParser: string = 'custom'): Promise { + const result: MigrationResult = { + datasetCollections: { + trueToParser: 0, + falseToEmpty: 0 + }, + apps: { + updated: 0, + details: [] + } + }; + + try { + console.log('开始迁移PDF解析配置...'); + + // 1. 迁移数据集集合中的customPdfParse字段 + console.log('迁移数据集集合...'); + + // 使用原生MongoDB查询来绕过Mongoose类型检查 + const db = MongoDatasetCollection.db; + const collectionName = MongoDatasetCollection.collection.collectionName; + + // 查找所有customPdfParse为boolean true的记录 + const booleanTrueRecords = await db + .collection(collectionName) + .find({ + customPdfParse: { $eq: true } + }) + .toArray(); + console.log(`找到 ${booleanTrueRecords.length} 条 customPdfParse: true 的记录`); + + // 将true转换为指定的解析器名称 + if (booleanTrueRecords.length > 0) { + const updateResult = await db + .collection(collectionName) + .updateMany({ customPdfParse: { $eq: true } }, { $set: { customPdfParse: defaultParser } }); + result.datasetCollections.trueToParser = updateResult.modifiedCount; + console.log(`更新了 ${updateResult.modifiedCount} 条记录: true -> '${defaultParser}'`); + } + + // 查找所有customPdfParse为boolean false的记录 + const booleanFalseRecords = await db + .collection(collectionName) + .find({ + customPdfParse: { $eq: false } + }) + .toArray(); + + console.log(`找到 ${booleanFalseRecords.length} 条 customPdfParse: false 的记录`); + + // 将false转换为空字符串 + if (booleanFalseRecords.length > 0) { + const updateResult = await db + .collection(collectionName) + .updateMany({ customPdfParse: { $eq: false } }, { $set: { customPdfParse: '' } }); + result.datasetCollections.falseToEmpty = updateResult.modifiedCount; + console.log(`更新了 ${updateResult.modifiedCount} 条记录: false -> ''`); + } + + // 2. 迁移应用配置中的fileSelectConfig.customPdfParse字段 + console.log('迁移应用配置...'); + + // 使用原生MongoDB查询来查找应用配置 + const appDb = MongoApp.db; + const appCollectionName = MongoApp.collection.collectionName; + + // 查找所有包含boolean类型customPdfParse的应用 + const appsWithBooleanPdfParse = await appDb + .collection(appCollectionName) + .find({ 'chatConfig.fileSelectConfig.canSelectFile': { $eq: true } }) + .toArray(); + + console.log(`找到 ${appsWithBooleanPdfParse.length} 个支持上传文件的应用`); + + for (const app of appsWithBooleanPdfParse) { + // 从数据库获取的原始值 + const rawValue = app.chatConfig?.fileSelectConfig?.customPdfParse; + let newValue = ''; + + // 处理boolean类型的值 + if (rawValue === true) { + newValue = defaultParser; + } else { + newValue = ''; + } + + await appDb + .collection(appCollectionName) + .updateOne( + { _id: app._id }, + { $set: { 'chatConfig.fileSelectConfig.customPdfParse': newValue } } + ); + + result.apps.updated++; + result.apps.details.push({ + appId: String(app._id), + oldValue: rawValue, + newValue: newValue + }); + + console.log(`更新应用 ${app._id}: ${rawValue} -> '${newValue}'`); + } + + console.log('迁移完成!'); + return result; + } catch (error) { + console.error('迁移失败:', error); + throw error; + } +} + +async function handler(req: NextApiRequest, res: NextApiResponse) { + try { + // 验证管理员权限 + await authCert({ req, authRoot: true }); + const parserArray = global.systemEnv.customPdfParse; + if (parserArray) { + const { defaultParser = parserArray[0].name } = req.body; + const result = await migratePdfParseConfig(defaultParser); + return { + success: true, + message: 'PDF解析配置迁移完成', + data: result + }; + } + } catch (error) { + console.error('API处理失败:', error); + return { + success: false, + message: error instanceof Error ? error.message : '迁移失败', + error: error + }; + } +} + +export default NextAPI(handler); diff --git a/projects/app/src/pages/api/core/dataset/collection/trainingDetail.ts b/projects/app/src/pages/api/core/dataset/collection/trainingDetail.ts index d656507c13cc..682912e632d6 100644 --- a/projects/app/src/pages/api/core/dataset/collection/trainingDetail.ts +++ b/projects/app/src/pages/api/core/dataset/collection/trainingDetail.ts @@ -17,7 +17,7 @@ type getTrainingDetailParams = { export type getTrainingDetailResponse = { trainingType: DatasetCollectionDataProcessModeEnum; advancedTraining: { - customPdfParse: boolean; + customPdfParse: string; imageIndex: boolean; autoIndexes: boolean; }; @@ -158,7 +158,7 @@ async function handler( return { trainingType: collection.trainingType, advancedTraining: { - customPdfParse: !!collection.customPdfParse, + customPdfParse: collection.customPdfParse || '', imageIndex: !!collection.imageIndex, autoIndexes: !!collection.autoIndexes }, diff --git a/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts b/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts index b6a8c1dfbc31..70968c65062f 100644 --- a/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts +++ b/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts @@ -21,7 +21,7 @@ export type PostPreviewFilesChunksProps = ChunkSettingsType & { type: DatasetSourceReadTypeEnum; sourceId: string; - customPdfParse?: boolean; + customPdfParse?: string; // Chunk settings overlapRatio: number; @@ -44,7 +44,7 @@ async function handler( let { type, sourceId, - customPdfParse = false, + customPdfParse, overlapRatio, selector, diff --git a/projects/app/src/pages/api/system/getPdfParsers.ts b/projects/app/src/pages/api/system/getPdfParsers.ts new file mode 100644 index 000000000000..794b5493c7cf --- /dev/null +++ b/projects/app/src/pages/api/system/getPdfParsers.ts @@ -0,0 +1,45 @@ +import type { NextApiRequest, NextApiResponse } from 'next'; +import { jsonRes } from '@fastgpt/service/common/response'; +import { authCert } from '@fastgpt/service/support/permission/auth/common'; + +export default async function handler(req: NextApiRequest, res: NextApiResponse) { + try { + await authCert({ req, authToken: true }); + + const parsers = global.systemEnv?.customPdfParse || []; + + const parserOptions = parsers.map((parser) => { + // 解析支持的文件格式 + const supportedFormats = parser.extension + ? parser.extension.split(',').map((ext) => ext.trim().toLowerCase()) + : ['pdf']; + + const formatList = supportedFormats.map((format) => { + return format.startsWith('.') ? format : `.${format}`; + }); + + const formatDescription = `支持格式: ${formatList.join(', ')}`; + + const fullDescription = parser.desc + ? `${parser.desc} (${formatDescription})` + : formatDescription; + + return { + value: parser.name, + label: parser.name, + desc: fullDescription, + price: parser.price || 0, + supportedFormats: supportedFormats + }; + }); + + jsonRes(res, { + data: parserOptions + }); + } catch (err) { + jsonRes(res, { + code: 500, + error: err + }); + } +} diff --git a/projects/app/src/web/common/system/hooks/usePdfParsers.ts b/projects/app/src/web/common/system/hooks/usePdfParsers.ts new file mode 100644 index 000000000000..1463a3c1aeb3 --- /dev/null +++ b/projects/app/src/web/common/system/hooks/usePdfParsers.ts @@ -0,0 +1,18 @@ +import { useQuery } from '@tanstack/react-query'; +import { GET } from '@/web/common/api/request'; + +export type PdfParserOption = { + value: string; + label: string; + desc: string; + price: number; + supportedFormats: string[]; +}; + +export const usePdfParsers = () => { + return useQuery({ + queryKey: ['getPdfParsers'], + queryFn: () => GET('/system/getPdfParsers'), + staleTime: 5 * 60 * 1000 // 5 minutes + }); +};