Skip to content

Commit 29edf1e

Browse files
c121914yu and colnii authored
Perf: llm parse paragraph (labring#5420)
* feat: llm directory optimization (labring#5400) * perf: llm parse * doc --------- Co-authored-by: colnii <1286949794@qq.com>
1 parent 1fc1e3f commit 29edf1e

File tree

6 files changed

+62
-25
lines changed

6 files changed

+62
-25
lines changed

packages/global/common/string/tools.ts

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -83,19 +83,37 @@ export const getRegQueryStr = (text: string, flags = 'i') => {
8383

8484
/* slice json str */
8585
/**
 * Extract the first JSON object or array literal embedded in `str`
 * (e.g. a model reply that wraps JSON in prose or a code fence).
 *
 * The input is first normalised exactly as before: it is trimmed, literal
 * `\n` sequences and every remaining backslash are removed, and every space
 * character is removed. This is lossy: spaces inside string values are
 * dropped as well.
 *
 * @param str Raw text that may contain a JSON value.
 * @returns The span from the first `{` / `[` to its balancing `}` / `]`.
 *          If the value is unbalanced (e.g. truncated output), the span up to
 *          the last matching closing character is returned. If there is no
 *          opening bracket, or no closing character after it, the normalised
 *          string is returned unchanged.
 */
export const sliceJsonStr = (str: string) => {
  str = str
    .trim()
    .replace(/(\\n|\\)/g, '')
    .replace(/ /g, '');

  // Locate the first opening bracket of either kind.
  const start = str.search(/[{\[]/);
  if (start === -1) return str;

  const openChar = str.charAt(start);
  const closeChar = openChar === '{' ? '}' : ']';

  // Walk forward and stop at the closer that balances the first opener, so that
  // trailing text containing another `}` / `]` is not swallowed into the result.
  // Brackets inside double-quoted strings are ignored. Backslashes were stripped
  // above, so every remaining `"` is treated as a string delimiter.
  let depth = 0;
  let inString = false;
  for (let i = start; i < str.length; i++) {
    const ch = str.charAt(i);

    if (ch === '"') {
      inString = !inString;
      continue;
    }
    if (inString) continue;

    if (ch === openChar) {
      depth++;
    } else if (ch === closeChar) {
      depth--;
      if (depth === 0) return str.slice(start, i + 1);
    }
  }

  // No balanced span found: keep the best-effort behaviour of cutting at the
  // last closing character that appears after the opener.
  const end = str.lastIndexOf(closeChar);
  return end > start ? str.slice(start, end + 1) : str;
};
100118

101119
export const sliceStrStartEnd = (str: string, start: number, end: number) => {

packages/web/i18n/en/dataset.json

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -122,11 +122,13 @@
122122
"insert_images_success": "The new picture is successfully added, and you need to wait for the training to be completed before it will be displayed.",
123123
"is_open_schedule": "Enable scheduled synchronization",
124124
"keep_image": "Keep the picture",
125-
"llm_paragraph_mode": "LLM recognition paragraph(Beta)",
125+
"llm_paragraph_mode": "LLM recognition paragraph",
126126
"llm_paragraph_mode_auto": "automatic",
127-
"llm_paragraph_mode_auto_desc": "Enable the model to automatically recognize the title when the file content does not contain a Markdown title.",
127+
"llm_paragraph_mode_auto_desc": "Enable model recognition when the text content does not contain a Markdown title.",
128128
"llm_paragraph_mode_forbid": "Disabled",
129129
"llm_paragraph_mode_forbid_desc": "Force the disabling of the model's automatic paragraph recognition",
130+
"llm_paragraph_mode_force": "Force Process",
131+
"llm_paragraph_mode_force_desc": "Force the use of the model to automatically identify paragraphs and ignore paragraphs in the original text (if any)",
130132
"loading": "Loading...",
131133
"max_chunk_size": "Maximum chunk size",
132134
"move.hint": "After moving, the selected knowledge base/folder will inherit the permission settings of the new folder, and the original permission settings will become invalid.",

packages/web/i18n/zh-CN/dataset.json

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -122,11 +122,13 @@
122122
"insert_images_success": "新增图片成功,需等待训练完成才会展示",
123123
"is_open_schedule": "启用定时同步",
124124
"keep_image": "保留图片",
125-
"llm_paragraph_mode": "模型识别段落(Beta)",
125+
"llm_paragraph_mode": "模型识别段落",
126126
"llm_paragraph_mode_auto": "自动",
127-
"llm_paragraph_mode_auto_desc": "当文件内容不包含 Markdown 标题时,启用模型自动识别标题",
127+
"llm_paragraph_mode_auto_desc": "当文本内容不含 Markdown 标题时,启用模型识别",
128128
"llm_paragraph_mode_forbid": "禁用",
129129
"llm_paragraph_mode_forbid_desc": "强制禁用模型自动识别段落",
130+
"llm_paragraph_mode_force": "强制处理",
131+
"llm_paragraph_mode_force_desc": "强制使用模型自动识别段落,并忽略原文本的段落(如有)",
130132
"loading": "加载中...",
131133
"max_chunk_size": "最大分块大小",
132134
"move.hint": "移动后,所选知识库/文件夹将继承新文件夹的权限设置,原先的权限设置失效。",

packages/web/i18n/zh-Hant/dataset.json

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -122,11 +122,13 @@
122122
"insert_images_success": "新增圖片成功,需等待訓練完成才會展示",
123123
"is_open_schedule": "啟用定時同步",
124124
"keep_image": "保留圖片",
125-
"llm_paragraph_mode": "模型識別段落(Beta)",
125+
"llm_paragraph_mode": "模型識別段落",
126126
"llm_paragraph_mode_auto": "自動",
127-
"llm_paragraph_mode_auto_desc": "當文件內容不包含 Markdown 標題時,啟用模型自動識別標題",
127+
"llm_paragraph_mode_auto_desc": "當文本內容不含 Markdown 標題時,啟用模型識別",
128128
"llm_paragraph_mode_forbid": "禁用",
129129
"llm_paragraph_mode_forbid_desc": "強制禁用模型自動識別段落",
130+
"llm_paragraph_mode_force": "強制處理",
131+
"llm_paragraph_mode_force_desc": "強制使用模型自動識別段落,並忽略原文本的段落(如有)",
130132
"loading": "加載中...",
131133
"max_chunk_size": "最大分塊大小",
132134
"move.hint": "移動後,所選資料集/資料夾將繼承新資料夾的權限設定,原先的權限設定將失效。",

projects/app/src/pageComponents/dataset/detail/Form/CollectionChunkForm.tsx

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -387,15 +387,20 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
387387
setValue('paragraphChunkAIMode', e);
388388
}}
389389
list={[
390+
{
391+
label: t('dataset:llm_paragraph_mode_auto'),
392+
value: ParagraphChunkAIModeEnum.auto,
393+
description: t('dataset:llm_paragraph_mode_auto_desc')
394+
},
390395
{
391396
label: t('dataset:llm_paragraph_mode_forbid'),
392397
value: ParagraphChunkAIModeEnum.forbid,
393398
description: t('dataset:llm_paragraph_mode_forbid_desc')
394399
},
395400
{
396-
label: t('dataset:llm_paragraph_mode_auto'),
397-
value: ParagraphChunkAIModeEnum.auto,
398-
description: t('dataset:llm_paragraph_mode_auto_desc')
401+
label: t('dataset:llm_paragraph_mode_force'),
402+
value: ParagraphChunkAIModeEnum.force,
403+
description: t('dataset:llm_paragraph_mode_force_desc')
399404
}
400405
]}
401406
/>

projects/app/src/service/core/dataset/queues/datasetParse.ts

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -55,9 +55,13 @@ const requestLLMPargraph = async ({
5555
};
5656
}
5757

58-
// Check is markdown text(Include 1 group of title)
5958
if (paragraphChunkAIMode === ParagraphChunkAIModeEnum.auto) {
60-
const isMarkdown = /^(#+)\s/.test(rawText);
59+
// Check if the text contains Markdown header structure
60+
const hasMarkdownHeaders = /^(#+)\s/m.test(rawText);
61+
// The `m` flag is required so that `^` matches at the start of every line; with `g` alone
// only a header at the very start of the text could match and the count never exceeded 1.
const hasMultipleHeaders = (rawText.match(/^(#+)\s/gm) || []).length > 1;
62+
63+
const isMarkdown = hasMarkdownHeaders && hasMultipleHeaders;
64+
6165
if (isMarkdown) {
6266
return {
6367
resultText: rawText,
@@ -71,11 +75,15 @@ const requestLLMPargraph = async ({
7175
resultText: string;
7276
totalInputTokens: number;
7377
totalOutputTokens: number;
74-
}>('/core/dataset/training/llmPargraph', {
75-
rawText,
76-
model,
77-
billId
78-
});
78+
}>(
79+
'/core/dataset/training/llmPargraph',
80+
{
81+
rawText,
82+
model,
83+
billId
84+
},
85+
{ timeout: 600000 }
86+
);
7987

8088
return data;
8189
};

0 commit comments

Comments
 (0)