Skip to content

Commit c55bb3f

Browse files
Pr@main@pdf (#23)
* feat: 分段API支持word,pdf * fix: 通用型知识库支持上传 PDF/DOC 格式的文档#19 --------- Co-authored-by: wangdan-fit2cloud <[email protected]>
1 parent da4b5be commit c55bb3f

File tree

7 files changed

+186
-14
lines changed

7 files changed

+186
-14
lines changed
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# coding=utf-8
2+
"""
3+
@project: maxkb
4+
@Author:虎
5+
@file: base_split_handle.py
6+
@date:2024/3/27 18:13
7+
@desc:
8+
"""
9+
from abc import ABC, abstractmethod
10+
from typing import List
11+
12+
13+
class BaseSplitHandle(ABC):
    """Abstract interface for handlers that split an uploaded file.

    Concrete handlers implement ``support`` to claim a file and ``handle``
    to produce the split result for it.
    """

    @abstractmethod
    def support(self, file, get_buffer):
        """Report whether this handler can process ``file``.

        :param file: the uploaded file object
        :param get_buffer: callable taking ``file`` and returning its content
        :return: truthy when the handler accepts the file
        """
        ...

    @abstractmethod
    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer):
        """Split ``file`` into paragraphs.

        :param file: the uploaded file object
        :param pattern_list: split patterns supplied by the caller
        :param with_filter: forwarded to the split model by implementations
        :param limit: forwarded to the split model by implementations
        :param get_buffer: callable taking ``file`` and returning its content
        :return: the split result produced by the implementation
        """
        ...
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# coding=utf-8
2+
"""
3+
@project: maxkb
4+
@Author:虎
5+
@file: doc_split_handle.py
6+
@date:2024/3/27 18:19
7+
@desc:
8+
"""
9+
import io
10+
import re
11+
from typing import List
12+
13+
from docx import Document
14+
15+
from common.handle.base_split_handle import BaseSplitHandle
16+
from common.util.split_model import SplitModel
17+
18+
# Split patterns used when the caller supplies none: markdown headings from
# level 1 down to level 6, followed by runs of blank lines.
default_pattern_list = [
    re.compile(source)
    for source in (
        '(?<=^)# .*|(?<=\\n)# .*',
        '(?<!#)## (?!#).*',
        "(?<!#)### (?!#).*",
        "(?<!#)#### (?!#).*",
        "(?<!#)##### (?!#).*",
        "(?<!#)###### (?!#).*",
        "(?<!\n)\n\n+",
    )
]
22+
23+
24+
class DocSplitHandle(BaseSplitHandle):
    """Split handler for files whose name ends in ``.docx`` or ``.doc``."""

    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer):
        """Extract paragraph text from a Word document and split it.

        :param file: uploaded file object; ``file.name`` is echoed in the result
        :param pattern_list: caller-supplied split patterns; when empty or
            ``None`` the module's ``default_pattern_list`` is used
        :param with_filter: forwarded to ``SplitModel``
        :param limit: forwarded to ``SplitModel``
        :param get_buffer: callable taking ``file`` and returning its bytes
        :return: ``{'name': file.name, 'content': ...}`` where ``content`` is
            the result of ``SplitModel.parse`` or ``[]`` when reading or
            parsing the document fails
        """
        try:
            buffer = get_buffer(file)
            doc = Document(io.BytesIO(buffer))
            content = "\n".join(para.text for para in doc.paragraphs)
            if pattern_list:
                split_model = SplitModel(pattern_list, with_filter, limit)
            else:
                split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
        except Exception:
            # Best effort: an unreadable document yields an empty result.
            # Only ``Exception`` is caught so that KeyboardInterrupt and
            # SystemExit are no longer swallowed here.
            return {'name': file.name,
                    'content': []}
        return {'name': file.name,
                'content': split_model.parse(content)
                }

    def support(self, file, get_buffer):
        """Return True when the lower-cased file name ends in .docx or .doc.

        NOTE(review): legacy binary ``.doc`` files are accepted here but are
        presumably not parseable by ``Document``; if so, ``handle`` returns
        empty content for them -- confirm.
        """
        file_name: str = file.name.lower()
        return file_name.endswith((".docx", ".doc"))
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# coding=utf-8
2+
"""
3+
@project: maxkb
4+
@Author:虎
5+
@file: pdf_split_handle.py
6+
@date:2024/3/27 18:19
7+
@desc:
8+
"""
9+
import re
10+
from typing import List
11+
12+
import fitz
13+
14+
from common.handle.base_split_handle import BaseSplitHandle
15+
from common.util.split_model import SplitModel
16+
17+
# Fallback split patterns for PDF text: markdown-style headings (levels 1-6)
# and then runs of blank lines.
default_pattern_list = [
    re.compile(source)
    for source in (
        '(?<=^)# .*|(?<=\\n)# .*',
        '(?<!#)## (?!#).*',
        "(?<!#)### (?!#).*",
        "(?<!#)#### (?!#).*",
        "(?<!#)##### (?!#).*",
        "(?<!#)###### (?!#).*",
        "(?<!\n)\n\n+",
    )
]
21+
22+
23+
def number_to_text(pdf_document, page_number):
    """Return the text of page ``page_number`` of ``pdf_document``.

    The page is fetched with ``load_page`` and its text with ``get_text``.
    """
    return pdf_document.load_page(page_number).get_text()
27+
28+
29+
class PdfSplitHandle(BaseSplitHandle):
    """Split handler for files whose name ends in ``.pdf``."""

    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer):
        """Extract the text of every page of a PDF and split it.

        :param file: uploaded file object; ``file.name`` is passed to
            ``fitz.open`` and echoed in the result
        :param pattern_list: caller-supplied split patterns; when empty or
            ``None`` the module's ``default_pattern_list`` is used
        :param with_filter: forwarded to ``SplitModel``
        :param limit: forwarded to ``SplitModel``
        :param get_buffer: callable taking ``file`` and returning its bytes
        :return: ``{'name': file.name, 'content': ...}`` where ``content`` is
            the result of ``SplitModel.parse`` or ``[]`` when reading or
            parsing the document fails
        """
        try:
            buffer = get_buffer(file)
            # Open in a ``with`` block so the document is closed once the
            # page text has been collected (the original never closed it).
            with fitz.open(file.name, buffer) as pdf_document:
                content = "\n".join(
                    number_to_text(pdf_document, page_number)
                    for page_number in range(len(pdf_document))
                )
            if pattern_list:
                split_model = SplitModel(pattern_list, with_filter, limit)
            else:
                split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
        except Exception:
            # Best effort: an unreadable PDF yields an empty result.
            # Only ``Exception`` is caught so that KeyboardInterrupt and
            # SystemExit are no longer swallowed here.
            return {'name': file.name,
                    'content': []}
        return {'name': file.name,
                'content': split_model.parse(content)
                }

    def support(self, file, get_buffer):
        """Return True when the lower-cased file name ends in ``.pdf``."""
        file_name: str = file.name.lower()
        return file_name.endswith(".pdf")
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# coding=utf-8
2+
"""
3+
@project: maxkb
4+
@Author:虎
5+
@file: text_split_handle.py
6+
@date:2024/3/27 18:19
7+
@desc:
8+
"""
9+
import re
10+
from typing import List
11+
12+
import chardet
13+
14+
from common.handle.base_split_handle import BaseSplitHandle
15+
from common.util.split_model import SplitModel
16+
17+
# Patterns applied to plain text when no custom list is given: one per
# markdown heading level (1 through 6), then blank-line runs.
default_pattern_list = [
    re.compile(source)
    for source in (
        '(?<=^)# .*|(?<=\\n)# .*',
        '(?<!#)## (?!#).*',
        "(?<!#)### (?!#).*",
        "(?<!#)#### (?!#).*",
        "(?<!#)##### (?!#).*",
        "(?<!#)###### (?!#).*",
        "(?<!\n)\n\n+",
    )
]
21+
22+
23+
class TextSplitHandle(BaseSplitHandle):
    """Split handler for text content (``.md`` / ``.txt`` or detected text)."""

    def support(self, file, get_buffer):
        """Decide whether ``file`` should be treated as text.

        Files named ``*.md`` or ``*.txt`` are accepted outright. Any other
        file is accepted when ``chardet`` detects an encoding other than
        ``'ascii'`` with a confidence above 0.5.

        :param file: uploaded file object
        :param get_buffer: callable taking ``file`` and returning its bytes
        :return: True when this handler accepts the file, else False
        """
        file_name: str = file.name.lower()
        if file_name.endswith((".md", ".txt")):
            return True
        # Read the content only when the cheap filename check did not decide.
        result = chardet.detect(get_buffer(file))
        return result['encoding'] != 'ascii' and result['confidence'] > 0.5

    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer):
        """Decode the file with its detected encoding and split the text.

        :param file: uploaded file object; ``file.name`` is echoed in the result
        :param pattern_list: caller-supplied split patterns; when empty or
            ``None`` the module's ``default_pattern_list`` is used
        :param with_filter: forwarded to ``SplitModel``
        :param limit: forwarded to ``SplitModel``
        :param get_buffer: callable taking ``file`` and returning its bytes
        :return: ``{'name': file.name, 'content': ...}`` where ``content`` is
            the result of ``SplitModel.parse`` or ``[]`` when the bytes
            cannot be decoded
        """
        buffer = get_buffer(file)
        if pattern_list:
            split_model = SplitModel(pattern_list, with_filter, limit)
        else:
            split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
        try:
            content = buffer.decode(chardet.detect(buffer)['encoding'])
        except Exception:
            # Best effort: undecodable content yields an empty result.
            # Only ``Exception`` is caught so that KeyboardInterrupt and
            # SystemExit are no longer swallowed here.
            return {'name': file.name,
                    'content': []}
        return {'name': file.name,
                'content': split_model.parse(content)
                }

apps/dataset/serializers/document_serializers.py

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@
2222
from common.event.common import work_thread_pool
2323
from common.event.listener_manage import ListenerManagement, SyncWebDocumentArgs
2424
from common.exception.app_exception import AppApiException
25+
from common.handle.impl.doc_split_handle import DocSplitHandle
26+
from common.handle.impl.pdf_split_handle import PdfSplitHandle
27+
from common.handle.impl.text_split_handle import TextSplitHandle
2528
from common.mixins.api_mixin import ApiMixin
2629
from common.util.common import post
2730
from common.util.field_message import ErrMessage
@@ -593,17 +596,22 @@ def batch_delete(self, instance: Dict, with_valid=True):
593596
return True
594597

595598

599+
class FileBufferHandle:
    """Read a file's content once and reuse it on later requests."""

    # Cached content; stays None until get_buffer has read a file.
    buffer = None

    def get_buffer(self, file):
        """Return the cached content, reading ``file`` on the first call.

        Once content is cached, ``file`` is not read again and the argument
        is ignored.
        """
        if self.buffer is not None:
            return self.buffer
        self.buffer = file.read()
        return self.buffer
606+
607+
608+
# Text handler instance; it is also the last entry of the handler list.
default_split_handle = TextSplitHandle()
# Registered split handlers: Word, PDF, then plain text.
split_handles = [
    DocSplitHandle(),
    PdfSplitHandle(),
    default_split_handle,
]
610+
611+
596612
def file_to_paragraph(file, pattern_list: List, with_filter: bool, limit: int):
    """Split an uploaded file with the first handler that supports it.

    Handlers in ``split_handles`` are asked in order via ``support``; when
    none accepts the file, ``default_split_handle`` is used. All handlers
    share one ``FileBufferHandle`` so the file is read at most once.

    :param file: the uploaded file object
    :param pattern_list: split patterns forwarded to the handler
    :param with_filter: forwarded to the handler
    :param limit: forwarded to the handler
    :return: whatever the chosen handler's ``handle`` returns
    """
    get_buffer = FileBufferHandle().get_buffer
    chosen_handle = next(
        (candidate for candidate in split_handles if candidate.support(file, get_buffer)),
        default_split_handle,
    )
    return chosen_handle.handle(file, pattern_list, with_filter, limit, get_buffer)

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ langchain-openai = "^0.0.8"
3131
django-ipware = "^6.0.4"
3232
django-apscheduler = "^0.6.2"
3333
chardet2 = "^2.0.3"
34+
pymupdf = "^1.24.0"
35+
python-docx = "^1.1.0"
3436

3537
[build-system]
3638
requires = ["poetry-core"]

ui/src/views/dataset/component/UploadComponent.vue

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
action="#"
1717
:auto-upload="false"
1818
:show-file-list="false"
19-
accept=".txt, .md, .csv, .log"
19+
accept=".txt, .md, .csv, .log, .doc, .docx, .pdf"
2020
:limit="50"
2121
:on-exceed="onExceed"
2222
>

0 commit comments

Comments
 (0)