Skip to content

Commit 6ca265d

Browse files
committed
feat: add chunk_size parameter to document splitting and chunk handling
1 parent 1a1722c commit 6ca265d

File tree

5 files changed

+15
-16
lines changed

5 files changed

+15
-16
lines changed

apps/application/flow/step_node/document_split_node/i_document_split_node.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ class DocumentSplitNodeSerializer(serializers.Serializer):
3535
required=False, label=_("document name relate problem reference"), child=serializers.CharField(), default=[]
3636
)
3737
limit = serializers.IntegerField(required=False, label=_("limit"), default=4096)
38+
chunk_size = serializers.IntegerField(required=False, label=_("chunk size"), default=256)
3839
patterns = serializers.ListField(
3940
required=False, label=_("patterns"), child=serializers.CharField(), default=[]
4041
)
@@ -53,12 +54,10 @@ def get_node_params_serializer_class(self) -> Type[serializers.Serializer]:
5354
return DocumentSplitNodeSerializer
5455

5556
def _run(self):
56-
# res = self.workflow_manage.get_reference_field(self.node_params_serializer.data.get('file_list')[0],
57-
# self.node_params_serializer.data.get('file_list')[1:])
5857
return self.execute(**self.node_params_serializer.data, **self.flow_params_serializer.data)
5958

6059
def execute(self, document_list, knowledge_id, split_strategy, paragraph_title_relate_problem_type,
6160
paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
6261
document_name_relate_problem_type, document_name_relate_problem,
63-
document_name_relate_problem_reference, limit, patterns, with_filter, **kwargs) -> NodeResult:
62+
document_name_relate_problem_reference, limit, chunk_size, patterns, with_filter, **kwargs) -> NodeResult:
6463
pass

apps/application/flow/step_node/document_split_node/impl/base_document_split_node.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
from application.flow.i_step_node import NodeResult
99
from application.flow.step_node.document_split_node.i_document_split_node import IDocumentSplitNode
10+
from common.chunk import text_to_chunk
1011
from knowledge.serializers.document import default_split_handle, FileBufferHandle
1112

1213

@@ -43,7 +44,7 @@ def get_reference_content(self, fields: List[str]):
4344
def execute(self, document_list, knowledge_id, split_strategy, paragraph_title_relate_problem_type,
4445
paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
4546
document_name_relate_problem_type, document_name_relate_problem,
46-
document_name_relate_problem_reference, limit, patterns, with_filter, **kwargs) -> NodeResult:
47+
document_name_relate_problem_reference, limit, chunk_size, patterns, with_filter, **kwargs) -> NodeResult:
4748
self.context['knowledge_id'] = knowledge_id
4849
file_list = self.workflow_manage.get_reference_field(document_list[0], document_list[1:])
4950
paragraph_list = []
@@ -62,7 +63,7 @@ def execute(self, document_list, knowledge_id, split_strategy, paragraph_title_r
6263
split_strategy, paragraph_title_relate_problem_type,
6364
paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
6465
document_name_relate_problem_type, document_name_relate_problem,
65-
document_name_relate_problem_reference
66+
document_name_relate_problem_reference, chunk_size
6667
)
6768

6869
paragraph_list += results
@@ -79,7 +80,7 @@ def _process_split_result(
7980
split_strategy, paragraph_title_relate_problem_type,
8081
paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
8182
document_name_relate_problem_type, document_name_relate_problem,
82-
document_name_relate_problem_reference
83+
document_name_relate_problem_reference, chunk_size
8384
):
8485
"""处理文档分割结果"""
8586
item['meta'] = {
@@ -99,6 +100,7 @@ def _process_split_result(
99100
document_name_relate_problem_reference
100101
)
101102
paragraph['is_active'] = True
103+
paragraph['chunks'] = text_to_chunk(paragraph['content'], chunk_size)
102104

103105
def _generate_problem_list(
104106
self, paragraph, document_name, split_strategy, paragraph_title_relate_problem_type,

apps/common/chunk/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@
1111
handles = [MarkChunkHandle()]
1212

1313

14-
def text_to_chunk(text: str, chunk_size: int = 256):
    """Split *text* into a list of chunks by piping it through every registered handle.

    The text starts as a single one-element chunk list; each handle in the
    module-level ``handles`` pipeline receives the current list and returns a
    transformed one. ``chunk_size`` is forwarded to every handle as the target
    maximum chunk length (default 256 characters).
    """
    chunks = [text]
    for chunk_handle in handles:
        chunks = chunk_handle.handle(chunks, chunk_size)
    return chunks

apps/common/chunk/i_chunk_handle.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,5 +12,5 @@
1212

1313
class IChunkHandle(ABC):
    """Interface for a chunk-processing step in the text-to-chunk pipeline.

    Implementations take the current list of text chunks and return a
    transformed list (e.g. split into pieces of at most ``chunk_size``
    characters).
    """

    @abstractmethod
    def handle(self, chunk_list: List[str], chunk_size: int = 256):
        """Transform ``chunk_list``; ``chunk_size`` is the target maximum chunk length."""
        pass

apps/common/chunk/impl/mark_chunk_handle.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,11 @@
1111

1212
from common.chunk.i_chunk_handle import IChunkHandle
1313

14-
max_chunk_len = 256
15-
split_chunk_pattern = r'.{1,%d}[。| |\\.|!|;|;|!|\n]' % max_chunk_len
16-
max_chunk_pattern = r'.{1,%d}' % max_chunk_len
17-
18-
1914
class MarkChunkHandle(IChunkHandle):
20-
def handle(self, chunk_list: List[str]):
15+
def handle(self, chunk_list: List[str], chunk_size: int = 256):
16+
split_chunk_pattern = r'.{1,%d}[。| |\\.|!|;|;|!|\n]' % chunk_size
17+
max_chunk_pattern = r'.{1,%d}' % chunk_size
18+
2119
result = []
2220
for chunk in chunk_list:
2321
chunk_result = re.findall(split_chunk_pattern, chunk, flags=re.DOTALL)
@@ -28,7 +26,7 @@ def handle(self, chunk_list: List[str]):
2826
other_chunk_list = re.split(split_chunk_pattern, chunk, flags=re.DOTALL)
2927
for other_chunk in other_chunk_list:
3028
if len(other_chunk) > 0:
31-
if len(other_chunk) < max_chunk_len:
29+
if len(other_chunk) < chunk_size:
3230
if len(other_chunk.strip()) > 0:
3331
result.append(other_chunk.strip())
3432
else:

0 commit comments

Comments
 (0)