Skip to content

Commit 4aaf0fd

Browse files
committed
feat(DocumentSplitter): add enhanced document splitting functionality with CJK support and metadata detection
1 parent 937ae72 commit 4aaf0fd

File tree

4 files changed

+169
-0
lines changed

4 files changed

+169
-0
lines changed

runtime/datamate-python/app/common/text_split.py

Whitespace-only changes.
File renamed without changes.

runtime/datamate-python/app/common/document_loaders.py renamed to runtime/datamate-python/app/module/shared/common/document_loaders.py

File renamed without changes.
Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
import os
2+
from typing import List, Optional, Tuple
3+
4+
from langchain_core.documents import Document
5+
from langchain_text_splitters import (
6+
RecursiveCharacterTextSplitter,
7+
MarkdownHeaderTextSplitter
8+
)
9+
10+
11+
class DocumentSplitter:
    """Split LangChain ``Document`` objects into size-bounded chunks.

    Features:
        1. Markdown documents are recognised from the file extension found in
           ``metadata["source"]``.
        2. Markdown is split on headers first and then by characters; any
           other text is split by characters only.
        3. An optional separator list tuned for CJK (Chinese / Japanese /
           Korean) text, which has no whitespace word boundaries.

    Note:
        Type detection is metadata-only. A document without a usable
        ``source`` entry is treated as plain text; there is no content-based
        fallback.
    """

    # File extensions (lower-case, with leading dot) treated as Markdown.
    _MARKDOWN_EXTENSIONS = frozenset(
        {".md", ".markdown", ".mdx", ".mkd", ".mkdown"}
    )

    # Header rules used when the caller does not supply ``markdown_headers``.
    _DEFAULT_MARKDOWN_HEADERS = (
        ("#", "header_1"),
        ("##", "header_2"),
        ("###", "header_3"),
        ("####", "header_4"),
    )

    # Separators for CJK text, highest priority first. Full-width marks are
    # written as escapes so they cannot be silently folded into their ASCII
    # look-alikes by editors or text normalisation.
    _CJK_SEPARATORS = (
        "\n\n", "\n",      # paragraph / line break
        "\u3002", ".",     # full stop: ideographic / ASCII
        "\uff01", "!",     # exclamation mark: full-width / ASCII
        "\uff1f", "?",     # question mark: full-width / ASCII
        "\uff1b", ";",     # semicolon: full-width / ASCII
        "\uff0c", ",",     # comma: full-width / ASCII
        "\u3001",          # ideographic (enumeration) comma
        "\uff1a", ":",     # colon: full-width / ASCII
        " ",               # space
        "\u200b", "",      # zero-width space, then per-character fallback
    )

    # Separators for languages with whitespace word boundaries.
    _DEFAULT_SEPARATORS = ("\n\n", "\n", " ", ".", "!", "?", ";", ":", ",", "")

    def __init__(
        self,
        chunk_size: int = 2000,
        chunk_overlap: int = 200,
        is_cjk_language: bool = True,
        markdown_headers: Optional[List[Tuple[str, str]]] = None
    ):
        """Initialise the splitter.

        Args:
            chunk_size: Maximum length of each chunk, measured with ``len``
                (default 2000).
            chunk_overlap: Overlap between consecutive chunks (default 200).
            is_cjk_language: Use the CJK-oriented separator list
                (default True).
            markdown_headers: ``(marker, metadata_key)`` pairs used for
                Markdown header splitting; defaults to ``#`` .. ``####``
                mapped to ``header_1`` .. ``header_4``.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.is_cjk_language = is_cjk_language
        self.markdown_headers = markdown_headers or list(
            self._DEFAULT_MARKDOWN_HEADERS
        )

        # Character-level splitter shared by the plain and Markdown paths.
        self.text_splitter = self._create_text_splitter()

    def _separators(self) -> List[str]:
        """Return the separator list matching ``is_cjk_language``."""
        if self.is_cjk_language:
            return list(self._CJK_SEPARATORS)
        return list(self._DEFAULT_SEPARATORS)

    def _create_text_splitter(self) -> RecursiveCharacterTextSplitter:
        """Build the recursive character splitter from the instance settings."""
        return RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            separators=self._separators(),
            length_function=len,
            is_separator_regex=False
        )

    @staticmethod
    def _is_markdown(doc: Document) -> bool:
        """Tell whether ``doc`` is Markdown, judging only by its metadata.

        The extension of ``metadata["source"]`` is compared
        case-insensitively with the known Markdown extensions. A missing,
        empty, ``None`` or non-path ``source`` yields ``False`` instead of
        raising.
        """
        source = (doc.metadata or {}).get("source")
        if isinstance(source, os.PathLike):
            # Loaders may store a pathlib.Path rather than a string.
            source = os.fspath(source)
        if not isinstance(source, str) or not source:
            return False

        ext = os.path.splitext(source)[-1].lower()
        return ext in DocumentSplitter._MARKDOWN_EXTENSIONS

    def split(self, documents: List[Document], is_markdown: bool = False) -> List[Document]:
        """Split ``documents`` into chunks.

        Args:
            documents: Documents to split.
            is_markdown: Treat every document as Markdown, i.e. split on
                headers first and then by characters (default False).

        Returns:
            The resulting chunks, in input order; ``[]`` for empty input.
        """
        if not documents:
            return []

        # Plain text goes straight to the character splitter.
        if not is_markdown:
            return self.text_splitter.split_documents(documents)

        md_splitter = MarkdownHeaderTextSplitter(
            headers_to_split_on=self.markdown_headers,
            strip_headers=True,
            return_each_line=False
        )

        # Header split first; each section inherits the parent's metadata
        # (parent values win when a key collides with a header key).
        md_chunks: List[Document] = []
        for doc in documents:
            sections = md_splitter.split_text(doc.page_content)
            for section in sections:
                section.metadata.update(doc.metadata)
            md_chunks.extend(sections)

        # Sections may still exceed chunk_size, so split them by characters.
        return self.text_splitter.split_documents(md_chunks)

    @classmethod
    def auto_split(
        cls,
        documents: List[Document],
        chunk_size: int = 2000,
        chunk_overlap: int = 200
    ) -> List[Document]:
        """Detect each document's type from its metadata and split it.

        Convenience entry point that needs no explicit instance; it always
        uses the CJK-oriented separators.

        Args:
            documents: Documents to split.
            chunk_size: Maximum length of each chunk (default 2000).
            chunk_overlap: Overlap between consecutive chunks (default 200).

        Returns:
            The resulting chunks, in input order; ``[]`` for empty input.
        """
        if not documents:
            return []

        splitter = cls(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            is_cjk_language=True
        )

        # Classify every document on its own: a batch may mix Markdown and
        # plain-text sources, so the first document must not decide for all.
        chunks: List[Document] = []
        for doc in documents:
            chunks.extend(
                splitter.split([doc], is_markdown=cls._is_markdown(doc))
            )
        return chunks

0 commit comments

Comments
 (0)