Skip to content

Commit 2f3ae21

Browse files
authored
feat: enhance dataset file fetching with improved pagination and document loading support (#156)
1 parent e9fd6a3 commit 2f3ae21

File tree

7 files changed

+158
-112
lines changed

7 files changed

+158
-112
lines changed

backend/services/data-management-service/src/main/java/com/datamate/datamanagement/interfaces/rest/DatasetFileController.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ public Response<PagedResponse<DatasetFile>> getDatasetFiles(
4646
@PathVariable("datasetId") String datasetId,
4747
@RequestParam(value = "page", required = false, defaultValue = "0") Integer page,
4848
@RequestParam(value = "size", required = false, defaultValue = "20") Integer size,
49-
@RequestParam(value = "prefix", required = false) String prefix) {
49+
@RequestParam(value = "prefix", required = false, defaultValue = "") String prefix) {
5050
PagingQuery pagingQuery = new PagingQuery(page, size);
5151
PagedResponse<DatasetFile> filesPage = datasetFileApplicationService.getDatasetFilesWithDirectory(
5252
datasetId, prefix, pagingQuery);

frontend/src/components/business/DatasetFileTransfer.tsx

Lines changed: 57 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
7878

7979
const fetchDatasets = async () => {
8080
const { data } = await queryDatasetsUsingGet({
81+
// Ant Design Table pagination.current is 1-based; ensure backend also receives 1-based value
8182
page: datasetPagination.current,
8283
size: datasetPagination.pageSize,
8384
keyword: datasetSearch,
@@ -98,29 +99,49 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
9899
300
99100
);
100101

101-
const fetchFiles = useCallback(async () => {
102-
if (!selectedDataset) return;
103-
const { data } = await queryDatasetFilesUsingGet(selectedDataset.id, {
104-
page: filesPagination.current - 1,
105-
size: filesPagination.pageSize,
106-
keyword: filesSearch,
107-
});
108-
setFiles(
109-
(data.content || []).map((item: DatasetFile) => ({
110-
...item,
111-
key: item.id,
112-
datasetName: selectedDataset.name,
113-
}))
114-
);
115-
setFilesPagination((prev) => ({
116-
...prev,
117-
total: data.totalElements,
118-
}));
119-
}, [filesPagination.current, filesPagination.pageSize, filesSearch, selectedDataset]);
102+
const fetchFiles = useCallback(
103+
async (
104+
options?: Partial<{ page: number; pageSize: number; keyword: string }>
105+
) => {
106+
if (!selectedDataset) return;
107+
const page = options?.page ?? filesPagination.current;
108+
const pageSize = options?.pageSize ?? filesPagination.pageSize;
109+
const keyword = options?.keyword ?? filesSearch;
110+
111+
const { data } = await queryDatasetFilesUsingGet(selectedDataset.id, {
112+
page,
113+
size: pageSize,
114+
keyword,
115+
});
116+
setFiles(
117+
(data.content || []).map((item: DatasetFile) => ({
118+
...item,
119+
key: item.id,
120+
datasetName: selectedDataset.name,
121+
}))
122+
);
123+
setFilesPagination((prev) => ({
124+
...prev,
125+
current: page,
126+
pageSize,
127+
total: data.totalElements,
128+
}));
129+
},
130+
[selectedDataset, filesPagination.current, filesPagination.pageSize, filesSearch]
131+
);
120132

121133
useEffect(() => {
122-
fetchFiles().catch(() => {});
123-
}, [fetchFiles]);
134+
// 当数据集变化时,重置文件分页并拉取第一页文件,避免额外的循环请求
135+
if (selectedDataset) {
136+
setFilesPagination({ current: 1, pageSize: 10, total: 0 });
137+
fetchFiles({ page: 1, pageSize: 10 }).catch(() => {});
138+
} else {
139+
setFiles([]);
140+
setFilesPagination({ current: 1, pageSize: 10, total: 0 });
141+
}
142+
// 只在 selectedDataset 变化时触发
143+
// eslint-disable-next-line react-hooks/exhaustive-deps
144+
}, [selectedDataset]);
124145

125146
useEffect(() => {
126147
onDatasetSelect?.(selectedDataset);
@@ -238,7 +259,18 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
238259
size="small"
239260
dataSource={files}
240261
columns={fileCols.slice(1, fileCols.length)}
241-
pagination={filesPagination}
262+
pagination={{
263+
...filesPagination,
264+
onChange: (page, pageSize) => {
265+
const nextPageSize = pageSize || filesPagination.pageSize;
266+
setFilesPagination((prev) => ({
267+
...prev,
268+
current: page,
269+
pageSize: nextPageSize,
270+
}));
271+
fetchFiles({ page, pageSize: nextPageSize }).catch(() => {});
272+
},
273+
}}
242274
onRow={(record: DatasetFile) => ({
243275
onClick: () => toggleSelectFile(record),
244276
})}
@@ -247,15 +279,15 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
247279
selectedRowKeys: Object.keys(selectedFilesMap),
248280

249281
// 单选
250-
onSelect: (record: DatasetFile, selected: boolean) => {
282+
onSelect: (record: DatasetFile) => {
251283
toggleSelectFile(record);
252284
},
253285

254286
// 全选
255287
onSelectAll: (selected, selectedRows: DatasetFile[]) => {
256288
if (selected) {
257289
// ✔ 全选 -> 将 files 列表全部加入 selectedFilesMap
258-
const newMap: Record<string, DatasetFile> = {};
290+
const newMap: Record<string, DatasetFile> = { ...selectedFilesMap };
259291
selectedRows.forEach((f) => {
260292
newMap[f.id] = f;
261293
});
@@ -264,7 +296,7 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
264296
// ✘ 取消全选 -> 清空 map
265297
const newMap = { ...selectedFilesMap };
266298
Object.keys(newMap).forEach((id) => {
267-
if (files.find((f) => f.id === id)) {
299+
if (files.some((f) => String(f.id) === id)) {
268300
// 仅移除当前页对应文件
269301
delete newMap[id];
270302
}
@@ -277,15 +309,6 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
277309
name: record.fileName,
278310
}),
279311
}}
280-
281-
// rowSelection={{
282-
// type: "checkbox",
283-
// selectedRowKeys: Object.keys(selectedFilesMap),
284-
// onSelect: toggleSelectFile,
285-
// getCheckboxProps: (record: DatasetFile) => ({
286-
// name: record.fileName,
287-
// }),
288-
// }}
289312
/>
290313
</div>
291314
</div>
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
"""
2+
公共模块
3+
"""
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
from typing import List, Union, Optional
2+
from pathlib import Path
3+
4+
from langchain_core.documents import Document
5+
from langchain_community.document_loaders import (
6+
TextLoader,
7+
JSONLoader,
8+
CSVLoader,
9+
UnstructuredMarkdownLoader,
10+
PyPDFLoader,
11+
Docx2txtLoader
12+
)
13+
14+
from app.core.logging import get_logger
15+
16+
log = get_logger(__name__)
17+
18+
class UniversalDocLoader:
    """Universal plain-text document loader.

    Supported formats: TXT / JSON / CSV / Markdown / Word (.docx) / PDF.

    NOTE(review): the original docstring also claimed PPT (.pptx) support,
    but no loader is registered for that extension, so the claim was dropped.
    """

    # Extension -> loader-class mapping (lightweight loaders preferred).
    SUPPORTED_FORMATS = {
        # Plain-text family
        ".txt": TextLoader,
        ".json": JSONLoader,
        ".csv": CSVLoader,
        ".md": UnstructuredMarkdownLoader,
        # Office documents
        ".docx": Docx2txtLoader,
        # NOTE(review): docx2txt cannot parse legacy binary .doc files;
        # loading a true .doc will fail at runtime — confirm or drop this entry.
        ".doc": Docx2txtLoader,
        # PDF
        ".pdf": PyPDFLoader,
    }

    def __init__(self, file_path: Union[str, Path]):
        """
        :param file_path: path of the document to load.
        :raises FileNotFoundError: if the file does not exist.
        :raises ValueError: if the file's extension is not supported.
        """
        self.file_path = Path(file_path).resolve()
        self.file_suffix = self.file_path.suffix.lower()
        log.info(f"初始化文档加载器: {self.file_path} (格式: {self.file_suffix})")
        self._validate_file()

    def _validate_file(self) -> None:
        """Validate that the file exists and its format is supported."""
        if not self.file_path.exists():
            raise FileNotFoundError(f"文件不存在: {self.file_path}")
        if self.file_suffix not in self.SUPPORTED_FORMATS:
            raise ValueError(
                f"不支持的格式: {self.file_suffix} | 支持格式: {list(self.SUPPORTED_FORMATS.keys())}"
            )

    def load(
        self,
        file_format: Optional[str] = None,
        **loader_kwargs
    ) -> List[Document]:
        """Load the document and return a list of LangChain ``Document``s.

        :param file_format: explicit format override (e.g. ``".pdf"`` or
            ``"pdf"``); defaults to the file's own extension.
        :param loader_kwargs: forwarded to the concrete loader
            (e.g. ``jq_schema`` for ``JSONLoader``).
        :return: list of loaded Documents.
        :raises ValueError: if ``file_format`` names an unsupported format.
        """
        # Determine the target format; normalize explicit overrides so that
        # both ".pdf" and "pdf" are accepted.
        if file_format:
            target_format = file_format.lower()
            if not target_format.startswith("."):
                target_format = "." + target_format
        else:
            target_format = self.file_suffix

        # Bug fix: an unsupported explicit file_format previously escaped as a
        # bare KeyError; raise the same ValueError shape as _validate_file.
        loader_cls = self.SUPPORTED_FORMATS.get(target_format)
        if loader_cls is None:
            raise ValueError(
                f"不支持的格式: {target_format} | 支持格式: {list(self.SUPPORTED_FORMATS.keys())}"
            )

        # Fill in per-loader default arguments to simplify call sites.
        loader_kwargs = self._set_default_kwargs(loader_cls, loader_kwargs)

        # Instantiate the loader and load.
        loader = loader_cls(str(self.file_path), **loader_kwargs)
        return loader.load()

    @staticmethod
    def _set_default_kwargs(loader_cls, kwargs: dict) -> dict:
        """Apply sensible per-loader defaults so callers can omit boilerplate."""
        if loader_cls == JSONLoader and "jq_schema" not in kwargs:
            # "." selects the whole document; text_content=False tolerates
            # non-string values in the JSON.
            kwargs.setdefault("jq_schema", ".")
            kwargs.setdefault("text_content", False)
        if loader_cls == CSVLoader and "csv_args" not in kwargs:
            kwargs["csv_args"] = {"delimiter": ","}
        return kwargs
83+
84+
85+
# 文档加载器便捷函数
86+
def load_documents(
    file_path: Union[str, Path],
    file_format: Optional[str] = None,
    **loader_kwargs
) -> List[Document]:
    """Convenience wrapper: load a document in a single call.

    :param file_path: path of the document to load.
    :param file_format: optional explicit format override (e.g. ``".pdf"``).
    :param loader_kwargs: forwarded to the underlying loader.
    :return: list of loaded LangChain Documents.
    """
    return UniversalDocLoader(file_path).load(file_format=file_format, **loader_kwargs)

runtime/datamate-python/app/common/text_split.py

Whitespace-only changes.

runtime/datamate-python/app/module/generation/service/generation_service.py

Lines changed: 3 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,8 @@
11
import asyncio
2-
import uuid
32
import json
3+
import uuid
44
from pathlib import Path
55

6-
from langchain_community.document_loaders import (
7-
TextLoader,
8-
CSVLoader,
9-
JSONLoader,
10-
UnstructuredMarkdownLoader,
11-
UnstructuredHTMLLoader,
12-
UnstructuredFileLoader,
13-
PyPDFLoader,
14-
UnstructuredWordDocumentLoader,
15-
UnstructuredPowerPointLoader,
16-
UnstructuredExcelLoader,
17-
)
186
from langchain_text_splitters import RecursiveCharacterTextSplitter
197
from sqlalchemy import select
208
from sqlalchemy.ext.asyncio import AsyncSession
@@ -30,6 +18,7 @@
3018
from app.db.session import logger
3119
from app.module.shared.util.model_chat import _extract_json_substring
3220
from app.module.system.service.common_service import get_chat_client, chat
21+
from app.common.document_loaders import load_documents
3322

3423

3524
class GenerationService:
@@ -250,8 +239,7 @@ def _load_and_split(self, file_path: str, chunk_size: int, chunk_overlap: int):
250239
251240
保留每个 Document 的 metadata,方便后续追加例如文件ID、chunk序号等信息。
252241
"""
253-
loader = self._build_loader(file_path)
254-
docs = loader.load()
242+
docs = load_documents(file_path)
255243

256244
splitter = RecursiveCharacterTextSplitter(
257245
chunk_size=chunk_size,
@@ -262,67 +250,6 @@ def _load_and_split(self, file_path: str, chunk_size: int, chunk_overlap: int):
262250
split_docs = splitter.split_documents(docs)
263251
return split_docs
264252

265-
@staticmethod
266-
def _build_loader(file_path: str):
267-
"""根据文件扩展名选择合适的 LangChain 文本加载器,尽量覆盖常见泛文本格式。
268-
269-
优先按格式选择专门的 Loader,找不到匹配时退回到 TextLoader。
270-
"""
271-
path = Path(file_path)
272-
suffix = path.suffix.lower()
273-
path_str = str(path)
274-
275-
# 1. 纯文本类
276-
if suffix in {".txt", "", ".log"}: # "" 兼容无扩展名
277-
return TextLoader(path_str, encoding="utf-8")
278-
279-
# 2. Markdown
280-
if suffix in {".md", ".markdown"}:
281-
# UnstructuredMarkdownLoader 会保留更多结构信息
282-
return UnstructuredMarkdownLoader(path_str)
283-
284-
# 3. HTML / HTM
285-
if suffix in {".html", ".htm"}:
286-
return UnstructuredHTMLLoader(path_str)
287-
288-
# 4. JSON
289-
if suffix == ".json":
290-
# 使用 JSONLoader 将 JSON 中的内容展开成文档
291-
# 这里使用默认 jq_schema,后续需要更精细地提取可以在此调整
292-
return JSONLoader(file_path=path_str, jq_schema=".")
293-
294-
# 5. CSV / TSV
295-
if suffix in {".csv", ".tsv"}:
296-
# CSVLoader 默认将每一行作为一条 Document
297-
return CSVLoader(file_path=path_str)
298-
299-
# 6. YAML
300-
if suffix in {".yaml", ".yml"}:
301-
# 暂时按纯文本加载
302-
return TextLoader(path_str, encoding="utf-8")
303-
304-
# 7. PDF
305-
if suffix == ".pdf":
306-
return PyPDFLoader(path_str)
307-
308-
# 8. Word 文档
309-
if suffix in {".docx", ".doc"}:
310-
# UnstructuredWordDocumentLoader 支持 .docx/.doc 文本抽取
311-
return UnstructuredWordDocumentLoader(path_str)
312-
313-
# 9. PowerPoint
314-
if suffix in {".ppt", ".pptx"}:
315-
return UnstructuredPowerPointLoader(path_str)
316-
317-
# 10. Excel
318-
if suffix in {".xls", ".xlsx"}:
319-
return UnstructuredExcelLoader(path_str)
320-
321-
# 11. 兜底:使用 UnstructuredFileLoader 或 TextLoader 作为纯文本
322-
try:
323-
return UnstructuredFileLoader(path_str)
324-
except Exception:
325-
return TextLoader(path_str, encoding="utf-8")
326253

327254
@staticmethod
328255
def _build_qa_prompt(chunk: str, synthesis_cfg: dict) -> str:

scripts/images/backend-python/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ RUN --mount=type=cache,target=$POETRY_CACHE_DIR \
3232
poetry install --no-root --only main
3333

3434
# Download NLTK data
35-
RUN python -c "import nltk; nltk.download('punkt_tab', download_dir='/usr/local/nltk_data')"
35+
RUN python -c "import nltk; nltk.download(['punkt_tab','averaged_perceptron_tagger_eng'], download_dir='/usr/local/nltk_data')"
3636
ENV NLTK_DATA=/usr/local/nltk_data
3737

3838
# Copy the rest of the application

0 commit comments

Comments (0)