Skip to content

Commit f870604

Browse files
authored
feature: data management supports nested folders (#150)
* fix: k8s部署场景下,backend-python服务挂载需要存储 * fix: 增加数据集文件免拷贝的接口定义 * fix: 评估时评估结果赋予初始空值,防止未评估完成时接口报错 * feature: 数据管理支持嵌套文件夹(展示时按照文件系统展示;批量下载时带上相对路径) * fix: 去除多余的文件重命名逻辑 * refactor: remove unused imports
1 parent fea7133 commit f870604

File tree

7 files changed

+288
-56
lines changed

7 files changed

+288
-56
lines changed

backend/openapi/specs/data-management.yaml

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -330,6 +330,35 @@ paths:
330330
type: string
331331
format: binary
332332

333+
# POST endpoint (operationId addFilesToDataset) that registers source file paths on a dataset.
# Request body: AddFilesRequest (JSON). Success response: JSON array of DatasetFileResponse.
/data-management/datasets/{datasetId}/files/upload/add:
334+
post:
335+
tags: [ DatasetFile ]
336+
operationId: addFilesToDataset
337+
# summary (English): add files to a dataset (creates database records only)
summary: 添加文件到数据集(仅创建数据库记录)
338+
# description (English): adds the given list of source file paths to the dataset; only database
# records are created, no physical file-system operation is performed.
description: 将指定源文件路径列表添加到数据集,仅在数据库中创建记录,不执行物理文件系统操作。
339+
parameters:
340+
# Path parameter: the dataset ID.
- name: datasetId
341+
in: path
342+
required: true
343+
schema:
344+
type: string
345+
description: 数据集ID
346+
requestBody:
347+
required: true
348+
content:
349+
application/json:
350+
schema:
351+
$ref: '#/components/schemas/AddFilesRequest'
352+
responses:
353+
'200':
354+
# description (English): added successfully; returns the list of created file records
description: 添加成功,返回创建的文件记录列表
355+
content:
356+
application/json:
357+
schema:
358+
type: array
359+
items:
360+
$ref: '#/components/schemas/DatasetFileResponse'
361+
333362
/data-management/datasets/{datasetId}/files/upload/pre-upload:
334363
post:
335364
tags: [ DatasetFile ]
@@ -805,3 +834,19 @@ components:
805834
path:
806835
type: string
807836
description: 请求路径
837+
838+
# Request payload for adding source file paths to a dataset; only sourcePaths is required.
AddFilesRequest:
839+
type: object
840+
# description (English): request to add source file paths to a dataset
description: 将源文件路径添加到数据集的请求
841+
properties:
842+
# List of source paths (relative or absolute); each element names a file or directory to add.
sourcePaths:
843+
type: array
844+
items:
845+
type: string
846+
description: 源文件路径列表(相对或绝对路径),每个元素表示一个要添加的文件或目录路径
847+
# When true, only database records are created. Defaults to false.
# NOTE(review): the description of the operation that references this schema says it never
# performs file-system operations, which seems at odds with a false default here - confirm intent.
softAdd:
848+
type: boolean
849+
description: 如果为 true,则仅在数据库中创建记录(默认 false)
850+
default: false
851+
required:
852+
- sourcePaths

backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java

Lines changed: 113 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package com.datamate.datamanagement.application;
22

33
import com.baomidou.mybatisplus.core.metadata.IPage;
4+
import com.baomidou.mybatisplus.extension.plugins.pagination.Page;
45
import com.datamate.common.domain.model.ChunkUploadPreRequest;
56
import com.datamate.common.domain.model.FileUploadResult;
67
import com.datamate.common.domain.service.FileService;
@@ -29,6 +30,9 @@
2930
import com.fasterxml.jackson.databind.ObjectMapper;
3031
import jakarta.servlet.http.HttpServletResponse;
3132
import lombok.extern.slf4j.Slf4j;
33+
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
34+
import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream;
35+
import org.apache.commons.io.IOUtils;
3236
import org.springframework.beans.factory.annotation.Autowired;
3337
import org.springframework.beans.factory.annotation.Value;
3438
import org.springframework.core.io.Resource;
@@ -37,22 +41,22 @@
3741
import org.springframework.stereotype.Service;
3842
import org.springframework.transaction.annotation.Transactional;
3943

40-
import java.io.BufferedInputStream;
4144
import java.io.File;
4245
import java.io.IOException;
4346
import java.io.InputStream;
4447
import java.net.MalformedURLException;
4548
import java.nio.file.Files;
4649
import java.nio.file.Path;
4750
import java.nio.file.Paths;
51+
import java.nio.file.attribute.BasicFileAttributes;
4852
import java.time.LocalDateTime;
53+
import java.time.ZoneId;
4954
import java.time.format.DateTimeFormatter;
5055
import java.util.*;
5156
import java.util.concurrent.CompletableFuture;
5257
import java.util.function.Function;
5358
import java.util.stream.Collectors;
54-
import java.util.zip.ZipEntry;
55-
import java.util.zip.ZipOutputStream;
59+
import java.util.stream.Stream;
5660

5761
/**
5862
* 数据集文件应用服务
@@ -85,11 +89,77 @@ public DatasetFileApplicationService(DatasetFileRepository datasetFileRepository
8589
*/
8690
@Transactional(readOnly = true)
8791
public PagedResponse<DatasetFile> getDatasetFiles(String datasetId, String fileType, String status, String name, PagingQuery pagingQuery) {
88-
IPage<DatasetFile> page = new com.baomidou.mybatisplus.extension.plugins.pagination.Page<>(pagingQuery.getPage(), pagingQuery.getSize());
92+
IPage<DatasetFile> page = new Page<>(pagingQuery.getPage(), pagingQuery.getSize());
8993
IPage<DatasetFile> files = datasetFileRepository.findByCriteria(datasetId, fileType, status, name, page);
9094
return PagedResponse.of(files);
9195
}
9296

97+
/**
98+
* 获取数据集文件列表
99+
*/
100+
@Transactional(readOnly = true)
101+
public PagedResponse<DatasetFile> getDatasetFilesWithDirectory(String datasetId, String prefix, PagingQuery pagingQuery) {
102+
Dataset dataset = datasetRepository.getById(datasetId);
103+
int page = Math.max(pagingQuery.getPage(), 1);
104+
int size = pagingQuery.getSize() == null || pagingQuery.getSize() < 0 ? 20 : pagingQuery.getSize();
105+
if (dataset == null) {
106+
return PagedResponse.of(new Page<>(page, size));
107+
}
108+
String datasetPath = dataset.getPath();
109+
Path queryPath = Path.of(dataset.getPath() + File.separator + prefix);
110+
Map<String, DatasetFile> datasetFilesMap = datasetFileRepository.findAllByDatasetId(datasetId)
111+
.stream().collect(Collectors.toMap(DatasetFile::getFilePath, Function.identity()));
112+
try (Stream<Path> pathStream = Files.list(queryPath)) {
113+
List<Path> allFiles = pathStream
114+
.filter(path -> path.toString().startsWith(datasetPath))
115+
.sorted(Comparator
116+
.comparing((Path path) -> !Files.isDirectory(path))
117+
.thenComparing(path -> path.getFileName().toString()))
118+
.collect(Collectors.toList());
119+
120+
// 计算分页
121+
int total = allFiles.size();
122+
int totalPages = (int) Math.ceil((double) total / size);
123+
124+
// 获取当前页数据
125+
int fromIndex = (page - 1) * size;
126+
fromIndex = Math.max(fromIndex, 0);
127+
int toIndex = Math.min(fromIndex + size, total);
128+
129+
List<Path> pageData = new ArrayList<>();
130+
if (fromIndex < total) {
131+
pageData = allFiles.subList(fromIndex, toIndex);
132+
}
133+
List<DatasetFile> datasetFiles = pageData.stream().map(path -> getDatasetFile(path, datasetFilesMap)).toList();
134+
135+
return new PagedResponse<>(page, size, total, totalPages, datasetFiles);
136+
} catch (IOException e) {
137+
log.error("list dataset path error", e);
138+
return PagedResponse.of(new Page<>(page, size));
139+
}
140+
}
141+
142+
private DatasetFile getDatasetFile(Path path, Map<String, DatasetFile> datasetFilesMap) {
143+
DatasetFile datasetFile = new DatasetFile();
144+
LocalDateTime localDateTime = LocalDateTime.now();
145+
try {
146+
localDateTime = Files.getLastModifiedTime(path).toInstant().atZone(ZoneId.systemDefault()).toLocalDateTime();
147+
} catch (IOException e) {
148+
log.error("get last modified time error", e);
149+
}
150+
datasetFile.setFileName(path.getFileName().toString());
151+
datasetFile.setUploadTime(localDateTime);
152+
if (Files.isDirectory(path)) {
153+
datasetFile.setId("directory-" + datasetFile.getFileName());
154+
} else if (Objects.isNull(datasetFilesMap.get(path.toString()))) {
155+
datasetFile.setId("file-" + datasetFile.getFileName());
156+
datasetFile.setFileSize(path.toFile().length());
157+
} else {
158+
datasetFile = datasetFilesMap.get(path.toString());
159+
}
160+
return datasetFile;
161+
}
162+
93163
/**
94164
* 获取文件详情
95165
*/
@@ -151,58 +221,61 @@ public Resource downloadFile(String datasetId, String fileId) {
151221
*/
152222
@Transactional(readOnly = true)
153223
public void downloadDatasetFileAsZip(String datasetId, HttpServletResponse response) {
224+
Dataset dataset = datasetRepository.getById(datasetId);
225+
if (Objects.isNull(dataset)) {
226+
throw BusinessException.of(DataManagementErrorCode.DATASET_NOT_FOUND);
227+
}
154228
List<DatasetFile> allByDatasetId = datasetFileRepository.findAllByDatasetId(datasetId);
155-
fileRename(allByDatasetId);
229+
Set<String> filePaths = allByDatasetId.stream().map(DatasetFile::getFilePath).collect(Collectors.toSet());
230+
String datasetPath = dataset.getPath();
231+
Path downloadPath = Path.of(datasetPath);
156232
response.setContentType("application/zip");
157233
String zipName = String.format("dataset_%s.zip",
158234
LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMddHHmmss")));
159235
response.setHeader(HttpHeaders.CONTENT_DISPOSITION, "attachment; filename=" + zipName);
160-
try (ZipOutputStream zos = new ZipOutputStream(response.getOutputStream())) {
161-
for (DatasetFile file : allByDatasetId) {
162-
addToZipFile(file, zos);
236+
try (ZipArchiveOutputStream zos = new ZipArchiveOutputStream(response.getOutputStream())) {
237+
try (Stream<Path> pathStream = Files.walk(downloadPath)) {
238+
List<Path> allPaths = pathStream.filter(path -> path.toString().startsWith(datasetPath))
239+
.filter(path -> filePaths.stream().anyMatch(filePath -> filePath.startsWith(path.toString())))
240+
.toList();
241+
for (Path path : allPaths) {
242+
addToZipFile(path, downloadPath, zos);
243+
}
163244
}
164245
} catch (IOException e) {
165246
log.error("Failed to download files in batches.", e);
166247
throw BusinessException.of(SystemErrorCode.FILE_SYSTEM_ERROR);
167248
}
168249
}
169250

170-
private void fileRename(List<DatasetFile> files) {
171-
Set<String> uniqueFilenames = new HashSet<>();
172-
for (DatasetFile file : files) {
173-
String originalFilename = file.getFileName();
174-
if (!uniqueFilenames.add(originalFilename)) {
175-
String newFilename;
176-
int counter = 1;
177-
do {
178-
newFilename = generateNewFilename(originalFilename, counter);
179-
counter++;
180-
} while (!uniqueFilenames.add(newFilename));
181-
file.setFileName(newFilename);
251+
private void addToZipFile(Path path, Path basePath, ZipArchiveOutputStream zos) throws IOException {
252+
String entryName = basePath.relativize(path)
253+
.toString()
254+
.replace(File.separator, "/");
255+
256+
// 处理目录
257+
if (Files.isDirectory(path)) {
258+
if (!entryName.isEmpty()) {
259+
entryName += "/";
260+
ZipArchiveEntry dirEntry = new ZipArchiveEntry(entryName);
261+
zos.putArchiveEntry(dirEntry);
262+
zos.closeArchiveEntry();
182263
}
183-
}
184-
}
264+
} else {
265+
// 处理文件
266+
ZipArchiveEntry fileEntry = new ZipArchiveEntry(path.toFile(), entryName);
185267

186-
private String generateNewFilename(String oldFilename, int counter) {
187-
int dotIndex = oldFilename.lastIndexOf(".");
188-
return oldFilename.substring(0, dotIndex) + "-(" + counter + ")" + oldFilename.substring(dotIndex);
189-
}
268+
// 设置更多属性
269+
BasicFileAttributes attrs = Files.readAttributes(path, BasicFileAttributes.class);
270+
fileEntry.setSize(attrs.size());
271+
fileEntry.setLastModifiedTime(attrs.lastModifiedTime());
190272

191-
private void addToZipFile(DatasetFile file, ZipOutputStream zos) throws IOException {
192-
if (file.getFilePath() == null || !Files.exists(Paths.get(file.getFilePath()))) {
193-
log.warn("The file hasn't been found on filesystem, id: {}", file.getId());
194-
return;
195-
}
196-
try (InputStream fis = Files.newInputStream(Paths.get(file.getFilePath()));
197-
BufferedInputStream bis = new BufferedInputStream(fis)) {
198-
ZipEntry zipEntry = new ZipEntry(file.getFileName());
199-
zos.putNextEntry(zipEntry);
200-
byte[] buffer = new byte[8192];
201-
int length;
202-
while ((length = bis.read(buffer)) >= 0) {
203-
zos.write(buffer, 0, length);
273+
zos.putArchiveEntry(fileEntry);
274+
275+
try (InputStream is = Files.newInputStream(path)) {
276+
IOUtils.copy(is, zos);
204277
}
205-
zos.closeEntry();
278+
zos.closeArchiveEntry();
206279
}
207280
}
208281

backend/services/data-management-service/src/main/java/com/datamate/datamanagement/interfaces/rest/DatasetFileController.java

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -46,12 +46,10 @@ public Response<PagedResponse<DatasetFile>> getDatasetFiles(
4646
@PathVariable("datasetId") String datasetId,
4747
@RequestParam(value = "page", required = false, defaultValue = "0") Integer page,
4848
@RequestParam(value = "size", required = false, defaultValue = "20") Integer size,
49-
@RequestParam(value = "fileType", required = false) String fileType,
50-
@RequestParam(value = "status", required = false) String status,
51-
@RequestParam(value = "name", required = false) String name) {
49+
@RequestParam(value = "prefix", required = false) String prefix) {
5250
PagingQuery pagingQuery = new PagingQuery(page, size);
53-
PagedResponse<DatasetFile> filesPage = datasetFileApplicationService.getDatasetFiles(
54-
datasetId, fileType, status, name, pagingQuery);
51+
PagedResponse<DatasetFile> filesPage = datasetFileApplicationService.getDatasetFilesWithDirectory(
52+
datasetId, prefix, pagingQuery);
5553
return Response.ok(filesPage);
5654
}
5755

backend/shared/domain-common/src/main/java/com/datamate/common/domain/utils/ArchiveAnalyzer.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
1313
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
1414
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
15+
import org.apache.commons.io.FileUtils;
1516

1617
import java.io.BufferedInputStream;
1718
import java.io.BufferedOutputStream;
@@ -145,7 +146,7 @@ private static Optional<FileUploadResult> extractEntity(ArchiveInputStream<?> ar
145146
Path path = Paths.get(archivePath.getParent().toString(), archiveEntry.getName());
146147
File file = path.toFile();
147148
long fileSize = 0L;
148-
String extension = AnalyzerUtils.getExtension(archiveEntry.getName());
149+
FileUtils.createParentDirectories(file);
149150

150151
long supportFileSize = 1024*1024*1024; // 上传大小暂定为1个G
151152
try (OutputStream outputStream = new BufferedOutputStream(Files.newOutputStream(file.toPath()))) {

0 commit comments

Comments
 (0)