Skip to content

Commit 15498f2

Browse files
authored
feat: add file copying functionality to dataset directory and update base path configuration #80
1 parent 960323f commit 15498f2

File tree

8 files changed

+122
-45
lines changed

8 files changed

+122
-45
lines changed

README-zh.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
![GitHub Stars](https://img.shields.io/github/stars/ModelEngine-Group/DataMate)
88
![GitHub Forks](https://img.shields.io/github/forks/ModelEngine-Group/DataMate)
99
![GitHub Issues](https://img.shields.io/github/issues/ModelEngine-Group/DataMate)
10-
![GitHub License](https://img.shields.io/github/license/ModelEngine-Group/DataMate)
10+
![GitHub License](https://img.shields.io/github/license/ModelEngine-Group/datamate-docs)
1111

1212
**DataMate是面向模型微调与RAG检索的企业级数据处理平台,支持数据归集、数据管理、算子市场、数据清洗、数据合成、数据标注、数据评估、知识生成等核心功能。**
1313

backend/services/data-collection-service/src/main/java/com/datamate/collection/interfaces/rest/CollectionTaskController.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
import com.datamate.collection.interfaces.dto.*;
99
import com.datamate.common.interfaces.PagedResponse;
1010
import com.datamate.datamanagement.application.DatasetApplicationService;
11-
import com.datamate.datamanagement.domain.model.dataset.Dataset;
1211
import com.datamate.datamanagement.interfaces.converter.DatasetConverter;
1312
import com.datamate.datamanagement.interfaces.dto.DatasetResponse;
1413
import jakarta.validation.Valid;

backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetApplicationService.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ public class DatasetApplicationService {
4949
private final FileMetadataService fileMetadataService;
5050
private final ObjectMapper objectMapper;
5151

52-
@Value("${dataset.base.path:/dataset}")
52+
@Value("${datamate.data-management.base-path:/dataset}")
5353
private String datasetBasePath;
5454

5555
/**

backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import com.datamate.common.domain.model.FileUploadResult;
55
import com.datamate.common.domain.service.FileService;
66
import com.datamate.common.domain.utils.AnalyzerUtils;
7+
import com.datamate.common.infrastructure.exception.BusinessAssert;
78
import com.datamate.common.infrastructure.exception.BusinessException;
89
import com.datamate.common.infrastructure.exception.SystemErrorCode;
910
import com.datamate.datamanagement.domain.contants.DatasetConstant;
@@ -13,12 +14,14 @@
1314
import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetFileRepository;
1415
import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetRepository;
1516
import com.datamate.datamanagement.interfaces.converter.DatasetConverter;
17+
import com.datamate.datamanagement.interfaces.dto.CopyFilesRequest;
1618
import com.datamate.datamanagement.interfaces.dto.UploadFileRequest;
1719
import com.datamate.datamanagement.interfaces.dto.UploadFilesPreRequest;
1820
import com.fasterxml.jackson.core.JsonProcessingException;
1921
import com.fasterxml.jackson.databind.ObjectMapper;
2022
import jakarta.servlet.http.HttpServletResponse;
2123
import lombok.extern.slf4j.Slf4j;
24+
import org.apache.commons.io.FileUtils;
2225
import org.apache.ibatis.session.RowBounds;
2326
import org.springframework.beans.factory.annotation.Autowired;
2427
import org.springframework.beans.factory.annotation.Value;
@@ -57,7 +60,7 @@ public class DatasetFileApplicationService {
5760
private final DatasetRepository datasetRepository;
5861
private final FileService fileService;
5962

60-
@Value("${dataset.base.path:/dataset}")
63+
@Value("${datamate.data-management.base-path:/dataset}")
6164
private String datasetBasePath;
6265

6366
@Autowired
@@ -257,4 +260,51 @@ private void saveFileInfoToDb(FileUploadResult fileUploadResult, UploadFileReque
257260
dataset.active();
258261
datasetRepository.updateById(dataset);
259262
}
263+
264+
/**
265+
* 复制文件到数据集目录
266+
*
267+
* @param datasetId 数据集id
268+
* @param req 复制文件请求
269+
* @return 复制的文件列表
270+
*/
271+
@Transactional
272+
public List<DatasetFile> copyFilesToDatasetDir(String datasetId, CopyFilesRequest req) {
273+
Dataset dataset = datasetRepository.getById(datasetId);
274+
BusinessAssert.notNull(dataset, SystemErrorCode.RESOURCE_NOT_FOUND);
275+
List<DatasetFile> copiedFiles = new ArrayList<>();
276+
for (String sourceFilePath : req.sourcePaths()) {
277+
Path sourcePath = Paths.get(sourceFilePath);
278+
if (!Files.exists(sourcePath) || !Files.isRegularFile(sourcePath)) {
279+
log.warn("Source file does not exist or is not a regular file: {}", sourceFilePath);
280+
continue;
281+
}
282+
String fileName = sourcePath.getFileName().toString();
283+
File targetFile = new File(dataset.getPath(), fileName);
284+
try {
285+
FileUtils.copyInputStreamToFile(Files.newInputStream(sourcePath), targetFile);
286+
} catch (IOException e) {
287+
log.error("Failed to copy file: {}", sourceFilePath, e);
288+
continue;
289+
}
290+
291+
LocalDateTime currentTime = LocalDateTime.now();
292+
DatasetFile datasetFile = DatasetFile.builder()
293+
.id(UUID.randomUUID().toString())
294+
.datasetId(datasetId)
295+
.fileName(fileName)
296+
.fileType(AnalyzerUtils.getExtension(fileName))
297+
.fileSize(targetFile.length())
298+
.filePath(targetFile.getPath())
299+
.uploadTime(currentTime)
300+
.lastAccessTime(currentTime)
301+
.build();
302+
datasetFileRepository.save(datasetFile);
303+
dataset.addFile(datasetFile);
304+
copiedFiles.add(datasetFile);
305+
}
306+
dataset.active();
307+
datasetRepository.updateById(dataset);
308+
return copiedFiles;
309+
}
260310
}

backend/services/data-management-service/src/main/java/com/datamate/datamanagement/interfaces/converter/DatasetConverter.java

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,13 @@
11
package com.datamate.datamanagement.interfaces.converter;
22

3-
import com.datamate.common.infrastructure.exception.BusinessException;
4-
import com.datamate.common.infrastructure.exception.SystemErrorCode;
3+
import com.datamate.common.domain.model.ChunkUploadRequest;
4+
import com.datamate.datamanagement.domain.model.dataset.Dataset;
5+
import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
56
import com.datamate.datamanagement.domain.model.dataset.FileTag;
67
import com.datamate.datamanagement.interfaces.dto.CreateDatasetRequest;
78
import com.datamate.datamanagement.interfaces.dto.DatasetFileResponse;
89
import com.datamate.datamanagement.interfaces.dto.DatasetResponse;
910
import com.datamate.datamanagement.interfaces.dto.UploadFileRequest;
10-
import com.datamate.common.domain.model.ChunkUploadRequest;
11-
import com.datamate.datamanagement.domain.model.dataset.Dataset;
12-
import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
13-
import com.fasterxml.jackson.databind.ObjectMapper;
1411
import org.apache.commons.collections4.CollectionUtils;
1512
import org.mapstruct.Mapper;
1613
import org.mapstruct.Mapping;
@@ -59,6 +56,13 @@ public interface DatasetConverter {
5956
*/
6057
DatasetFileResponse convertToResponse(DatasetFile datasetFile);
6158

59+
60+
/**
61+
* 将数据集文件列表转换为响应
62+
*/
63+
List<DatasetFileResponse> convertToResponseList(List<DatasetFile> datasetFiles);
64+
65+
6266
/**
6367
* 获取数据文件的标签分布
6468
*
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
package com.datamate.datamanagement.interfaces.dto;
2+
3+
import jakarta.validation.constraints.NotEmpty;
4+
5+
import java.util.List;
6+
7+
/**
8+
* Request DTO for copying files into a dataset directory.
9+
*
* @param sourcePaths paths of the source files to copy; constrained by {@code @NotEmpty}
*
10+
* @author dallas
11+
* @since 2025-11-13
12+
*/
13+
public record CopyFilesRequest(@NotEmpty List<String> sourcePaths) {
14+
}

backend/services/data-management-service/src/main/java/com/datamate/datamanagement/interfaces/rest/DatasetFileController.java

Lines changed: 36 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,7 @@
66
import com.datamate.datamanagement.application.DatasetFileApplicationService;
77
import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
88
import com.datamate.datamanagement.interfaces.converter.DatasetConverter;
9-
import com.datamate.datamanagement.interfaces.dto.DatasetFileResponse;
10-
import com.datamate.datamanagement.interfaces.dto.PagedDatasetFileResponse;
11-
import com.datamate.datamanagement.interfaces.dto.UploadFileRequest;
12-
import com.datamate.datamanagement.interfaces.dto.UploadFilesPreRequest;
9+
import com.datamate.datamanagement.interfaces.dto.*;
1310
import jakarta.servlet.http.HttpServletResponse;
1411
import jakarta.validation.Valid;
1512
import lombok.extern.slf4j.Slf4j;
@@ -23,8 +20,8 @@
2320
import org.springframework.http.MediaType;
2421
import org.springframework.http.ResponseEntity;
2522
import org.springframework.web.bind.annotation.*;
26-
import org.springframework.web.multipart.MultipartFile;
2723

24+
import java.util.List;
2825
import java.util.stream.Collectors;
2926

3027
/**
@@ -44,20 +41,20 @@ public DatasetFileController(DatasetFileApplicationService datasetFileApplicatio
4441

4542
@GetMapping
4643
public ResponseEntity<Response<PagedDatasetFileResponse>> getDatasetFiles(
47-
@PathVariable("datasetId") String datasetId,
48-
@RequestParam(value = "page", required = false, defaultValue = "0") Integer page,
49-
@RequestParam(value = "size", required = false, defaultValue = "20") Integer size,
50-
@RequestParam(value = "fileType", required = false) String fileType,
51-
@RequestParam(value = "status", required = false) String status) {
44+
@PathVariable("datasetId") String datasetId,
45+
@RequestParam(value = "page", required = false, defaultValue = "0") Integer page,
46+
@RequestParam(value = "size", required = false, defaultValue = "20") Integer size,
47+
@RequestParam(value = "fileType", required = false) String fileType,
48+
@RequestParam(value = "status", required = false) String status) {
5249
Pageable pageable = PageRequest.of(page != null ? page : 0, size != null ? size : 20);
5350

5451
Page<DatasetFile> filesPage = datasetFileApplicationService.getDatasetFiles(
55-
datasetId, fileType, status, pageable);
52+
datasetId, fileType, status, pageable);
5653

5754
PagedDatasetFileResponse response = new PagedDatasetFileResponse();
5855
response.setContent(filesPage.getContent().stream()
59-
.map(DatasetConverter.INSTANCE::convertToResponse)
60-
.collect(Collectors.toList()));
56+
.map(DatasetConverter.INSTANCE::convertToResponse)
57+
.collect(Collectors.toList()));
6158
response.setPage(filesPage.getNumber());
6259
response.setSize(filesPage.getSize());
6360
response.setTotalElements((int) filesPage.getTotalElements());
@@ -70,8 +67,8 @@ public ResponseEntity<Response<PagedDatasetFileResponse>> getDatasetFiles(
7067

7168
@GetMapping("/{fileId}")
7269
public ResponseEntity<Response<DatasetFileResponse>> getDatasetFileById(
73-
@PathVariable("datasetId") String datasetId,
74-
@PathVariable("fileId") String fileId) {
70+
@PathVariable("datasetId") String datasetId,
71+
@PathVariable("fileId") String fileId) {
7572
try {
7673
DatasetFile datasetFile = datasetFileApplicationService.getDatasetFile(datasetId, fileId);
7774
return ResponseEntity.ok(Response.ok(DatasetConverter.INSTANCE.convertToResponse(datasetFile)));
@@ -82,8 +79,8 @@ public ResponseEntity<Response<DatasetFileResponse>> getDatasetFileById(
8279

8380
@DeleteMapping("/{fileId}")
8481
public ResponseEntity<Response<Void>> deleteDatasetFile(
85-
@PathVariable("datasetId") String datasetId,
86-
@PathVariable("fileId") String fileId) {
82+
@PathVariable("datasetId") String datasetId,
83+
@PathVariable("fileId") String fileId) {
8784
try {
8885
datasetFileApplicationService.deleteDatasetFile(datasetId, fileId);
8986
return ResponseEntity.ok().build();
@@ -101,10 +98,10 @@ public ResponseEntity<Resource> downloadDatasetFileById(@PathVariable("datasetId
10198
Resource resource = datasetFileApplicationService.downloadFile(datasetId, fileId);
10299

103100
return ResponseEntity.ok()
104-
.contentType(MediaType.APPLICATION_OCTET_STREAM)
105-
.header(HttpHeaders.CONTENT_DISPOSITION,
106-
"attachment; filename=\"" + datasetFile.getFileName() + "\"")
107-
.body(resource);
101+
.contentType(MediaType.APPLICATION_OCTET_STREAM)
102+
.header(HttpHeaders.CONTENT_DISPOSITION,
103+
"attachment; filename=\"" + datasetFile.getFileName() + "\"")
104+
.body(resource);
108105
} catch (IllegalArgumentException e) {
109106
return ResponseEntity.status(HttpStatus.NOT_FOUND).build();
110107
} catch (Exception e) {
@@ -136,11 +133,26 @@ public ResponseEntity<Response<String>> preUpload(@PathVariable("datasetId") Str
136133
* @param uploadFileRequest 上传文件请求
137134
*/
138135
@PostMapping("/upload/chunk")
139-
public ResponseEntity<Void> chunkUpload(@PathVariable("datasetId") String datasetId, @Valid UploadFileRequest uploadFileRequest) {
136+
public ResponseEntity<Void> chunkUpload(@PathVariable("datasetId") String datasetId,
137+
@Valid UploadFileRequest uploadFileRequest) {
140138
log.info("file upload reqId:{}, fileNo:{}, total chunk num:{}, current chunkNo:{}",
141-
uploadFileRequest.getReqId(), uploadFileRequest.getFileNo(), uploadFileRequest.getTotalChunkNum(),
142-
uploadFileRequest.getChunkNo());
139+
uploadFileRequest.getReqId(), uploadFileRequest.getFileNo(), uploadFileRequest.getTotalChunkNum(),
140+
uploadFileRequest.getChunkNo());
143141
datasetFileApplicationService.chunkUpload(datasetId, uploadFileRequest);
144142
return ResponseEntity.ok().build();
145143
}
144+
145+
/**
146+
* 将指定路径中的文件拷贝到数据集目录下
147+
*
148+
* @param datasetId 数据集ID
149+
* @param req 源文件路径列表
150+
* @return 数据集文件响应DTO列表
151+
*/
152+
@PostMapping("/upload/copy")
153+
public List<DatasetFileResponse> copyFilesToDatasetDir(@PathVariable("datasetId") String datasetId,
154+
@RequestBody @Valid CopyFilesRequest req) {
155+
List<DatasetFile> datasetFiles = datasetFileApplicationService.copyFilesToDatasetDir(datasetId, req);
156+
return DatasetConverter.INSTANCE.convertToResponseList(datasetFiles);
157+
}
146158
}

backend/services/main-application/src/main/resources/application.yml

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -118,23 +118,15 @@ management:
118118

119119
# 平台配置
120120
datamate:
121-
# JWT配置
122-
jwt:
123-
secret: ${JWT_SECRET:dataMateSecretKey2024ForJWTTokenGeneration}
124-
expiration: ${JWT_EXPIRATION:86400} # 24小时,单位秒
125-
header: Authorization
126-
prefix: "Bearer "
121+
# 通用配置
122+
127123

128124
# 文件存储配置
129125
storage:
130126
type: ${STORAGE_TYPE:local} # local, minio, s3
131127
local:
132128
base-path: ${STORAGE_LOCAL_PATH:./data/storage}
133-
minio:
134-
endpoint: ${MINIO_ENDPOINT:http://localhost:9000}
135-
access-key: ${MINIO_ACCESS_KEY:minioadmin}
136-
secret-key: ${MINIO_SECRET_KEY:minioadmin}
137-
bucket-name: ${MINIO_BUCKET:data-mate}
129+
138130

139131
# Ray执行器配置
140132
ray:
@@ -148,6 +140,12 @@ datamate:
148140
- "numpy"
149141
- "data-juicer"
150142

143+
# 模块配置
144+
145+
# 数据管理服务配置
146+
data-management:
147+
base-path: /dataset
148+
151149
# 数据归集服务配置(可由模块导入叠加)
152150
data-collection: {}
153151

0 commit comments

Comments
 (0)