diff --git a/README-zh.md b/README-zh.md
index ea89a6b5b..921a3def9 100644
--- a/README-zh.md
+++ b/README-zh.md
@@ -7,7 +7,7 @@
 ![GitHub Stars](https://img.shields.io/github/stars/ModelEngine-Group/DataMate)
 ![GitHub Forks](https://img.shields.io/github/forks/ModelEngine-Group/DataMate)
 ![GitHub Issues](https://img.shields.io/github/issues/ModelEngine-Group/DataMate)
-![GitHub License](https://img.shields.io/github/license/ModelEngine-Group/DataMate)
+![GitHub License](https://img.shields.io/github/license/ModelEngine-Group/datamate-docs)
 
 **DataMate是面向模型微调与RAG检索的企业级数据处理平台,支持数据归集、数据管理、算子市场、数据清洗、数据合成、数据标注、数据评估、知识生成等核心功能。**
 
diff --git a/backend/services/data-collection-service/src/main/java/com/datamate/collection/interfaces/rest/CollectionTaskController.java b/backend/services/data-collection-service/src/main/java/com/datamate/collection/interfaces/rest/CollectionTaskController.java
index 172d9b029..8278e3733 100644
--- a/backend/services/data-collection-service/src/main/java/com/datamate/collection/interfaces/rest/CollectionTaskController.java
+++ b/backend/services/data-collection-service/src/main/java/com/datamate/collection/interfaces/rest/CollectionTaskController.java
@@ -8,7 +8,6 @@
 import com.datamate.collection.interfaces.dto.*;
 import com.datamate.common.interfaces.PagedResponse;
 import com.datamate.datamanagement.application.DatasetApplicationService;
-import com.datamate.datamanagement.domain.model.dataset.Dataset;
 import com.datamate.datamanagement.interfaces.converter.DatasetConverter;
 import com.datamate.datamanagement.interfaces.dto.DatasetResponse;
 import jakarta.validation.Valid;
diff --git a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetApplicationService.java b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetApplicationService.java
index 0c13bb0cd..f041fc0cb 100644
--- a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetApplicationService.java
+++ b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetApplicationService.java
@@ -49,7 +49,7 @@ public class DatasetApplicationService {
     private final FileMetadataService fileMetadataService;
     private final ObjectMapper objectMapper;
 
-    @Value("${dataset.base.path:/dataset}")
+    @Value("${datamate.data-management.base-path:/dataset}")
     private String datasetBasePath;
 
     /**
diff --git a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java
index 9f23cd872..e129a3823 100644
--- a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java
+++ b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java
@@ -4,6 +4,7 @@
 import com.datamate.common.domain.model.FileUploadResult;
 import com.datamate.common.domain.service.FileService;
 import com.datamate.common.domain.utils.AnalyzerUtils;
+import com.datamate.common.infrastructure.exception.BusinessAssert;
 import com.datamate.common.infrastructure.exception.BusinessException;
 import com.datamate.common.infrastructure.exception.SystemErrorCode;
 import com.datamate.datamanagement.domain.contants.DatasetConstant;
@@ -13,12 +14,14 @@
 import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetFileRepository;
 import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetRepository;
 import com.datamate.datamanagement.interfaces.converter.DatasetConverter;
+import com.datamate.datamanagement.interfaces.dto.CopyFilesRequest;
 import com.datamate.datamanagement.interfaces.dto.UploadFileRequest;
 import com.datamate.datamanagement.interfaces.dto.UploadFilesPreRequest;
 import com.fasterxml.jackson.core.JsonProcessingException;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import jakarta.servlet.http.HttpServletResponse;
 import lombok.extern.slf4j.Slf4j;
+import org.apache.commons.io.FileUtils;
 import org.apache.ibatis.session.RowBounds;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.beans.factory.annotation.Value;
@@ -57,7 +60,7 @@ public class DatasetFileApplicationService {
     private final DatasetRepository datasetRepository;
     private final FileService fileService;
 
-    @Value("${dataset.base.path:/dataset}")
+    @Value("${datamate.data-management.base-path:/dataset}")
    private String datasetBasePath;
 
     @Autowired
@@ -257,4 +260,51 @@ private void saveFileInfoToDb(FileUploadResult fileUploadResult, UploadFileReque
         dataset.active();
         datasetRepository.updateById(dataset);
     }
+
+    /**
+     * 复制文件到数据集目录
+     *
+     * @param datasetId 数据集id
+     * @param req 复制文件请求
+     * @return 复制的文件列表
+     */
+    @Transactional
+    public List<DatasetFile> copyFilesToDatasetDir(String datasetId, CopyFilesRequest req) {
+        Dataset dataset = datasetRepository.getById(datasetId);
+        BusinessAssert.notNull(dataset, SystemErrorCode.RESOURCE_NOT_FOUND);
+        List<DatasetFile> copiedFiles = new ArrayList<>();
+        for (String sourceFilePath : req.sourcePaths()) {
+            Path sourcePath = Paths.get(sourceFilePath);
+            if (!Files.exists(sourcePath) || !Files.isRegularFile(sourcePath)) {
+                log.warn("Source file does not exist or is not a regular file: {}", sourceFilePath);
+                continue;
+            }
+            String fileName = sourcePath.getFileName().toString();
+            File targetFile = new File(dataset.getPath(), fileName);
+            try {
+                FileUtils.copyInputStreamToFile(Files.newInputStream(sourcePath), targetFile);
+            } catch (IOException e) {
+                log.error("Failed to copy file: {}", sourceFilePath, e);
+                continue;
+            }
+
+            LocalDateTime currentTime = LocalDateTime.now();
+            DatasetFile datasetFile = DatasetFile.builder()
+                    .id(UUID.randomUUID().toString())
+                    .datasetId(datasetId)
+                    .fileName(fileName)
+                    .fileType(AnalyzerUtils.getExtension(fileName))
+                    .fileSize(targetFile.length())
+                    .filePath(targetFile.getPath())
+                    .uploadTime(currentTime)
+                    .lastAccessTime(currentTime)
+                    .build();
+            datasetFileRepository.save(datasetFile);
+            dataset.addFile(datasetFile);
+            copiedFiles.add(datasetFile);
+        }
+        dataset.active();
+        datasetRepository.updateById(dataset);
+        return copiedFiles;
+    }
 }
diff --git a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/interfaces/converter/DatasetConverter.java b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/interfaces/converter/DatasetConverter.java
index 97749ba23..96c8de78b 100644
--- a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/interfaces/converter/DatasetConverter.java
+++ b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/interfaces/converter/DatasetConverter.java
@@ -1,16 +1,13 @@
 package com.datamate.datamanagement.interfaces.converter;
 
-import com.datamate.common.infrastructure.exception.BusinessException;
-import com.datamate.common.infrastructure.exception.SystemErrorCode;
+import com.datamate.common.domain.model.ChunkUploadRequest;
+import com.datamate.datamanagement.domain.model.dataset.Dataset;
+import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
 import com.datamate.datamanagement.domain.model.dataset.FileTag;
 import com.datamate.datamanagement.interfaces.dto.CreateDatasetRequest;
 import com.datamate.datamanagement.interfaces.dto.DatasetFileResponse;
 import com.datamate.datamanagement.interfaces.dto.DatasetResponse;
 import com.datamate.datamanagement.interfaces.dto.UploadFileRequest;
-import com.datamate.common.domain.model.ChunkUploadRequest;
-import com.datamate.datamanagement.domain.model.dataset.Dataset;
-import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
-import com.fasterxml.jackson.databind.ObjectMapper;
 import org.apache.commons.collections4.CollectionUtils;
 import org.mapstruct.Mapper;
 import org.mapstruct.Mapping;
@@ -59,6 +56,13 @@ public interface DatasetConverter {
      */
     DatasetFileResponse convertToResponse(DatasetFile datasetFile);
 
+
+    /**
+     * 将数据集文件列表转换为响应
+     */
+    List<DatasetFileResponse> convertToResponseList(List<DatasetFile> datasetFiles);
+
+
     /**
      * 获取数据文件的标签分布
      *
diff --git a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/interfaces/dto/CopyFilesRequest.java b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/interfaces/dto/CopyFilesRequest.java
new file mode 100644
index 000000000..83234bae4
--- /dev/null
+++ b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/interfaces/dto/CopyFilesRequest.java
@@ -0,0 +1,14 @@
+package com.datamate.datamanagement.interfaces.dto;
+
+import jakarta.validation.constraints.NotEmpty;
+
+import java.util.List;
+
+/**
+ * 复制文件请求DTO
+ *
+ * @author dallas
+ * @since 2025-11-13
+ */
+public record CopyFilesRequest(@NotEmpty List<String> sourcePaths) {
+}
diff --git a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/interfaces/rest/DatasetFileController.java b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/interfaces/rest/DatasetFileController.java
index e5cb69985..36c628e69 100644
--- a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/interfaces/rest/DatasetFileController.java
+++ b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/interfaces/rest/DatasetFileController.java
@@ -6,10 +6,7 @@
 import com.datamate.datamanagement.application.DatasetFileApplicationService;
 import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
 import com.datamate.datamanagement.interfaces.converter.DatasetConverter;
-import com.datamate.datamanagement.interfaces.dto.DatasetFileResponse;
-import com.datamate.datamanagement.interfaces.dto.PagedDatasetFileResponse;
-import com.datamate.datamanagement.interfaces.dto.UploadFileRequest;
-import com.datamate.datamanagement.interfaces.dto.UploadFilesPreRequest;
+import com.datamate.datamanagement.interfaces.dto.*;
 import jakarta.servlet.http.HttpServletResponse;
 import jakarta.validation.Valid;
 import lombok.extern.slf4j.Slf4j;
@@ -23,8 +20,8 @@
 import org.springframework.http.MediaType;
 import org.springframework.http.ResponseEntity;
 import org.springframework.web.bind.annotation.*;
-import org.springframework.web.multipart.MultipartFile;
 
+import java.util.List;
 import java.util.stream.Collectors;
 
 /**
@@ -44,20 +41,20 @@ public DatasetFileController(DatasetFileApplicationService datasetFileApplicatio
     @GetMapping
     public ResponseEntity<Response<PagedDatasetFileResponse>> getDatasetFiles(
-            @PathVariable("datasetId") String datasetId,
-            @RequestParam(value = "page", required = false, defaultValue = "0") Integer page,
-            @RequestParam(value = "size", required = false, defaultValue = "20") Integer size,
-            @RequestParam(value = "fileType", required = false) String fileType,
-            @RequestParam(value = "status", required = false) String status) {
+        @PathVariable("datasetId") String datasetId,
+        @RequestParam(value = "page", required = false, defaultValue = "0") Integer page,
+        @RequestParam(value = "size", required = false, defaultValue = "20") Integer size,
+        @RequestParam(value = "fileType", required = false) String fileType,
+        @RequestParam(value = "status", required = false) String status) {
         Pageable pageable = PageRequest.of(page != null ? page : 0, size != null ? size : 20);
         Page<DatasetFile> filesPage = datasetFileApplicationService.getDatasetFiles(
-                datasetId, fileType, status, pageable);
+            datasetId, fileType, status, pageable);
         PagedDatasetFileResponse response = new PagedDatasetFileResponse();
         response.setContent(filesPage.getContent().stream()
-                .map(DatasetConverter.INSTANCE::convertToResponse)
-                .collect(Collectors.toList()));
+            .map(DatasetConverter.INSTANCE::convertToResponse)
+            .collect(Collectors.toList()));
         response.setPage(filesPage.getNumber());
         response.setSize(filesPage.getSize());
         response.setTotalElements((int) filesPage.getTotalElements());
@@ -70,8 +67,8 @@ public ResponseEntity<Response<PagedDatasetFileResponse>> getDatasetFiles(
 
     @GetMapping("/{fileId}")
     public ResponseEntity<Response<DatasetFileResponse>> getDatasetFileById(
-            @PathVariable("datasetId") String datasetId,
-            @PathVariable("fileId") String fileId) {
+        @PathVariable("datasetId") String datasetId,
+        @PathVariable("fileId") String fileId) {
         try {
             DatasetFile datasetFile = datasetFileApplicationService.getDatasetFile(datasetId, fileId);
             return ResponseEntity.ok(Response.ok(DatasetConverter.INSTANCE.convertToResponse(datasetFile)));
@@ -82,8 +79,8 @@ public ResponseEntity<Response<DatasetFileResponse>> getDatasetFileById(
 
     @DeleteMapping("/{fileId}")
     public ResponseEntity<Response<Void>> deleteDatasetFile(
-            @PathVariable("datasetId") String datasetId,
-            @PathVariable("fileId") String fileId) {
+        @PathVariable("datasetId") String datasetId,
+        @PathVariable("fileId") String fileId) {
         try {
             datasetFileApplicationService.deleteDatasetFile(datasetId, fileId);
             return ResponseEntity.ok().build();
@@ -101,10 +98,10 @@ public ResponseEntity downloadDatasetFileById(@PathVariable("datasetId
             Resource resource = datasetFileApplicationService.downloadFile(datasetId, fileId);
 
             return ResponseEntity.ok()
-                    .contentType(MediaType.APPLICATION_OCTET_STREAM)
-                    .header(HttpHeaders.CONTENT_DISPOSITION,
-                            "attachment; filename=\"" + datasetFile.getFileName() + "\"")
-                    .body(resource);
+                .contentType(MediaType.APPLICATION_OCTET_STREAM)
+                .header(HttpHeaders.CONTENT_DISPOSITION,
+                    "attachment; filename=\"" + datasetFile.getFileName() + "\"")
+                .body(resource);
         } catch (IllegalArgumentException e) {
             return ResponseEntity.status(HttpStatus.NOT_FOUND).build();
         } catch (Exception e) {
@@ -136,11 +133,26 @@ public ResponseEntity<Response<String>> preUpload(@PathVariable("datasetId") Str
      * @param uploadFileRequest 上传文件请求
      */
     @PostMapping("/upload/chunk")
-    public ResponseEntity<Void> chunkUpload(@PathVariable("datasetId") String datasetId, @Valid UploadFileRequest uploadFileRequest) {
+    public ResponseEntity<Void> chunkUpload(@PathVariable("datasetId") String datasetId,
+                                            @Valid UploadFileRequest uploadFileRequest) {
         log.info("file upload reqId:{}, fileNo:{}, total chunk num:{}, current chunkNo:{}",
-                uploadFileRequest.getReqId(), uploadFileRequest.getFileNo(), uploadFileRequest.getTotalChunkNum(),
-                uploadFileRequest.getChunkNo());
+            uploadFileRequest.getReqId(), uploadFileRequest.getFileNo(), uploadFileRequest.getTotalChunkNum(),
+            uploadFileRequest.getChunkNo());
         datasetFileApplicationService.chunkUpload(datasetId, uploadFileRequest);
         return ResponseEntity.ok().build();
     }
+
+    /**
+     * 将指定路径中的文件拷贝到数据集目录下
+     *
+     * @param datasetId 数据集ID
+     * @param req 源文件路径列表
+     * @return 数据集文件响应DTO列表
+     */
+    @PostMapping("/upload/copy")
+    public List<DatasetFileResponse> copyFilesToDatasetDir(@PathVariable("datasetId") String datasetId,
+                                                           @RequestBody @Valid CopyFilesRequest req) {
+        List<DatasetFile> datasetFiles = datasetFileApplicationService.copyFilesToDatasetDir(datasetId, req);
+        return DatasetConverter.INSTANCE.convertToResponseList(datasetFiles);
+    }
 }
diff --git a/backend/services/main-application/src/main/resources/application.yml b/backend/services/main-application/src/main/resources/application.yml
index b6fed0d13..3f529161e 100644
--- a/backend/services/main-application/src/main/resources/application.yml
+++ b/backend/services/main-application/src/main/resources/application.yml
@@ -118,23 +118,15 @@
 management:
 
 
 # 平台配置
 datamate:
-  # JWT配置
-  jwt:
-    secret: ${JWT_SECRET:dataMateSecretKey2024ForJWTTokenGeneration}
-    expiration: ${JWT_EXPIRATION:86400} # 24小时,单位秒
-    header: Authorization
-    prefix: "Bearer "
+  # 通用配置
+
+  # 文件存储配置
   storage:
     type: ${STORAGE_TYPE:local} # local, minio, s3
     local:
       base-path: ${STORAGE_LOCAL_PATH:./data/storage}
-    minio:
-      endpoint: ${MINIO_ENDPOINT:http://localhost:9000}
-      access-key: ${MINIO_ACCESS_KEY:minioadmin}
-      secret-key: ${MINIO_SECRET_KEY:minioadmin}
-      bucket-name: ${MINIO_BUCKET:data-mate}
 
   # Ray执行器配置
   ray:
@@ -148,6 +140,12 @@ datamate:
       - "numpy"
      - "data-juicer"
 
+  # 模块配置
+
+  # 数据管理服务配置
+  data-management:
+    base-path: /dataset
+
   # 数据归集服务配置(可由模块导入叠加)
   data-collection: {}
 