README-zh.md: 2 changes (1 addition, 1 deletion)
@@ -7,7 +7,7 @@
![GitHub Stars](https://img.shields.io/github/stars/ModelEngine-Group/DataMate)
![GitHub Forks](https://img.shields.io/github/forks/ModelEngine-Group/DataMate)
![GitHub Issues](https://img.shields.io/github/issues/ModelEngine-Group/DataMate)
![GitHub License](https://img.shields.io/github/license/ModelEngine-Group/DataMate)
![GitHub License](https://img.shields.io/github/license/ModelEngine-Group/datamate-docs)

**DataMate is an enterprise-grade data processing platform for model fine-tuning and RAG retrieval, supporting core capabilities including data collection, data management, an operator marketplace, data cleaning, data synthesis, data annotation, data evaluation, and knowledge generation.**

@@ -8,7 +8,6 @@
import com.datamate.collection.interfaces.dto.*;
import com.datamate.common.interfaces.PagedResponse;
import com.datamate.datamanagement.application.DatasetApplicationService;
import com.datamate.datamanagement.domain.model.dataset.Dataset;
import com.datamate.datamanagement.interfaces.converter.DatasetConverter;
import com.datamate.datamanagement.interfaces.dto.DatasetResponse;
import jakarta.validation.Valid;
@@ -49,7 +49,7 @@ public class DatasetApplicationService {
private final FileMetadataService fileMetadataService;
private final ObjectMapper objectMapper;

@Value("${dataset.base.path:/dataset}")
@Value("${datamate.data-management.base-path:/dataset}")
private String datasetBasePath;

/**
@@ -4,6 +4,7 @@
import com.datamate.common.domain.model.FileUploadResult;
import com.datamate.common.domain.service.FileService;
import com.datamate.common.domain.utils.AnalyzerUtils;
import com.datamate.common.infrastructure.exception.BusinessAssert;
import com.datamate.common.infrastructure.exception.BusinessException;
import com.datamate.common.infrastructure.exception.SystemErrorCode;
import com.datamate.datamanagement.domain.contants.DatasetConstant;
@@ -13,12 +14,14 @@
import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetFileRepository;
import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetRepository;
import com.datamate.datamanagement.interfaces.converter.DatasetConverter;
import com.datamate.datamanagement.interfaces.dto.CopyFilesRequest;
import com.datamate.datamanagement.interfaces.dto.UploadFileRequest;
import com.datamate.datamanagement.interfaces.dto.UploadFilesPreRequest;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import jakarta.servlet.http.HttpServletResponse;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.FileUtils;
import org.apache.ibatis.session.RowBounds;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
@@ -57,7 +60,7 @@ public class DatasetFileApplicationService {
private final DatasetRepository datasetRepository;
private final FileService fileService;

@Value("${dataset.base.path:/dataset}")
@Value("${datamate.data-management.base-path:/dataset}")
private String datasetBasePath;

@Autowired
@@ -257,4 +260,51 @@ private void saveFileInfoToDb(FileUploadResult fileUploadResult, UploadFileReque
dataset.active();
datasetRepository.updateById(dataset);
}

/**
* Copies files into the dataset directory
*
* @param datasetId dataset ID
* @param req copy-files request
* @return list of copied files
*/
@Transactional
public List<DatasetFile> copyFilesToDatasetDir(String datasetId, CopyFilesRequest req) {
Dataset dataset = datasetRepository.getById(datasetId);
BusinessAssert.notNull(dataset, SystemErrorCode.RESOURCE_NOT_FOUND);
List<DatasetFile> copiedFiles = new ArrayList<>();
for (String sourceFilePath : req.sourcePaths()) {
Path sourcePath = Paths.get(sourceFilePath);
if (!Files.exists(sourcePath) || !Files.isRegularFile(sourcePath)) {
log.warn("Source file does not exist or is not a regular file: {}", sourceFilePath);
continue;
}
String fileName = sourcePath.getFileName().toString();
File targetFile = new File(dataset.getPath(), fileName);
try {
FileUtils.copyInputStreamToFile(Files.newInputStream(sourcePath), targetFile);
} catch (IOException e) {
log.error("Failed to copy file: {}", sourceFilePath, e);
continue;
}

LocalDateTime currentTime = LocalDateTime.now();
DatasetFile datasetFile = DatasetFile.builder()
.id(UUID.randomUUID().toString())
.datasetId(datasetId)
.fileName(fileName)
.fileType(AnalyzerUtils.getExtension(fileName))
.fileSize(targetFile.length())
.filePath(targetFile.getPath())
.uploadTime(currentTime)
.lastAccessTime(currentTime)
.build();
datasetFileRepository.save(datasetFile);
dataset.addFile(datasetFile);
copiedFiles.add(datasetFile);
}
dataset.active();
datasetRepository.updateById(dataset);
return copiedFiles;
}
}
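For reference, the copy step above streams each source file into the dataset directory with Commons IO. Below is a minimal standalone sketch of the same pattern, with hypothetical paths and no Spring or repository wiring:

import org.apache.commons.io.FileUtils;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

public class CopyFileSketch {
    public static void main(String[] args) throws IOException {
        // Hypothetical source file and dataset directory; adjust to your environment.
        Path source = Paths.get("/tmp/source/report.csv");
        File target = new File("/dataset/demo-dataset", source.getFileName().toString());

        // Skip anything that is not an existing regular file, mirroring the guard in the service.
        if (Files.exists(source) && Files.isRegularFile(source)) {
            // copyInputStreamToFile closes the stream and creates missing parent directories.
            FileUtils.copyInputStreamToFile(Files.newInputStream(source), target);
        }
    }
}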
@@ -1,16 +1,13 @@
package com.datamate.datamanagement.interfaces.converter;

import com.datamate.common.infrastructure.exception.BusinessException;
import com.datamate.common.infrastructure.exception.SystemErrorCode;
import com.datamate.common.domain.model.ChunkUploadRequest;
import com.datamate.datamanagement.domain.model.dataset.Dataset;
import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
import com.datamate.datamanagement.domain.model.dataset.FileTag;
import com.datamate.datamanagement.interfaces.dto.CreateDatasetRequest;
import com.datamate.datamanagement.interfaces.dto.DatasetFileResponse;
import com.datamate.datamanagement.interfaces.dto.DatasetResponse;
import com.datamate.datamanagement.interfaces.dto.UploadFileRequest;
import com.datamate.common.domain.model.ChunkUploadRequest;
import com.datamate.datamanagement.domain.model.dataset.Dataset;
import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.commons.collections4.CollectionUtils;
import org.mapstruct.Mapper;
import org.mapstruct.Mapping;
@@ -59,6 +56,13 @@ public interface DatasetConverter {
*/
DatasetFileResponse convertToResponse(DatasetFile datasetFile);


/**
* Converts a list of dataset files to response DTOs
*/
List<DatasetFileResponse> convertToResponseList(List<DatasetFile> datasetFiles);


/**
* Gets the tag distribution of data files
*
@@ -0,0 +1,14 @@
package com.datamate.datamanagement.interfaces.dto;

import jakarta.validation.constraints.NotEmpty;

import java.util.List;

/**
* Copy-files request DTO
*
* @author dallas
* @since 2025-11-13
*/
public record CopyFilesRequest(@NotEmpty List<String> sourcePaths) {
}
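The record maps straight from a JSON body carrying a non-empty sourcePaths array. A small hedged fragment showing how it would be built in code (the paths are placeholders, not paths from the repository):

import java.util.List;

// Fragment only: CopyFilesRequest is the record declared above; the paths are illustrative.
CopyFilesRequest req = new CopyFilesRequest(List.of(
        "/data/staging/train.jsonl",
        "/data/staging/eval.jsonl"));

With @Valid applied to the controller parameter, @NotEmpty rejects a missing or empty sourcePaths list before the service is invoked.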
@@ -6,10 +6,7 @@
import com.datamate.datamanagement.application.DatasetFileApplicationService;
import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
import com.datamate.datamanagement.interfaces.converter.DatasetConverter;
import com.datamate.datamanagement.interfaces.dto.DatasetFileResponse;
import com.datamate.datamanagement.interfaces.dto.PagedDatasetFileResponse;
import com.datamate.datamanagement.interfaces.dto.UploadFileRequest;
import com.datamate.datamanagement.interfaces.dto.UploadFilesPreRequest;
import com.datamate.datamanagement.interfaces.dto.*;
import jakarta.servlet.http.HttpServletResponse;
import jakarta.validation.Valid;
import lombok.extern.slf4j.Slf4j;
@@ -23,8 +20,8 @@
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.*;
import org.springframework.web.multipart.MultipartFile;

import java.util.List;
import java.util.stream.Collectors;

/**
Expand All @@ -44,20 +41,20 @@ public DatasetFileController(DatasetFileApplicationService datasetFileApplicatio

@GetMapping
public ResponseEntity<Response<PagedDatasetFileResponse>> getDatasetFiles(
@PathVariable("datasetId") String datasetId,
@RequestParam(value = "page", required = false, defaultValue = "0") Integer page,
@RequestParam(value = "size", required = false, defaultValue = "20") Integer size,
@RequestParam(value = "fileType", required = false) String fileType,
@RequestParam(value = "status", required = false) String status) {
@PathVariable("datasetId") String datasetId,
@RequestParam(value = "page", required = false, defaultValue = "0") Integer page,
@RequestParam(value = "size", required = false, defaultValue = "20") Integer size,
@RequestParam(value = "fileType", required = false) String fileType,
@RequestParam(value = "status", required = false) String status) {
Pageable pageable = PageRequest.of(page != null ? page : 0, size != null ? size : 20);

Page<DatasetFile> filesPage = datasetFileApplicationService.getDatasetFiles(
datasetId, fileType, status, pageable);
datasetId, fileType, status, pageable);

PagedDatasetFileResponse response = new PagedDatasetFileResponse();
response.setContent(filesPage.getContent().stream()
.map(DatasetConverter.INSTANCE::convertToResponse)
.collect(Collectors.toList()));
.map(DatasetConverter.INSTANCE::convertToResponse)
.collect(Collectors.toList()));
response.setPage(filesPage.getNumber());
response.setSize(filesPage.getSize());
response.setTotalElements((int) filesPage.getTotalElements());
@@ -70,8 +67,8 @@ public ResponseEntity<Response<PagedDatasetFileResponse>> getDatasetFiles(

@GetMapping("/{fileId}")
public ResponseEntity<Response<DatasetFileResponse>> getDatasetFileById(
@PathVariable("datasetId") String datasetId,
@PathVariable("fileId") String fileId) {
@PathVariable("datasetId") String datasetId,
@PathVariable("fileId") String fileId) {
try {
DatasetFile datasetFile = datasetFileApplicationService.getDatasetFile(datasetId, fileId);
return ResponseEntity.ok(Response.ok(DatasetConverter.INSTANCE.convertToResponse(datasetFile)));
@@ -82,8 +79,8 @@ public ResponseEntity<Response<DatasetFileResponse>> getDatasetFileById(

@DeleteMapping("/{fileId}")
public ResponseEntity<Response<Void>> deleteDatasetFile(
@PathVariable("datasetId") String datasetId,
@PathVariable("fileId") String fileId) {
@PathVariable("datasetId") String datasetId,
@PathVariable("fileId") String fileId) {
try {
datasetFileApplicationService.deleteDatasetFile(datasetId, fileId);
return ResponseEntity.ok().build();
@@ -101,10 +98,10 @@ public ResponseEntity<Resource> downloadDatasetFileById(@PathVariable("datasetId
Resource resource = datasetFileApplicationService.downloadFile(datasetId, fileId);

return ResponseEntity.ok()
.contentType(MediaType.APPLICATION_OCTET_STREAM)
.header(HttpHeaders.CONTENT_DISPOSITION,
"attachment; filename=\"" + datasetFile.getFileName() + "\"")
.body(resource);
.contentType(MediaType.APPLICATION_OCTET_STREAM)
.header(HttpHeaders.CONTENT_DISPOSITION,
"attachment; filename=\"" + datasetFile.getFileName() + "\"")
.body(resource);
} catch (IllegalArgumentException e) {
return ResponseEntity.status(HttpStatus.NOT_FOUND).build();
} catch (Exception e) {
@@ -136,11 +133,26 @@ public ResponseEntity<Response<String>> preUpload(@PathVariable("datasetId") Str
* @param uploadFileRequest file upload request
*/
@PostMapping("/upload/chunk")
public ResponseEntity<Void> chunkUpload(@PathVariable("datasetId") String datasetId, @Valid UploadFileRequest uploadFileRequest) {
public ResponseEntity<Void> chunkUpload(@PathVariable("datasetId") String datasetId,
@Valid UploadFileRequest uploadFileRequest) {
log.info("file upload reqId:{}, fileNo:{}, total chunk num:{}, current chunkNo:{}",
uploadFileRequest.getReqId(), uploadFileRequest.getFileNo(), uploadFileRequest.getTotalChunkNum(),
uploadFileRequest.getChunkNo());
uploadFileRequest.getReqId(), uploadFileRequest.getFileNo(), uploadFileRequest.getTotalChunkNum(),
uploadFileRequest.getChunkNo());
datasetFileApplicationService.chunkUpload(datasetId, uploadFileRequest);
return ResponseEntity.ok().build();
}

/**
* Copies the files at the given source paths into the dataset directory
*
* @param datasetId dataset ID
* @param req list of source file paths
* @return list of dataset file response DTOs
*/
@PostMapping("/upload/copy")
public List<DatasetFileResponse> copyFilesToDatasetDir(@PathVariable("datasetId") String datasetId,
@RequestBody @Valid CopyFilesRequest req) {
List<DatasetFile> datasetFiles = datasetFileApplicationService.copyFilesToDatasetDir(datasetId, req);
return DatasetConverter.INSTANCE.convertToResponseList(datasetFiles);
}
}
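A hedged client-side sketch of calling the new copy endpoint. The host, port, and route prefix are assumptions; the real prefix comes from the controller's class-level @RequestMapping, which is outside this diff:

import org.springframework.http.ResponseEntity;
import org.springframework.web.client.RestTemplate;

import java.util.List;
import java.util.Map;

public class CopyFilesClientSketch {
    public static void main(String[] args) {
        RestTemplate restTemplate = new RestTemplate();

        // Assumed URL; substitute the actual dataset-files route exposed by the deployment.
        String url = "http://localhost:8080/api/datasets/{datasetId}/files/upload/copy";

        // Body mirrors CopyFilesRequest: a non-empty list of absolute source paths (placeholders here).
        Map<String, Object> body = Map.of("sourcePaths",
                List.of("/data/staging/train.jsonl", "/data/staging/eval.jsonl"));

        // Returns the JSON array of DatasetFileResponse entries as a raw string for inspection.
        ResponseEntity<String> response =
                restTemplate.postForEntity(url, body, String.class, "your-dataset-id");
        System.out.println("Copied files: " + response.getBody());
    }
}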
@@ -118,23 +118,15 @@ management:

# Platform configuration
datamate:
# JWT configuration
jwt:
secret: ${JWT_SECRET:dataMateSecretKey2024ForJWTTokenGeneration}
expiration: ${JWT_EXPIRATION:86400} # 24 hours, in seconds
header: Authorization
prefix: "Bearer "
# Common configuration


# File storage configuration
storage:
type: ${STORAGE_TYPE:local} # local, minio, s3
local:
base-path: ${STORAGE_LOCAL_PATH:./data/storage}
minio:
endpoint: ${MINIO_ENDPOINT:http://localhost:9000}
access-key: ${MINIO_ACCESS_KEY:minioadmin}
secret-key: ${MINIO_SECRET_KEY:minioadmin}
bucket-name: ${MINIO_BUCKET:data-mate}


# Ray executor configuration
ray:
Expand All @@ -148,6 +140,12 @@ datamate:
- "numpy"
- "data-juicer"

# Module configuration

# Data management service configuration
data-management:
base-path: /dataset

# Data collection service configuration (can be layered in via module imports)
data-collection: {}
