1111import com .datamate .datamanagement .domain .model .dataset .Tag ;
1212import com .datamate .datamanagement .infrastructure .client .CollectionTaskClient ;
1313import com .datamate .datamanagement .infrastructure .client .dto .CollectionTaskDetailResponse ;
14- import com .datamate .datamanagement .infrastructure .client .dto .LocalCollectionConfig ;
1514import com .datamate .datamanagement .infrastructure .exception .DataManagementErrorCode ;
1615import com .datamate .datamanagement .infrastructure .persistence .mapper .TagMapper ;
1716import com .datamate .datamanagement .infrastructure .persistence .repository .DatasetFileRepository ;
1817import com .datamate .datamanagement .infrastructure .persistence .repository .DatasetRepository ;
1918import com .datamate .datamanagement .interfaces .converter .DatasetConverter ;
2019import com .datamate .datamanagement .interfaces .dto .*;
21- import com .fasterxml .jackson .databind .ObjectMapper ;
2220import lombok .RequiredArgsConstructor ;
2321import lombok .extern .slf4j .Slf4j ;
2422import org .apache .commons .collections4 .CollectionUtils ;
2826import org .springframework .transaction .annotation .Transactional ;
2927import org .springframework .util .StringUtils ;
3028
29+ import java .io .IOException ;
3130import java .nio .file .Files ;
31+ import java .nio .file .Path ;
3232import java .nio .file .Paths ;
3333import java .util .*;
34- import java .util .function .Function ;
3534import java .util .stream .Collectors ;
35+ import java .util .stream .Stream ;
3636
3737/**
3838 * 数据集应用服务(对齐 DB schema,使用 UUID 字符串主键)
@@ -46,8 +46,7 @@ public class DatasetApplicationService {
4646 private final TagMapper tagMapper ;
4747 private final DatasetFileRepository datasetFileRepository ;
4848 private final CollectionTaskClient collectionTaskClient ;
49- private final FileMetadataService fileMetadataService ;
50- private final ObjectMapper objectMapper ;
49+ private final DatasetFileApplicationService datasetFileApplicationService ;
5150
5251 @ Value ("${datamate.data-management.base-path:/dataset}" )
5352 private String datasetBasePath ;
@@ -223,68 +222,38 @@ public AllDatasetStatisticsResponse getAllDatasetStatistics() {
223222 @ Async
224223 public void processDataSourceAsync (String datasetId , String dataSourceId ) {
225224 try {
226- log .info ("开始处理数据源文件扫描,数据集ID : {}, 数据源ID : {}" , datasetId , dataSourceId );
225+ log .info ("Initiating data source file scanning, dataset ID : {}, collection task ID : {}" , datasetId , dataSourceId );
227226 List <String > filePaths = getFilePaths (dataSourceId );
228227 if (CollectionUtils .isEmpty (filePaths )) {
229228 return ;
230229 }
231- log .info ("开始扫描文件,共 {} 个文件路径" , filePaths .size ());
232-
233- List <DatasetFile > datasetFiles = fileMetadataService .scanFiles (filePaths , datasetId );
234- // 查询数据集中已存在的文件
235- List <DatasetFile > existDatasetFileList = datasetFileRepository .findAllByDatasetId (datasetId );
236- Map <String , DatasetFile > existDatasetFilePathMap = existDatasetFileList .stream ().collect (Collectors .toMap (DatasetFile ::getFilePath , Function .identity ()));
237- Dataset dataset = datasetRepository .getById (datasetId );
238- dataset .setFiles (existDatasetFileList );
239-
240- // 批量同步数据集文件表
241- asyncDatasetFile (datasetFiles , existDatasetFilePathMap , dataset , existDatasetFileList , filePaths );
242- datasetRepository .updateById (dataset );
230+ log .info ("Starting file scan, total files: {}" , filePaths .size ());
231+ datasetFileApplicationService .copyFilesToDatasetDir (datasetId , new CopyFilesRequest (filePaths ));
243232 } catch (Exception e ) {
244233 log .error ("处理数据源文件扫描失败,数据集ID: {}, 数据源ID: {}" , datasetId , dataSourceId , e );
245234 }
246235 }
247236
248- private void asyncDatasetFile (List <DatasetFile > datasetFiles , Map <String , DatasetFile > existDatasetFilePathMap , Dataset dataset , List <DatasetFile > existDatasetFileList , List <String > filePaths ) {
249- if (CollectionUtils .isNotEmpty (datasetFiles )) {
250- for (DatasetFile datasetFile : datasetFiles ) {
251- if (existDatasetFilePathMap .containsKey (datasetFile .getFilePath ())) {
252- DatasetFile existDatasetFile = existDatasetFilePathMap .get (datasetFile .getFilePath ());
253- dataset .removeFile (existDatasetFile );
254- existDatasetFile .setFileSize (datasetFile .getFileSize ());
255- dataset .addFile (existDatasetFile );
256- dataset .active ();
257- datasetFileRepository .updateById (existDatasetFile );
258- } else {
259- dataset .addFile (datasetFile );
260- dataset .active ();
261- datasetFileRepository .save (datasetFile );
262- }
263- }
264- log .info ("文件元数据写入完成,共写入 {} 条记录" , datasetFiles .size ());
265- } else {
266- log .warn ("未扫描到有效文件" );
267- }
268- for (DatasetFile datasetFile : existDatasetFileList ) {
269- String existFilePath = datasetFile .getFilePath ();
270- for (String filePath : filePaths ) {
271- if (existFilePath .equals (filePath ) || existFilePath .startsWith (filePath )) {
272- if (Files .notExists (Paths .get (existFilePath ))) {
273- dataset .removeFile (datasetFile );
274- datasetFileRepository .removeById (datasetFile .getId ());
275- }
276- }
277- }
278- }
279- }
280-
281237 private List <String > getFilePaths (String dataSourceId ) {
282238 CollectionTaskDetailResponse taskDetail = collectionTaskClient .getTaskDetail (dataSourceId ).getData ();
283239 if (taskDetail == null ) {
284- log .warn ("获取归集任务详情失败,任务ID: {}" , dataSourceId );
240+ log .warn ("Fail to get collection task detail, task ID: {}" , dataSourceId );
241+ return Collections .emptyList ();
242+ }
243+ Path targetPath = Paths .get (taskDetail .getTargetPath ());
244+ if (!Files .exists (targetPath ) || !Files .isDirectory (targetPath )) {
245+ log .warn ("Target path not exists or is not a directory: {}" , taskDetail .getTargetPath ());
246+ return Collections .emptyList ();
247+ }
248+
249+ try (Stream <Path > paths = Files .walk (targetPath , 1 )) {
250+ return paths
251+ .filter (Files ::isRegularFile ) // 只保留文件,排除目录
252+ .map (Path ::toString ) // 转换为字符串路径
253+ .collect (Collectors .toList ());
254+ } catch (IOException e ) {
255+ log .error ("Fail to scan directory: {}" , targetPath , e );
285256 return Collections .emptyList ();
286257 }
287- log .info ("获取到归集任务详情: {}" , taskDetail );
288- return Collections .singletonList (taskDetail .getTargetPath ());
289258 }
290259}
0 commit comments