|
30 | 30 | from fedlearner.common import fl_logging |
31 | 31 | from fedlearner.common import trainer_master_service_pb2 as tm_pb |
32 | 32 | from fedlearner.common.common import convert_time_string_to_datetime |
| 33 | +from fedlearner.common.common import end_with_valid_date |
| 34 | +from fedlearner.common.common import get_process_dates |
33 | 35 | from fedlearner.data_join.data_block_visitor import DataBlockVisitor |
34 | 36 | from fedlearner.trainer.utils import match_date |
35 | 37 |
|
@@ -351,22 +353,41 @@ def __init__(self, |
351 | 353 | if end_date: |
352 | 354 | end_date = convert_time_string_to_datetime(str(end_date)) |
353 | 355 | datablocks = [] |
354 | | - for dirname, _, filenames in tf.io.gfile.walk(data_path): |
355 | | - for filename in filenames: |
356 | | - if not fnmatch(os.path.join(dirname, filename), wildcard): |
| 356 | + if start_date and not end_with_valid_date(data_path): |
| 357 | + process_dates = get_process_dates(start_date, end_date) |
| 358 | + miss_dates = [] |
| 359 | + for process_date in process_dates: |
| 360 | + dir_path = os.path.join(data_path, process_date) |
| 361 | + if not tf.io.gfile.exists(dir_path): |
| 362 | + miss_dates.append(process_date) |
357 | 363 | continue |
358 | | - subdirname = os.path.relpath(dirname, data_path) |
359 | | - try: |
360 | | - cur_date = datetime.strptime(subdirname, '%Y%m%d') |
361 | | - if not match_date(cur_date, start_date, end_date): |
| 364 | + for _, _, filenames in tf.io.gfile.walk(dir_path): |
| 365 | + for filename in filenames: |
| 366 | + if not fnmatch(os.path.join(dir_path, filename), wildcard): |
| 367 | + continue |
| 368 | + block_id = os.path.join(process_date, filename) |
| 369 | + datablock = _RawDataBlock( |
| 370 | + id=block_id, data_path=os.path.join(dir_path, filename), |
| 371 | + start_time=None, end_time=None, type=tm_pb.JOINED) |
| 372 | + datablocks.append(datablock) |
| 373 | + fl_logging.info('miss_dates: [%s]', ",".join(miss_dates)) |
| 374 | + else: |
| 375 | + for dirname, _, filenames in tf.io.gfile.walk(data_path): |
| 376 | + for filename in filenames: |
| 377 | + if not fnmatch(os.path.join(dirname, filename), wildcard): |
362 | 378 | continue |
363 | | - except Exception: |
364 | | - fl_logging.info('subdirname is not the format of time') |
365 | | - block_id = os.path.join(subdirname, filename) |
366 | | - datablock = _RawDataBlock( |
367 | | - id=block_id, data_path=os.path.join(dirname, filename), |
368 | | - start_time=None, end_time=None, type=tm_pb.JOINED) |
369 | | - datablocks.append(datablock) |
| 379 | + subdirname = os.path.relpath(dirname, data_path) |
| 380 | + try: |
| 381 | + cur_date = datetime.strptime(subdirname, '%Y%m%d') |
| 382 | + if not match_date(cur_date, start_date, end_date): |
| 383 | + continue |
| 384 | + except Exception: |
| 385 | + fl_logging.info('subdirname is not the format of time') |
| 386 | + block_id = os.path.join(subdirname, filename) |
| 387 | + datablock = _RawDataBlock( |
| 388 | + id=block_id, data_path=os.path.join(dirname, filename), |
| 389 | + start_time=None, end_time=None, type=tm_pb.JOINED) |
| 390 | + datablocks.append(datablock) |
370 | 391 | datablocks.sort(key=lambda x: x.id) |
371 | 392 |
|
372 | 393 | fl_logging.info("create DataVisitor by local_data_path: %s", |
|
0 commit comments