34
34
import Queue as queue
35
35
else :
36
36
import queue
37
- # NOTE: [ avoid hanging ] This value is used in getting data from another process
38
- MP_CHECK_TIMEOUT = 10
37
+ # NOTE: [ avoid hanging ] These values are used when getting data from another process
38
+ QUEUE_GET_TIMEOUT = 5
39
+ MAX_GET_FAILED_TIME = 12
39
40
40
41
__all__ = ['PyReader' , 'DataLoader' ]
41
42
@@ -485,6 +486,17 @@ def __handler__(signum, frame):
485
486
486
487
signal .signal (signal .SIGCHLD , __handler__ )
487
488
489
def _exit_thread_expectedly(self):
    """Stop the reader thread on the non-error exit path.

    Marks the thread-done event as set, then closes the blocking queue
    and the data queue, in that order.
    """
    self._thread_done_event.set()
    # close() (rather than kill()) is used for the blocking queue here;
    # the error path uses kill() instead.
    self._blocking_queue.close()
    self._data_queue.close()
493
+
494
def _exit_thread_unexpectedly(self):
    """Stop the reader thread on the error path and log the failure.

    Marks the thread-done event as set, kills the blocking queue, closes
    the data queue, and then emits an error-level log record.
    """
    self._thread_done_event.set()
    # kill() (rather than close()) is used for the blocking queue on this
    # path; the data queue is still closed normally.
    self._blocking_queue.kill()
    self._data_queue.close()
    logging.error("DataLoader reader thread raised an exception!")
499
+
488
500
def _reader_process_loop (self ):
489
501
try :
490
502
# set signal handler
@@ -506,17 +518,29 @@ def _reader_process_loop(self):
506
518
six .reraise (* sys .exc_info ())
507
519
508
520
def _reader_thread_loop_with_process (self ):
521
+ get_sample_try_time = 0
509
522
while not self ._thread_done_event .is_set ():
510
523
try :
511
524
# NOTE: [ avoid hanging ] Even with carefully designed data dependencies
512
525
# (i.e., a put() always corresponding to a get()), hanging on get() can
513
526
# still happen when data in queue is corrupted (e.g., due to
514
527
# Queue.cancel_join_thread or unexpected exit). So we set a timeout whenever
515
528
# we try to get data from `data_queue`
516
- sample = self ._data_queue .get (timeout = MP_CHECK_TIMEOUT )
529
+ sample = self ._data_queue .get (timeout = QUEUE_GET_TIMEOUT )
530
+ get_sample_try_time = 0
517
531
except queue .Empty :
518
- self ._thread_done_event .set ()
519
- logging .error ("The reader has not read data for a long time." )
532
+ get_sample_try_time += 1
533
+ if get_sample_try_time > MAX_GET_FAILED_TIME :
534
+ self ._exit_thread_unexpectedly ()
535
+ raise RuntimeError (
536
+ "DataLoader reader thread has not read data for a long time (60s)."
537
+ )
538
+ else :
539
+ # NOTE: [ avoid failing quickly ] When the reader child process is heavily loaded,
540
+ # it may not have had enough time to put data in the queue by the time the main process
541
+ # starts trying to get data from it. A single failed read should therefore not be
542
+ # counted as a fatal error; a certain number of attempts should be allowed.
543
+ continue
520
544
521
545
if not self ._thread_done_event .is_set ():
522
546
if sample is not None :
@@ -532,20 +556,10 @@ def _reader_thread_loop_with_process(self):
532
556
if not self ._blocking_queue .push (array ):
533
557
self ._blocking_queue .close ()
534
558
except :
535
- self ._thread_done_event .set ()
536
- self ._blocking_queue .kill ()
537
- self ._data_queue .close ()
538
- logging .warning (
539
- "DygraphDataLoader reader thread raised an exception."
540
- )
559
+ self ._exit_thread_unexpectedly ()
541
560
six .reraise (* sys .exc_info ())
542
561
else :
543
- self ._thread_done_event .set ()
544
- self ._blocking_queue .close ()
545
- self ._data_queue .close ()
546
- else :
547
- self ._blocking_queue .kill ()
548
- self ._data_queue .close ()
562
+ self ._exit_thread_expectedly ()
549
563
550
564
def _reader_thread_loop (self ):
551
565
try :
0 commit comments