@@ -507,10 +507,10 @@ public void notifyCheckpointStart(SubtaskKey subtaskKey, long checkpointId) {
507
507
managedSharedStateDirHandles .computeIfPresent (
508
508
subtaskKey ,
509
509
(k , v ) -> {
510
- v .increaseRefCountWhenCheckpointStart (checkpointId );
510
+ v .addReferenceWhenCheckpointStart (checkpointId );
511
511
return v ;
512
512
});
513
- managedExclusiveStateDirHandle .increaseRefCountWhenCheckpointStart (checkpointId );
513
+ managedExclusiveStateDirHandle .addReferenceWhenCheckpointStart (checkpointId );
514
514
}
515
515
}
516
516
@@ -534,10 +534,10 @@ public void notifyCheckpointAborted(SubtaskKey subtaskKey, long checkpointId) th
534
534
managedSharedStateDirHandles .computeIfPresent (
535
535
subtaskKey ,
536
536
(k , v ) -> {
537
- v .decreaseRefCountWhenCheckpointAbort (checkpointId );
537
+ v .removeReferenceWhenCheckpointAbort (checkpointId );
538
538
return v ;
539
539
});
540
- managedExclusiveStateDirHandle .decreaseRefCountWhenCheckpointAbort (checkpointId );
540
+ managedExclusiveStateDirHandle .removeReferenceWhenCheckpointAbort (checkpointId );
541
541
}
542
542
543
543
synchronized (lock ) {
@@ -948,20 +948,22 @@ boolean isCheckpointDiscard(long checkpointId) {
948
948
}
949
949
950
950
/**
951
- * This class wrap DirectoryStreamStateHandle with reference count by ongoing checkpoint. If an
952
- * ongoing checkpoint which reference the directory handle complete, we will stop tracking the
953
- * handle, because the ownership of the handle is handover to JobManager.
951
+ * This class wraps DirectoryStreamStateHandle with references from ongoing checkpoints. If an
952
+ * ongoing checkpoint that references the directory handle completes or is subsumed, we stop
953
+ * tracking the handle, because ownership of the handle is handed over to the JobManager.
954
+ * JobManager acknowledges the handle and will clean up the directory when it is no longer
955
+ * needed.
954
956
*/
955
957
protected static class DirectoryHandleWithReferenceTrack {
956
958
957
959
private final DirectoryStreamStateHandle directoryHandle ;
958
- // reference count by ongoing checkpoint
959
- private final AtomicLong ongoingRefCount ;
960
+ // ids of the ongoing checkpoints that reference this directory handle
961
+ private final Set < Long > refCheckpointIds ;
960
962
private boolean tracking ;
961
963
962
964
DirectoryHandleWithReferenceTrack (DirectoryStreamStateHandle directoryHandle , boolean own ) {
963
965
this .directoryHandle = directoryHandle ;
964
- this .ongoingRefCount = new AtomicLong ( 0 );
966
+ this .refCheckpointIds = new HashSet <>( );
965
967
this .tracking = own ;
966
968
}
967
969
@@ -974,23 +976,23 @@ DirectoryStreamStateHandle getHandle() {
974
976
return directoryHandle ;
975
977
}
976
978
977
- void increaseRefCountWhenCheckpointStart (long checkpointId ) {
979
+ void addReferenceWhenCheckpointStart (long checkpointId ) {
978
980
if (tracking ) {
979
981
LOG .debug (
980
- "checkpoint:{} start, increase ref-count to file-merging managed shared dir : {}" ,
982
+ "checkpoint:{} start, add reference to file-merging managed shared dir : {}" ,
981
983
checkpointId ,
982
984
directoryHandle .getDirectory ());
983
- ongoingRefCount . incrementAndGet ( );
985
+ refCheckpointIds . add ( checkpointId );
984
986
}
985
987
}
986
988
987
- void decreaseRefCountWhenCheckpointAbort (long checkpointId ) {
989
+ void removeReferenceWhenCheckpointAbort (long checkpointId ) {
988
990
if (tracking ) {
989
991
LOG .debug (
990
- "checkpoint:{} aborted, decrease ref-count to file-merging managed shared dir : {}" ,
992
+ "checkpoint:{} aborted, remove reference to file-merging managed shared dir : {}" ,
991
993
checkpointId ,
992
994
directoryHandle .getDirectory ());
993
- ongoingRefCount . decrementAndGet ( );
995
+ refCheckpointIds . remove ( checkpointId );
994
996
}
995
997
}
996
998
@@ -1001,6 +1003,7 @@ void handoverOwnershipWhenCheckpointComplete(long checkpointId) {
1001
1003
checkpointId ,
1002
1004
directoryHandle .getDirectory ());
1003
1005
tracking = false ;
1006
+ refCheckpointIds .clear ();
1004
1007
}
1005
1008
}
1006
1009
@@ -1011,11 +1014,12 @@ void handoverOwnershipWhenCheckpointSubsumed(long checkpointId) {
1011
1014
checkpointId ,
1012
1015
directoryHandle .getDirectory ());
1013
1016
tracking = false ;
1017
+ refCheckpointIds .clear ();
1014
1018
}
1015
1019
}
1016
1020
1017
1021
void tryCleanupQuietly () {
1018
- if (tracking && ongoingRefCount . get () == 0 && directoryHandle != null ) {
1022
+ if (tracking && refCheckpointIds . isEmpty () && directoryHandle != null ) {
1019
1023
try {
1020
1024
directoryHandle .discardState ();
1021
1025
} catch (Exception e ) {
0 commit comments