# Ensure that Operations are aborted when MapTaskExecutor is closed. Add tests around setup/teardown of DoFns #36631
```diff
@@ -105,11 +105,32 @@ public DataflowMapTaskExecutor create(
     Networks.replaceDirectedNetworkNodes(
         network, createOutputReceiversTransform(stageName, counterSet));

-    // Swap out all the ParallelInstruction nodes with Operation nodes
-    Networks.replaceDirectedNetworkNodes(
-        network,
-        createOperationTransformForParallelInstructionNodes(
-            stageName, network, options, readerFactory, sinkFactory, executionContext));
+    // Swap out all the ParallelInstruction nodes with Operation nodes. While updating the network,
+    // we keep track of
+    // the created Operations so that if an exception is encountered we can properly abort started
+    // operations.
+    ArrayList<Operation> createdOperations = new ArrayList<>();
+    try {
+      Networks.replaceDirectedNetworkNodes(
+          network,
+          createOperationTransformForParallelInstructionNodes(
+              stageName,
+              network,
+              options,
+              readerFactory,
+              sinkFactory,
+              executionContext,
+              createdOperations));
+    } catch (RuntimeException exn) {
+      for (Operation o : createdOperations) {
+        try {
+          o.abort();
```
> **Contributor:** We are now evaluating teardown for all ParDos when one throws. Given the change, would it be straightforward to also fix the case where the DoFn finished normally (like changing …

> **Contributor (Author):** This is during creation of the executor: if there is no error creating it, we don't want to abort here, because the executor is then returned and reused many times. If we want to ensure we tear down DoFns, I think we need some timeout on the internal cache of DoFns in DoFnInstanceManagers.java.
```diff
+        } catch (Exception exn2) {
+          exn.addSuppressed(exn2);
+        }
+      }
+      throw exn;
+    }

     // Collect all the operations within the network and attach all the operations as receivers
     // to preceding output receivers.
```
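The cleanup added in this hunk follows the standard partial-construction idiom: remember every Operation created so far, and on failure abort each one, attaching abort failures to the original exception via `addSuppressed` so neither error is lost. A minimal, self-contained sketch of the same idiom, where `Resource` and `create` are hypothetical stand-ins for `Operation` and the node transform:

```java
import java.util.ArrayList;
import java.util.List;

// Hypothetical stand-in for Operation: something that may need to be aborted.
interface Resource {
  void abort() throws Exception;
}

final class PartialCreationExample {
  // Create a resource per spec; if any creation throws, abort the ones already
  // created and rethrow the original exception with abort failures suppressed.
  static List<Resource> createAll(List<String> specs) {
    ArrayList<Resource> created = new ArrayList<>();
    try {
      for (String spec : specs) {
        created.add(create(spec)); // may throw part-way through the loop
      }
      return created;
    } catch (RuntimeException exn) {
      for (Resource r : created) {
        try {
          r.abort();
        } catch (Exception exn2) {
          exn.addSuppressed(exn2); // keep the creation failure primary
        }
      }
      throw exn;
    }
  }

  private static Resource create(String spec) {
    if (spec.isEmpty()) {
      throw new IllegalArgumentException("empty spec");
    }
    return () -> System.out.println("aborted " + spec);
  }
}
```

Keeping the original exception primary matters here: the abort calls are best-effort cleanup, and suppressing their failures preserves the root cause for the caller.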
|
|
```diff
@@ -144,7 +165,8 @@ Function<Node, Node> createOperationTransformForParallelInstructionNodes(
       final PipelineOptions options,
       final ReaderFactory readerFactory,
       final SinkFactory sinkFactory,
-      final DataflowExecutionContext<?> executionContext) {
+      final DataflowExecutionContext<?> executionContext,
+      final List<Operation> createdOperations) {

     return new TypeSafeNodeFunction<ParallelInstructionNode>(ParallelInstructionNode.class) {
       @Override
```
|
```diff
@@ -156,27 +178,31 @@ public Node typedApply(ParallelInstructionNode node) {
                 instruction.getOriginalName(),
                 instruction.getSystemName(),
                 instruction.getName());
+        OperationNode result;
         try {
           DataflowOperationContext context = executionContext.createOperationContext(nameContext);
           if (instruction.getRead() != null) {
-            return createReadOperation(
-                network, node, options, readerFactory, executionContext, context);
+            result =
+                createReadOperation(
+                    network, node, options, readerFactory, executionContext, context);
           } else if (instruction.getWrite() != null) {
-            return createWriteOperation(node, options, sinkFactory, executionContext, context);
+            result = createWriteOperation(node, options, sinkFactory, executionContext, context);
           } else if (instruction.getParDo() != null) {
-            return createParDoOperation(network, node, options, executionContext, context);
+            result = createParDoOperation(network, node, options, executionContext, context);
           } else if (instruction.getPartialGroupByKey() != null) {
-            return createPartialGroupByKeyOperation(
-                network, node, options, executionContext, context);
+            result =
+                createPartialGroupByKeyOperation(network, node, options, executionContext, context);
           } else if (instruction.getFlatten() != null) {
-            return createFlattenOperation(network, node, context);
+            result = createFlattenOperation(network, node, context);
           } else {
             throw new IllegalArgumentException(
                 String.format("Unexpected instruction: %s", instruction));
           }
         } catch (Exception e) {
           throw new RuntimeException(e);
         }
+        createdOperations.add(result.getOperation());
+        return result;
       }
     };
   }
```
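This hunk threads a caller-owned list through the transform so each successfully created `OperationNode` is recorded just before it is returned; a node whose creation throws never reaches the list, so the abort loop in the first hunk only ever touches fully constructed Operations. A generic sketch of that out-parameter pattern (the names are illustrative, not from the PR):

```java
import java.util.ArrayList;
import java.util.List;
import java.util.function.Function;

final class RecordingTransformExample {
  // The returned function records every value it successfully produces into
  // the caller-owned list, mirroring how createdOperations is threaded through
  // createOperationTransformForParallelInstructionNodes above.
  static Function<String, Integer> parsingTransform(List<Integer> created) {
    return s -> {
      Integer result = Integer.parseInt(s); // may throw NumberFormatException
      created.add(result); // record only after successful creation
      return result;
    };
  }

  public static void main(String[] args) {
    List<Integer> created = new ArrayList<>();
    Function<String, Integer> f = parsingTransform(created);
    try {
      for (String s : new String[] {"1", "2", "oops"}) {
        f.apply(s);
      }
    } catch (NumberFormatException e) {
      // Only "1" and "2" were created; they are the ones needing cleanup.
      System.out.println("created before failure: " + created);
    }
  }
}
```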
|
|
```diff
@@ -328,7 +354,6 @@ public Node typedApply(InstructionOutputNode input) {
         Coder<?> coder =
             CloudObjects.coderFromCloudObject(CloudObject.fromSpec(cloudOutput.getCodec()));
-
         @SuppressWarnings("unchecked")
         ElementCounter outputCounter =
             new DataflowOutputCounter(
                 cloudOutput.getName(),
```
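The PR title also mentions tests around setup/teardown of DoFns; those tests are not part of the hunks shown here. As a rough illustration of what such a test might instrument, here is a hedged sketch of a DoFn that counts its lifecycle calls (`LifecycleRecordingDoFn` and its counters are hypothetical, not the PR's actual test code):

```java
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.beam.sdk.transforms.DoFn;

// Hypothetical DoFn that records lifecycle calls. A runner-level test would
// process elements (including one that throws) and assert that every @Setup
// is eventually balanced by a @Teardown.
class LifecycleRecordingDoFn extends DoFn<String, String> {
  static final AtomicInteger setUps = new AtomicInteger();
  static final AtomicInteger tearDowns = new AtomicInteger();

  @Setup
  public void setup() {
    setUps.incrementAndGet();
  }

  @ProcessElement
  public void processElement(ProcessContext c) {
    if ("boom".equals(c.element())) {
      throw new RuntimeException("simulated processing failure");
    }
    c.output(c.element());
  }

  @Teardown
  public void teardown() {
    tearDowns.incrementAndGet();
  }
}
```

Per the author's comment above, DoFn instances may be cached and reused across work items, so such a test has to account for teardown happening lazily rather than immediately after each bundle.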