Skip to content

Commit 9236852

Browse files
committed
[GR-35375] Fix memory leaks in multiprocessing.
PullRequest: graalpython/2052
2 parents: 84126c7 + 7c2dc99; commit: 9236852

File tree

2 files changed

+31
-18
lines changed

2 files changed

+31
-18
lines changed

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/MultiprocessingModuleBuiltins.java

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -214,6 +214,12 @@ PTuple waittid(long id, @SuppressWarnings("unused") int options) {
214214
}
215215

216216
PythonContext.ChildContextData data = multiprocessing.getChildContextData(tid);
217+
/*
218+
* The assumption made here is that once _waittid returns the exit code, the caller
219+
* caches it and never calls _waittid again, so we do not need to keep the data and can
220+
* clean it. See popen_truffleprocess that calls the _waittid builtin.
221+
*/
222+
multiprocessing.removeChildContextData(tid);
217223
return factory().createTuple(new Object[]{id, data.wasSignaled() ? data.getExitCode() : 0, data.getExitCode()});
218224
}
219225
}
@@ -231,7 +237,7 @@ Object terminate(long id, PInt sig) {
231237
try {
232238
data.awaitRunning();
233239
TruffleContext truffleCtx = data.getTruffleContext();
234-
if (!truffleCtx.isCancelling() && data.compareAndSetExiting(false, true)) {
240+
if (truffleCtx != null && !truffleCtx.isCancelling() && data.compareAndSetExiting(false, true)) {
235241
LOGGER.fine("terminating spawned thread");
236242
data.setSignaled(sig.intValue());
237243
truffleCtx.closeCancelled(this, "_terminate_spawned_thread");

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/PythonContext.java

Lines changed: 24 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -140,6 +140,7 @@
140140
import com.oracle.truffle.api.nodes.Node;
141141
import com.oracle.truffle.api.source.Source;
142142
import com.oracle.truffle.api.utilities.CyclicAssumption;
143+
import com.oracle.truffle.api.utilities.TruffleWeakReference;
143144
import com.oracle.truffle.llvm.api.Toolchain;
144145

145146
public final class PythonContext extends Python3Core {
@@ -538,12 +539,16 @@ public Map<String, Object> getCodecErrorRegistry() {
538539
public static final class ChildContextData {
539540
private int exitCode = 0;
540541
private boolean signaled;
541-
@CompilationFinal private TruffleContext ctx;
542-
@CompilationFinal private PythonContext parentCtx;
542+
private final PythonContext parentCtx;
543+
private TruffleWeakReference<TruffleContext> ctx;
543544

544545
private final AtomicBoolean exiting = new AtomicBoolean(false);
545546
private final CountDownLatch running = new CountDownLatch(1);
546547

548+
public ChildContextData(PythonContext parentCtx) {
549+
this.parentCtx = parentCtx;
550+
}
551+
547552
public void setExitCode(int exitCode) {
548553
this.exitCode = exitCode;
549554
}
@@ -563,16 +568,12 @@ public boolean wasSignaled() {
563568

564569
private void setTruffleContext(TruffleContext ctx) {
565570
assert this.ctx == null;
566-
this.ctx = ctx;
571+
assert ctx != null;
572+
this.ctx = new TruffleWeakReference<>(ctx);
567573
}
568574

569575
public TruffleContext getTruffleContext() {
570-
return ctx;
571-
}
572-
573-
private void setParentContext(PythonContext parentCtx) {
574-
assert this.parentCtx == null;
575-
this.parentCtx = parentCtx;
576+
return ctx.get();
576577
}
577578

578579
public void awaitRunning() throws InterruptedException {
@@ -786,9 +787,15 @@ public Semaphore removeNamedSemaphore(String name) {
786787
return namedSemaphores.remove(name);
787788
}
788789

789-
private final Map<Long, Thread> childContextThreads = new ConcurrentHashMap<>();
790+
private final ConcurrentHashMap<Long, Thread> childContextThreads = new ConcurrentHashMap<>();
790791

791-
private final Map<Long, ChildContextData> childContextData = new ConcurrentHashMap<>();
792+
/**
793+
* {@code ChildContextData} outlives its own context, because the parent needs to be able to
794+
* access the exit code even after the child context was closed and thread disposed. We
795+
* dispose the mapping to {@code ChildContextData} when the Python code (our internal Python
796+
* code) asks for the exit code for the first time after the child exited.
797+
*/
798+
private final ConcurrentHashMap<Long, ChildContextData> childContextData = new ConcurrentHashMap<>();
792799

793800
@TruffleBoundary
794801
public Thread getChildContextThread(long tid) {
@@ -810,6 +817,11 @@ public ChildContextData getChildContextData(long tid) {
810817
return childContextData.get(tid);
811818
}
812819

820+
@TruffleBoundary
821+
public void removeChildContextData(long tid) {
822+
childContextData.remove(tid);
823+
}
824+
813825
@TruffleBoundary
814826
public void putChildContextData(long id, ChildContextData data) {
815827
childContextData.put(id, data);
@@ -853,12 +865,7 @@ public SharedMultiprocessingData getSharedMultiprocessingData() {
853865
}
854866

855867
public long spawnTruffleContext(int fd, int sentinel, int[] fdsToKeep) {
856-
ChildContextData data = new ChildContextData();
857-
if (!isChildContext()) {
858-
data.setParentContext(this);
859-
} else {
860-
data.setParentContext(childContextData.parentCtx);
861-
}
868+
ChildContextData data = new ChildContextData(isChildContext() ? childContextData.parentCtx : this);
862869

863870
Builder builder = data.parentCtx.env.newContextBuilder().config(PythonContext.CHILD_CONTEXT_DATA, data);
864871
Thread thread = data.parentCtx.env.createThread(new ChildContextThread(fd, sentinel, data, builder));

0 commit comments

Comments
 (0)