Skip to content

Commit 3a77b1b

Browse files
committed
[GR-55037] Fixes for tokenizers
PullRequest: graalpython/3390
2 parents fe06e1f + efe7ed8 commit 3a77b1b

File tree

5 files changed

+80
-79
lines changed

5 files changed

+80
-79
lines changed

graalpython/com.oracle.graal.python.cext/src/capi.c

Lines changed: 4 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -853,7 +853,7 @@ void initialize_hashes();
853853
void _PyFloat_InitState(PyInterpreterState* state);
854854

855855
Py_LOCAL_SYMBOL TruffleContext* TRUFFLE_CONTEXT;
856-
Py_LOCAL_SYMBOL int graalpy_finalizing;
856+
Py_LOCAL_SYMBOL int32_t graalpy_finalizing;
857857

858858
PyAPI_FUNC(void) initialize_graal_capi(TruffleEnv* env, void **builtin_closures) {
859859
clock_t t = clock();
@@ -880,58 +880,12 @@ PyAPI_FUNC(void) initialize_graal_capi(TruffleEnv* env, void **builtin_closures)
880880
PyTruffle_Log(PY_TRUFFLE_LOG_FINE, "initialize_graal_capi: %fs", ((double) (clock() - t)) / CLOCKS_PER_SEC);
881881
}
882882

883-
/*
884-
This is a workaround for C++ modules, namely PyTorch, that declare global/static variables with destructors that call
885-
_Py_DECREF. The destructors get called by libc during exit during which we cannot make upcalls as that would segfault.
886-
So we rebind them to no-ops when exiting.
887-
*/
888-
Py_ssize_t nop_GraalPy_get_PyObject_ob_refcnt(PyObject* obj) {
889-
return IMMORTAL_REFCNT; // large dummy refcount
890-
}
891-
892-
void nop_GraalPy_set_PyObject_ob_refcnt(PyObject* obj, Py_ssize_t refcnt) {
893-
// do nothing
894-
}
895-
896-
void nop_GraalPyTruffle_NotifyRefCount(PyObject* obj, Py_ssize_t refcnt) {
897-
// do nothing
898-
}
899-
900-
void nop_GraalPyTruffle_BulkNotifyRefCount(void* ptrs, int count) {
901-
// do nothing
902-
}
903-
904-
/*
905-
* This array contains pairs of variable address and "reset value".
906-
* The variable location is usually the address of a function pointer variable
907-
* and the reset value is a new value to set at VM shutdown.
908-
* For further explanation why this is required, see Java method
909-
* 'CApiContext.ensureCapiWasLoaded'.
910-
*
911-
* Array format: [ var_addr, reset_val, var_addr1, reset_val1, ..., NULL ]
912-
*
913-
* ATTENTION: If the structure of the array's content is changed, method
914-
* 'CApiContext.addNativeFinalizer' *MUST BE* adopted.
915-
*
916-
* ATTENTION: the array is expected to be NULL-terminated !
917-
*
918-
*/
919-
static int64_t reset_ptrs[] = {
920-
&graalpy_finalizing, 1,
921-
&GraalPy_get_PyObject_ob_refcnt, nop_GraalPy_get_PyObject_ob_refcnt,
922-
&GraalPy_set_PyObject_ob_refcnt, nop_GraalPy_set_PyObject_ob_refcnt,
923-
&GraalPyTruffle_NotifyRefCount, nop_GraalPyTruffle_NotifyRefCount,
924-
&GraalPyTruffle_BulkNotifyRefCount, nop_GraalPyTruffle_NotifyRefCount,
925-
/* sentinel (required) */
926-
NULL
927-
};
928-
929883
/*
930884
* This function is called from Java during C API initialization to get the
931-
* pointer to array 'reset_pts'.
885+
* pointer `graalpy_finalizing`.
932886
*/
933-
PyAPI_FUNC(int64_t *) GraalPy_get_finalize_capi_pointer_array() {
934-
return reset_ptrs;
887+
PyAPI_FUNC(int32_t *) GraalPy_get_finalize_capi_pointer() {
888+
return &graalpy_finalizing;
935889
}
936890

937891
#if ((__linux__ && __GNU_LIBRARY__) || __APPLE__)

graalpython/com.oracle.graal.python.cext/src/object.c

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -169,9 +169,15 @@ void
169169
_Py_DecRef(PyObject *o)
170170
{
171171
// GraalPy change: different implementation
172+
if (UNLIKELY(graalpy_finalizing)) {
173+
/*
174+
* Many native libraries in C++ do decrefs in global object destructors that run
175+
* after the VM has exitted and we have deallocated the native wrappers.
176+
*/
177+
return;
178+
}
172179
const Py_ssize_t refcnt = Py_REFCNT(o);
173-
if (refcnt != IMMORTAL_REFCNT)
174-
{
180+
if (refcnt != IMMORTAL_REFCNT) {
175181
const Py_ssize_t updated_refcnt = refcnt - 1;
176182
Py_SET_REFCNT(o, updated_refcnt);
177183
if (updated_refcnt != 0) {

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/cext/capi/CApiContext.java

Lines changed: 12 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -834,11 +834,11 @@ public static CApiContext ensureCapiWasLoaded(Node node, PythonContext context,
834834
* is terminated by a signal, the context exit is skipped. For that case we set
835835
* up the shutdown hook.
836836
*/
837-
Object finalizeFunction = U.readMember(capiLibrary, "GraalPy_get_finalize_capi_pointer_array");
837+
Object finalizeFunction = U.readMember(capiLibrary, "GraalPy_get_finalize_capi_pointer");
838838
Object finalizeSignature = env.parseInternal(Source.newBuilder(J_NFI_LANGUAGE, "():POINTER", "exec").build()).call();
839-
Object resetFunctionPointerArray = SignatureLibrary.getUncached().call(finalizeSignature, finalizeFunction);
839+
Object finalizingPointer = SignatureLibrary.getUncached().call(finalizeSignature, finalizeFunction);
840840
try {
841-
cApiContext.addNativeFinalizer(env, resetFunctionPointerArray);
841+
cApiContext.addNativeFinalizer(env, finalizingPointer);
842842
} catch (RuntimeException e) {
843843
// This can happen when other languages restrict multithreading
844844
LOGGER.warning(() -> "didn't register a native finalizer due to: " + e.getMessage());
@@ -865,33 +865,18 @@ public static CApiContext ensureCapiWasLoaded(Node node, PythonContext context,
865865
}
866866

867867
/**
868-
* Registers a VM shutdown hook, that resets certain C API function to NOP functions to avoid
869-
* segfaults due to improper Python context shutdown. In particular, the shutdown hook reads
870-
* pairs of {@code variable address} and {@code reset value} provided in a native array and
871-
* writes the {@code reset value} to the {@code variable address}. Currently, this is used to
872-
* change the incref and decref upcall functions which would crash if they are called after the
873-
* Python context was closed. The format of the array is:
874-
*
875-
* <pre>
876-
* [ var_addr, reset_val, var_addr1, reset_val1, ..., NULL ]
877-
* </pre>
868+
* Registers a VM shutdown hook, that sets {@code graalpy_finalizing} variable to let the C side
869+
* know that it's not safe to do upcalls and that native wrappers might have been deallocated.
870+
* We need to do it in a VM shutdown hook to make sure C atexit won't crash even if our context
871+
* finalization didn't run.
878872
*/
879-
private void addNativeFinalizer(Env env, Object resetFunctionPointerArray) {
873+
private void addNativeFinalizer(Env env, Object finalizingPointerObj) {
880874
final Unsafe unsafe = getContext().getUnsafe();
881-
InteropLibrary lib = InteropLibrary.getUncached(resetFunctionPointerArray);
882-
if (!lib.isNull(resetFunctionPointerArray) && lib.isPointer(resetFunctionPointerArray)) {
875+
InteropLibrary lib = InteropLibrary.getUncached(finalizingPointerObj);
876+
if (!lib.isNull(finalizingPointerObj) && lib.isPointer(finalizingPointerObj)) {
883877
try {
884-
long lresetFunctionPointerArray = lib.asPointer(resetFunctionPointerArray);
885-
nativeFinalizerRunnable = () -> {
886-
long resetFunctionPointerLocation;
887-
long curElemAddr = lresetFunctionPointerArray;
888-
while ((resetFunctionPointerLocation = unsafe.getLong(curElemAddr)) != 0) {
889-
curElemAddr += Long.BYTES;
890-
long replacingFunctionPointer = unsafe.getLong(curElemAddr);
891-
unsafe.putAddress(resetFunctionPointerLocation, replacingFunctionPointer);
892-
curElemAddr += Long.BYTES;
893-
}
894-
};
878+
long finalizingPointer = lib.asPointer(finalizingPointerObj);
879+
nativeFinalizerRunnable = () -> unsafe.putInt(finalizingPointer, 1);
895880
nativeFinalizerShutdownHook = env.newTruffleThreadBuilder(nativeFinalizerRunnable).build();
896881
Runtime.getRuntime().addShutdownHook(nativeFinalizerShutdownHook);
897882
} catch (UnsupportedMessageException e) {

graalpython/lib-graalpython/patches/tokenizers/tokenizers-0.13.3.patch

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,34 @@ index 6282c31..47e6b12 100644
2222

2323
[features]
2424
default = ["pyo3/extension-module"]
25+
diff --git a/src/lib.rs b/src/lib.rs
26+
index bf1adbd..89c34fe 100644
27+
--- a/src/lib.rs
28+
+++ b/src/lib.rs
29+
@@ -48,14 +48,16 @@ extern "C" fn child_after_fork() {
30+
pub fn tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
31+
let _ = env_logger::try_init_from_env("TOKENIZERS_LOG");
32+
33+
+ // GraalPy change: Disable the atfork warning. This triggers a ton of false positives when
34+
+ // jline calls stty and we don't support fork anyway
35+
// Register the fork callback
36+
- #[cfg(target_family = "unix")]
37+
- unsafe {
38+
- if !REGISTERED_FORK_CALLBACK {
39+
- libc::pthread_atfork(None, None, Some(child_after_fork));
40+
- REGISTERED_FORK_CALLBACK = true;
41+
- }
42+
- }
43+
+ // #[cfg(target_family = "unix")]
44+
+ // unsafe {
45+
+ // if !REGISTERED_FORK_CALLBACK {
46+
+ // libc::pthread_atfork(None, None, Some(child_after_fork));
47+
+ // REGISTERED_FORK_CALLBACK = true;
48+
+ // }
49+
+ // }
50+
51+
m.add_class::<tokenizer::PyTokenizer>()?;
52+
m.add_class::<tokenizer::PyAddedToken>()?;
2553
diff --git a/tokenizers-lib/src/models/bpe/trainer.rs b/tokenizers-lib/src/models/bpe/trainer.rs
2654
index 43ab848..55f95f8 100644
2755
--- a/tokenizers-lib/src/models/bpe/trainer.rs

graalpython/lib-graalpython/patches/tokenizers/tokenizers-0.15.0.patch

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,31 @@ index 51ac712..a7d1e9b 100644
2222

2323
[features]
2424
defaut = ["pyo3/extension-module"]
25+
diff --git a/bindings/python/src/lib.rs b/bindings/python/src/lib.rs
26+
index 8625944..a8ac19d 100644
27+
--- a/bindings/python/src/lib.rs
28+
+++ b/bindings/python/src/lib.rs
29+
@@ -50,14 +50,16 @@ extern "C" fn child_after_fork() {
30+
pub fn tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
31+
let _ = env_logger::try_init_from_env("TOKENIZERS_LOG");
32+
33+
+ // GraalPy change: Disable the atfork warning. This triggers a ton of false positives when
34+
+ // jline calls stty and we don't support fork anyway
35+
// Register the fork callback
36+
- #[cfg(target_family = "unix")]
37+
- unsafe {
38+
- if !REGISTERED_FORK_CALLBACK {
39+
- libc::pthread_atfork(None, None, Some(child_after_fork));
40+
- REGISTERED_FORK_CALLBACK = true;
41+
- }
42+
- }
43+
+ // #[cfg(target_family = "unix")]
44+
+ // unsafe {
45+
+ // if !REGISTERED_FORK_CALLBACK {
46+
+ // libc::pthread_atfork(None, None, Some(child_after_fork));
47+
+ // REGISTERED_FORK_CALLBACK = true;
48+
+ // }
49+
+ // }
50+
51+
m.add_class::<tokenizer::PyTokenizer>()?;
52+
m.add_class::<tokenizer::PyAddedToken>()?;

0 commit comments

Comments
 (0)