From 6d580e06325ae91d476298ba96f580de73bb6f34 Mon Sep 17 00:00:00 2001
From: Ilija Tovilo
Date: Tue, 7 Oct 2025 19:50:04 +0200
Subject: [PATCH] Mark strings as IS_STR_VALID_UTF8 on-the-fly

Alternative to GH-10870.

Use atomic writes for adding the IS_STR_VALID_UTF8 flag to UTF-8-verified
interned strings in ext-mbstring. x86 and other architectures guarantee atomic
writes/reads for aligned variables up to size_t, which we already rely on,
particularly for zend_op.handler being swapped out in the JIT. The atomic write
is only needed here to not drop any other newly written bits (which there
currently aren't any of).

We use GCC and sync atomics because they don't require annotating the modified
variable with the C11 _Atomic keyword.
---
 ext/mbstring/mbstring.c         | 34 +++++++++++++++++++++++++++------
 ext/opcache/zend_shared_alloc.c |  2 +-
 ext/opcache/zend_shared_alloc.h |  2 +-
 3 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c
index 34d759ae30e4b..c332b0f106bc5 100644
--- a/ext/mbstring/mbstring.c
+++ b/ext/mbstring/mbstring.c
@@ -67,6 +67,28 @@
 
 #include "zend_simd.h"
 
+#include "ext/opcache/zend_shared_alloc.h"
+#include "ext/opcache/ZendAccelerator.h"
+
+static void mark_zstr_as_utf8(zend_string *s)
+{
+	if (!ZSTR_IS_INTERNED(s)) {
+		GC_ADD_FLAGS(s, IS_STR_VALID_UTF8);
+		return;
+	}
+
+	/* Interned string: OR the flag in atomically so that other newly written bits are not dropped. We don't use zend_atomic.h as we're writing to a non-_Atomic field. */
+#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 7) || (__GNUC__ > 4)
+	SHM_UNPROTECT();
+	__atomic_or_fetch(&GC_TYPE_INFO(s), IS_STR_VALID_UTF8 << GC_FLAGS_SHIFT, __ATOMIC_SEQ_CST);
+	SHM_PROTECT();
+#elif defined(__GNUC__)
+	SHM_UNPROTECT();
+	__sync_or_and_fetch(&GC_TYPE_INFO(s), IS_STR_VALID_UTF8 << GC_FLAGS_SHIFT);
+	SHM_PROTECT();
+#endif
+}
+
 /* }}} */
 
 /* {{{ prototypes */
@@ -2263,8 +2285,8 @@ PHP_FUNCTION(mb_substr_count)
 		} else {
 			unsigned int num_errors = 0;
 			haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
-			if (!num_errors && !ZSTR_IS_INTERNED(haystack)) {
-				GC_ADD_FLAGS(haystack, IS_STR_VALID_UTF8);
+			if (!num_errors) {
+				mark_zstr_as_utf8(haystack);
 			}
 		}
 
@@ -2273,8 +2295,8 @@ PHP_FUNCTION(mb_substr_count)
 		} else {
 			unsigned int num_errors = 0;
 			needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
-			if (!num_errors && !ZSTR_IS_INTERNED(needle)) {
-				GC_ADD_FLAGS(needle, IS_STR_VALID_UTF8);
+			if (!num_errors) {
+				mark_zstr_as_utf8(needle);
 			}
 		}
 	} else {
@@ -5567,8 +5589,8 @@ static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encodin
 			return true;
 		}
 		bool result = mb_fast_check_utf8(str);
-		if (result && !ZSTR_IS_INTERNED(str)) {
-			GC_ADD_FLAGS(str, IS_STR_VALID_UTF8);
+		if (result) {
+			mark_zstr_as_utf8(str);
 		}
 		return result;
 	} else {
diff --git a/ext/opcache/zend_shared_alloc.c b/ext/opcache/zend_shared_alloc.c
index 80ef36b8749d9..c1d41e756244a 100644
--- a/ext/opcache/zend_shared_alloc.c
+++ b/ext/opcache/zend_shared_alloc.c
@@ -628,7 +628,7 @@ const char *zend_accel_get_shared_model(void)
 	return g_shared_model;
 }
 
-void zend_accel_shared_protect(bool protected)
+ZEND_API void zend_accel_shared_protect(bool protected)
 {
 #ifdef HAVE_MPROTECT
 	int i;
diff --git a/ext/opcache/zend_shared_alloc.h b/ext/opcache/zend_shared_alloc.h
index 108349b13f816..fcd00363a4ba0 100644
--- a/ext/opcache/zend_shared_alloc.h
+++ b/ext/opcache/zend_shared_alloc.h
@@ -200,7 +200,7 @@ const char *zend_accel_get_shared_model(void);
  * @param protected true to protect shared memory (read-only), false
  * to unprotect shared memory (writable)
  */
-void zend_accel_shared_protect(bool protected);
+ZEND_API void zend_accel_shared_protect(bool protected);
 
 #ifdef USE_MMAP
 extern const zend_shared_memory_handlers zend_alloc_mmap_handlers;