wip #112

Open · wants to merge 1 commit into master

37 changes: 31 additions & 6 deletions ext/dom/html_document.c
@@ -27,6 +27,7 @@
#include "namespace_compat.h"
#include "private_data.h"
#include "dom_properties.h"
#include "swar.h"
#include <Zend/zend_smart_string.h>
#include <lexbor/html/encoding.h>
#include <lexbor/encoding/encoding.h>
@@ -512,6 +513,30 @@ static bool dom_process_parse_chunk(
return true;
}

/* This seeks, using SWAR techniques, to the first non-ASCII byte in a UTF-8 input.
* Returns true if the entire input was consumed without encountering non-ASCII, false otherwise. */
static zend_always_inline bool dom_seek_utf8_non_ascii(const lxb_char_t **data, const lxb_char_t *end)
{
while (*data + sizeof(size_t) <= end) {
size_t bytes;
memcpy(&bytes, *data, sizeof(bytes));
/* If the top bit is set, it's not ASCII. */
if ((bytes & SWAR_REPEAT(0x80)) != 0) {
return false;
}
*data += sizeof(size_t);
}

while (*data < end) {
if (**data >= 0x80) {
return false;
}
(*data)++;
}

return true;
}

static bool dom_decode_encode_fast_path(
lexbor_libxml2_bridge_parse_context *ctx,
lxb_html_document_t *document,
@@ -527,13 +552,13 @@
const lxb_char_t *last_output = buf_ref;
while (buf_ref != buf_end) {
/* Fast path converts non-validated UTF-8 -> validated UTF-8 */
- if (decoding_encoding_ctx->decode.u.utf_8.need == 0 && *buf_ref < 0x80) {
+ if (decoding_encoding_ctx->decode.u.utf_8.need == 0) {
/* Fast path within the fast path: try to skip non-mb bytes in bulk if we are not in a state where we
- * need more UTF-8 bytes to complete a sequence.
- * It might be tempting to use SIMD here, but it turns out that this is less efficient because
- * we need to process the same byte multiple times sometimes when mixing ASCII with multibyte. */
- buf_ref++;
- continue;
+ * need more UTF-8 bytes to complete a sequence. */
+ if (dom_seek_utf8_non_ascii(&buf_ref, buf_end)) {
+ ZEND_ASSERT(buf_ref == buf_end);
+ break;
+ }
}
const lxb_char_t *buf_ref_backup = buf_ref;
lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(&decoding_encoding_ctx->decode, &buf_ref, buf_end);
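
Note on the fast path above: swar.h itself is not part of this diff, so the SWAR_* helpers it provides are not shown. A minimal sketch of the idea, assuming a conventional SWAR_REPEAT definition (the macro body and the ascii_prefix_length helper below are illustrative, not the PR's actual code):

#include <stddef.h>
#include <string.h>

/* Assumed definition: replicate a byte into every byte of a size_t,
 * e.g. SWAR_REPEAT(0x80) == 0x8080808080808080 on 64-bit targets. */
#define SWAR_REPEAT(byte) ((((size_t) -1) / 0xFF) * (byte))

/* Illustrative stand-alone version of the same scan as dom_seek_utf8_non_ascii():
 * returns the length of the leading ASCII-only prefix of the buffer. */
static size_t ascii_prefix_length(const unsigned char *data, size_t len)
{
    size_t i = 0;

    /* Word-at-a-time: any byte with its top bit set is non-ASCII. */
    while (i + sizeof(size_t) <= len) {
        size_t word;
        memcpy(&word, data + i, sizeof(word));
        if (word & SWAR_REPEAT(0x80)) {
            break;
        }
        i += sizeof(size_t);
    }

    /* Byte-at-a-time tail, also pinpointing the exact byte inside the last word. */
    while (i < len && data[i] < 0x80) {
        i++;
    }

    return i;
}

As the removed comment notes, a plain word-at-a-time loop tends to beat SIMD here, because mixed ASCII and multibyte input forces the same bytes to be looked at more than once.
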
39 changes: 38 additions & 1 deletion ext/dom/lexbor/lexbor/html/tokenizer.c
@@ -14,6 +14,7 @@
#define LXB_HTML_TAG_RES_DATA
#define LXB_HTML_TAG_RES_SHS_DATA
#include "lexbor/html/tag_res.h"
#include "swar.h"


#define LXB_HTML_TKZ_TEMP_SIZE (4096 * 4)
@@ -304,6 +305,24 @@ lxb_html_tokenizer_begin(lxb_html_tokenizer_t *tkz)
return LXB_STATUS_OK;
}

static inline size_t count_utf8_codepoints(size_t bytes)
{
/* A byte starts a new code point unless its top two bits are 10 (a UTF-8 continuation byte).
* Starting from a full count of one code point per byte, we therefore subtract one for every byte
* whose top two bits are 10, i.e. every byte matching "first & ~second".
* The second bit is shifted left by one so that both masks line up on the same bit position. */
size_t firsts = bytes & SWAR_REPEAT(0b10000000);
size_t seconds = bytes & SWAR_REPEAT(0b01000000);
size_t matches = firsts & ~(seconds << 1);

size_t cnt = sizeof(size_t);
while (matches) {
matches &= matches - 1;
cnt--;
}

return cnt;
}

lxb_status_t
lxb_html_tokenizer_chunk(lxb_html_tokenizer_t *tkz, const lxb_char_t *data,
size_t size)
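
Worked example for count_utf8_codepoints() above: every UTF-8 continuation byte has the form 10xxxxxx, so the number of code points starting inside a word is the word size minus the number of continuation bytes. The snippet below is illustrative only (codepoints_in_word, the sample string and the SWAR_REPEAT definition are assumptions, not part of the PR):

#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define SWAR_REPEAT(byte) ((((size_t) -1) / 0xFF) * (byte))   /* assumed, as above */

static size_t codepoints_in_word(size_t bytes)
{
    /* A byte starts a code point unless its top two bits are 10. */
    size_t firsts  = bytes & SWAR_REPEAT(0x80);
    size_t seconds = bytes & SWAR_REPEAT(0x40);
    size_t continuations = firsts & ~(seconds << 1);

    size_t count = sizeof(size_t);
    while (continuations) {            /* Kernighan popcount of the marked bytes */
        continuations &= continuations - 1;
        count--;
    }
    return count;
}

int main(void)
{
    const char *text = "h\xc3\xa9llo, w";   /* "héllo, w": 0xC3 0xA9 encode the é */
    size_t word;
    memcpy(&word, text, sizeof(word));
    /* Prints 7 on a 64-bit build: 8 bytes, one of which (0xA9) is a continuation byte. */
    printf("%zu\n", codepoints_in_word(word));
    return 0;
}
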
@@ -315,8 +334,26 @@ lxb_html_tokenizer_chunk(lxb_html_tokenizer_t *tkz, const lxb_char_t *data,
tkz->last = end;

while (data < end) {
size_t current_column = tkz->current_column;
const lxb_char_t *new_data = tkz->state(tkz, data, end);
size_t current_column = tkz->current_column;

if (SWAR_IS_LITTLE_ENDIAN) {
while (data + sizeof(size_t) <= new_data) {
size_t bytes;
memcpy(&bytes, data, sizeof(size_t));

size_t matches = SWAR_HAS_ZERO(bytes ^ SWAR_REPEAT(0x0A));
if (matches) {
data += (((matches - 1) & SWAR_ONES) * SWAR_ONES) >> (sizeof(size_t) * 8 - 8);
tkz->current_line++;
current_column = 0;
} else {
data += sizeof(size_t);
current_column += count_utf8_codepoints(bytes);
}
}
}

while (data < new_data) {
/* Codepoints < 0x80 are encoded the same as their ASCII counterpart, so '\n' will uniquely identify a newline. */
if (*data == '\n') {
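
Note on the SWAR_IS_LITTLE_ENDIAN block above: XOR-ing a word with SWAR_REPEAT('\n') turns every newline byte into a zero byte, SWAR_HAS_ZERO marks such bytes, and the (matches - 1) multiply/shift trick recovers how many bytes precede and include the first match, which is exactly what data is advanced by. A self-contained sketch, assuming conventional definitions for the SWAR_* macros (the PR's swar.h is not shown here):

#include <stddef.h>

/* Assumed definitions in the spirit of the PR's swar.h (not shown in this diff). */
#define SWAR_ONES          (((size_t) -1) / 0xFF)                    /* 0x0101...01 */
#define SWAR_REPEAT(byte)  (SWAR_ONES * (byte))
/* Classic "zero byte" detector: sets the high bit of (at least) every zero byte,
 * and the lowest marked byte is always the lowest zero byte. */
#define SWAR_HAS_ZERO(v)   ((((v) - SWAR_ONES) & ~(v)) & SWAR_REPEAT(0x80))

/* Number of bytes up to and including the first '\n' in a little-endian word,
 * i.e. the amount the tokenizer advances `data` by; assumes a newline is present. */
static size_t bytes_through_first_newline(size_t word)
{
    size_t matches = SWAR_HAS_ZERO(word ^ SWAR_REPEAT('\n'));
    /* matches - 1 turns on every bit below the first marker; masking with SWAR_ONES
     * keeps one bit per byte up to and including the match, and multiplying by
     * SWAR_ONES sums those bits into the top byte, where the shift extracts them. */
    return (((matches - 1) & SWAR_ONES) * SWAR_ONES) >> (sizeof(size_t) * 8 - 8);
}

For example, for the chunk "ab\ncdefg" this returns 3, the same count the scalar loop would reach after consuming 'a', 'b' and the newline. The little-endian guard is needed because the trick assumes the first byte in memory is the word's least significant byte; bytes after the newline in the same word are simply re-examined on the next loop iteration.
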