wip #112

Open · wants to merge 1 commit into master

37 changes: 31 additions & 6 deletions ext/dom/html_document.c
@@ -27,6 +27,7 @@
#include "namespace_compat.h"
#include "private_data.h"
#include "dom_properties.h"
#include "swar.h"
#include <Zend/zend_smart_string.h>
#include <lexbor/html/encoding.h>
#include <lexbor/encoding/encoding.h>
@@ -512,6 +513,30 @@ static bool dom_process_parse_chunk(
return true;
}

/* This seeks, using SWAR techniques, to the first non-ASCII byte in a UTF-8 input.
* Returns true if the entire input was consumed without encountering non-ASCII, false otherwise. */
static zend_always_inline bool dom_seek_utf8_non_ascii(const lxb_char_t **data, const lxb_char_t *end)
{
while (*data + sizeof(size_t) <= end) {
size_t bytes;
memcpy(&bytes, *data, sizeof(bytes));
/* If the top bit is set, it's not ASCII. */
if ((bytes & SWAR_REPEAT(0x80)) != 0) {
return false;
}
*data += sizeof(size_t);
}

while (*data < end) {
if (**data >= 0x80) {
return false;
}
(*data)++;
}

return true;
}

static bool dom_decode_encode_fast_path(
lexbor_libxml2_bridge_parse_context *ctx,
lxb_html_document_t *document,
@@ -527,13 +552,13 @@
const lxb_char_t *last_output = buf_ref;
while (buf_ref != buf_end) {
/* Fast path converts non-validated UTF-8 -> validated UTF-8 */
- if (decoding_encoding_ctx->decode.u.utf_8.need == 0 && *buf_ref < 0x80) {
+ if (decoding_encoding_ctx->decode.u.utf_8.need == 0) {
/* Fast path within the fast path: try to skip non-mb bytes in bulk if we are not in a state where we
- * need more UTF-8 bytes to complete a sequence.
- * It might be tempting to use SIMD here, but it turns out that this is less efficient because
- * we need to process the same byte multiple times sometimes when mixing ASCII with multibyte. */
- buf_ref++;
- continue;
+ * need more UTF-8 bytes to complete a sequence. */
+ if (dom_seek_utf8_non_ascii(&buf_ref, buf_end)) {
+ ZEND_ASSERT(buf_ref == buf_end);
+ break;
+ }
}
const lxb_char_t *buf_ref_backup = buf_ref;
lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(&decoding_encoding_ctx->decode, &buf_ref, buf_end);
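
Note on the fast path above: swar.h itself is not part of this diff, so the SWAR_* helpers it provides are not shown. A minimal sketch of the idea, assuming a conventional SWAR_REPEAT definition (the macro body and the ascii_prefix_length helper below are illustrative, not the PR's actual code):

#include <stddef.h>
#include <string.h>

/* Assumed definition: replicate a byte into every byte of a size_t,
 * e.g. SWAR_REPEAT(0x80) == 0x8080808080808080 on 64-bit targets. */
#define SWAR_REPEAT(byte) ((((size_t) -1) / 0xFF) * (byte))

/* Illustrative stand-alone version of the same scan as dom_seek_utf8_non_ascii():
 * returns the length of the leading ASCII-only prefix of the buffer. */
static size_t ascii_prefix_length(const unsigned char *data, size_t len)
{
    size_t i = 0;

    /* Word-at-a-time: any byte with its top bit set is non-ASCII. */
    while (i + sizeof(size_t) <= len) {
        size_t word;
        memcpy(&word, data + i, sizeof(word));
        if (word & SWAR_REPEAT(0x80)) {
            break;
        }
        i += sizeof(size_t);
    }

    /* Byte-at-a-time tail, also pinpointing the exact byte inside the last word. */
    while (i < len && data[i] < 0x80) {
        i++;
    }

    return i;
}

As the removed comment notes, a plain word-at-a-time loop tends to beat SIMD here, because mixed ASCII and multibyte input forces the same bytes to be looked at more than once.
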
39 changes: 38 additions & 1 deletion ext/dom/lexbor/lexbor/html/tokenizer.c
@@ -14,6 +14,7 @@
#define LXB_HTML_TAG_RES_DATA
#define LXB_HTML_TAG_RES_SHS_DATA
#include "lexbor/html/tag_res.h"
#include "swar.h"


#define LXB_HTML_TKZ_TEMP_SIZE (4096 * 4)
@@ -304,6 +305,24 @@ lxb_html_tokenizer_begin(lxb_html_tokenizer_t *tkz)
return LXB_STATUS_OK;
}

static inline size_t count_utf8_codepoints(size_t bytes)
{
/* A byte starts a new code point unless its top two bits are 10 (a UTF-8 continuation byte).
* Starting from a full count of one code point per byte, we therefore subtract one for every byte
* whose top two bits are 10, i.e. every byte matching "first & ~second".
* The second bit is shifted left by one so that both masks line up on the same bit position. */
size_t firsts = bytes & SWAR_REPEAT(0b10000000);
size_t seconds = bytes & SWAR_REPEAT(0b01000000);
size_t matches = firsts & ~(seconds << 1);

size_t cnt = sizeof(size_t);
while (matches) {
matches &= matches - 1;
cnt--;
}

return cnt;
}

lxb_status_t
lxb_html_tokenizer_chunk(lxb_html_tokenizer_t *tkz, const lxb_char_t *data,
size_t size)
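
Worked example for count_utf8_codepoints() above: every UTF-8 continuation byte has the form 10xxxxxx, so the number of code points starting inside a word is the word size minus the number of continuation bytes. The snippet below is illustrative only (codepoints_in_word, the sample string and the SWAR_REPEAT definition are assumptions, not part of the PR):

#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define SWAR_REPEAT(byte) ((((size_t) -1) / 0xFF) * (byte))   /* assumed, as above */

static size_t codepoints_in_word(size_t bytes)
{
    /* A byte starts a code point unless its top two bits are 10. */
    size_t firsts  = bytes & SWAR_REPEAT(0x80);
    size_t seconds = bytes & SWAR_REPEAT(0x40);
    size_t continuations = firsts & ~(seconds << 1);

    size_t count = sizeof(size_t);
    while (continuations) {            /* Kernighan popcount of the marked bytes */
        continuations &= continuations - 1;
        count--;
    }
    return count;
}

int main(void)
{
    const char *text = "h\xc3\xa9llo, w";   /* "héllo, w": 0xC3 0xA9 encode the é */
    size_t word;
    memcpy(&word, text, sizeof(word));
    /* Prints 7 on a 64-bit build: 8 bytes, one of which (0xA9) is a continuation byte. */
    printf("%zu\n", codepoints_in_word(word));
    return 0;
}
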
@@ -315,8 +334,26 @@ lxb_html_tokenizer_chunk(lxb_html_tokenizer_t *tkz, const lxb_char_t *data,
tkz->last = end;

while (data < end) {
size_t current_column = tkz->current_column;
const lxb_char_t *new_data = tkz->state(tkz, data, end);
size_t current_column = tkz->current_column;

if (SWAR_IS_LITTLE_ENDIAN) {
while (data + sizeof(size_t) <= new_data) {
size_t bytes;
memcpy(&bytes, data, sizeof(size_t));

size_t matches = SWAR_HAS_ZERO(bytes ^ SWAR_REPEAT(0x0A));
if (matches) {
data += (((matches - 1) & SWAR_ONES) * SWAR_ONES) >> (sizeof(size_t) * 8 - 8);
tkz->current_line++;
current_column = 0;
} else {
data += sizeof(size_t);
current_column += count_utf8_codepoints(bytes);
}
}
}

while (data < new_data) {
/* Codepoints < 0x80 are encoded the same as their ASCII counterpart, so '\n' will uniquely identify a newline. */
if (*data == '\n') {
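
Note on the SWAR_IS_LITTLE_ENDIAN block above: XOR-ing a word with SWAR_REPEAT('\n') turns every newline byte into a zero byte, SWAR_HAS_ZERO marks such bytes, and the (matches - 1) multiply/shift trick recovers how many bytes precede and include the first match, which is exactly what data is advanced by. A self-contained sketch, assuming conventional definitions for the SWAR_* macros (the PR's swar.h is not shown here):

#include <stddef.h>

/* Assumed definitions in the spirit of the PR's swar.h (not shown in this diff). */
#define SWAR_ONES          (((size_t) -1) / 0xFF)                    /* 0x0101...01 */
#define SWAR_REPEAT(byte)  (SWAR_ONES * (byte))
/* Classic "zero byte" detector: sets the high bit of (at least) every zero byte,
 * and the lowest marked byte is always the lowest zero byte. */
#define SWAR_HAS_ZERO(v)   ((((v) - SWAR_ONES) & ~(v)) & SWAR_REPEAT(0x80))

/* Number of bytes up to and including the first '\n' in a little-endian word,
 * i.e. the amount the tokenizer advances `data` by; assumes a newline is present. */
static size_t bytes_through_first_newline(size_t word)
{
    size_t matches = SWAR_HAS_ZERO(word ^ SWAR_REPEAT('\n'));
    /* matches - 1 turns on every bit below the first marker; masking with SWAR_ONES
     * keeps one bit per byte up to and including the match, and multiplying by
     * SWAR_ONES sums those bits into the top byte, where the shift extracts them. */
    return (((matches - 1) & SWAR_ONES) * SWAR_ONES) >> (sizeof(size_t) * 8 - 8);
}

For example, for the chunk "ab\ncdefg" this returns 3, the same count the scalar loop would reach after consuming 'a', 'b' and the newline. The little-endian guard is needed because the trick assumes the first byte in memory is the word's least significant byte; bytes after the newline in the same word are simply re-examined on the next loop iteration.
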