From 429e4ac00d7d055d20c2b30e393a79f38d927728 Mon Sep 17 00:00:00 2001 From: Bo Bayles Date: Thu, 17 Jul 2025 09:13:31 -0500 Subject: [PATCH 1/4] Update to ada 3.2.6 --- ada_url/ada.cpp | 146 ++++++++++++++++++++++++++++++++++++++++++++++-- ada_url/ada.h | 26 +++++++-- 2 files changed, 161 insertions(+), 11 deletions(-) diff --git a/ada_url/ada.cpp b/ada_url/ada.cpp index 69675cf..16e3fe2 100644 --- a/ada_url/ada.cpp +++ b/ada_url/ada.cpp @@ -1,4 +1,4 @@ -/* auto-generated on 2025-06-30 19:51:09 -0400. Do not edit! */ +/* auto-generated on 2025-07-16 22:15:14 -0400. Do not edit! */ /* begin file src/ada.cpp */ #include "ada.h" /* begin file src/checkers.cpp */ @@ -67,7 +67,8 @@ static constexpr std::array path_signature_table = std::array result{}; for (size_t i = 0; i < 256; i++) { if (i <= 0x20 || i == 0x22 || i == 0x23 || i == 0x3c || i == 0x3e || - i == 0x3f || i == 0x60 || i == 0x7b || i == 0x7d || i > 0x7e) { + i == 0x3f || i == 0x5e || i == 0x60 || i == 0x7b || i == 0x7d || + i > 0x7e) { result[i] = 1; } else if (i == 0x25) { result[i] = 8; @@ -10444,6 +10445,8 @@ ADA_POP_DISABLE_WARNINGS #include #elif ADA_SSE2 #include +#elif ADA_LSX +#include #endif #include @@ -10552,6 +10555,38 @@ ada_really_inline bool has_tabs_or_newline( } return _mm_movemask_epi8(running) != 0; } +#elif ADA_LSX +ada_really_inline bool has_tabs_or_newline( + std::string_view user_input) noexcept { + // first check for short strings in which case we do it naively. + if (user_input.size() < 16) { // slow path + return std::ranges::any_of(user_input, is_tabs_or_newline); + } + // fast path for long strings (expected to be common) + size_t i = 0; + const __m128i mask1 = __lsx_vrepli_b('\r'); + const __m128i mask2 = __lsx_vrepli_b('\n'); + const __m128i mask3 = __lsx_vrepli_b('\t'); + // If we supported SSSE3, we could use the algorithm that we use for NEON. 
+ __m128i running{0}; + for (; i + 15 < user_input.size(); i += 16) { + __m128i word = __lsx_vld((const __m128i*)(user_input.data() + i), 0); + running = __lsx_vor_v( + __lsx_vor_v(running, __lsx_vor_v(__lsx_vseq_b(word, mask1), + __lsx_vseq_b(word, mask2))), + __lsx_vseq_b(word, mask3)); + } + if (i < user_input.size()) { + __m128i word = __lsx_vld( + (const __m128i*)(user_input.data() + user_input.length() - 16), 0); + running = __lsx_vor_v( + __lsx_vor_v(running, __lsx_vor_v(__lsx_vseq_b(word, mask1), + __lsx_vseq_b(word, mask2))), + __lsx_vseq_b(word, mask3)); + } + if (__lsx_bz_v(running)) return false; + return true; +} #else ada_really_inline bool has_tabs_or_newline( std::string_view user_input) noexcept { @@ -11385,6 +11420,58 @@ ada_really_inline size_t find_next_host_delimiter_special( } return size_t(view.length()); } +#elif ADA_LSX +ada_really_inline size_t find_next_host_delimiter_special( + std::string_view view, size_t location) noexcept { + // first check for short strings in which case we do it naively. + if (view.size() - location < 16) { // slow path + for (size_t i = location; i < view.size(); i++) { + if (view[i] == ':' || view[i] == '/' || view[i] == '\\' || + view[i] == '?' 
|| view[i] == '[') { + return i; + } + } + return size_t(view.size()); + } + // fast path for long strings (expected to be common) + size_t i = location; + const __m128i mask1 = __lsx_vrepli_b(':'); + const __m128i mask2 = __lsx_vrepli_b('/'); + const __m128i mask3 = __lsx_vrepli_b('\\'); + const __m128i mask4 = __lsx_vrepli_b('?'); + const __m128i mask5 = __lsx_vrepli_b('['); + + for (; i + 15 < view.size(); i += 16) { + __m128i word = __lsx_vld((const __m128i*)(view.data() + i), 0); + __m128i m1 = __lsx_vseq_b(word, mask1); + __m128i m2 = __lsx_vseq_b(word, mask2); + __m128i m3 = __lsx_vseq_b(word, mask3); + __m128i m4 = __lsx_vseq_b(word, mask4); + __m128i m5 = __lsx_vseq_b(word, mask5); + __m128i m = + __lsx_vor_v(__lsx_vor_v(__lsx_vor_v(m1, m2), __lsx_vor_v(m3, m4)), m5); + int mask = __lsx_vpickve2gr_hu(__lsx_vmsknz_b(m), 0); + if (mask != 0) { + return i + trailing_zeroes(mask); + } + } + if (i < view.size()) { + __m128i word = + __lsx_vld((const __m128i*)(view.data() + view.length() - 16), 0); + __m128i m1 = __lsx_vseq_b(word, mask1); + __m128i m2 = __lsx_vseq_b(word, mask2); + __m128i m3 = __lsx_vseq_b(word, mask3); + __m128i m4 = __lsx_vseq_b(word, mask4); + __m128i m5 = __lsx_vseq_b(word, mask5); + __m128i m = + __lsx_vor_v(__lsx_vor_v(__lsx_vor_v(m1, m2), __lsx_vor_v(m3, m4)), m5); + int mask = __lsx_vpickve2gr_hu(__lsx_vmsknz_b(m), 0); + if (mask != 0) { + return view.length() - 16 + trailing_zeroes(mask); + } + } + return size_t(view.length()); +} #else // : / [ \\ ? static constexpr std::array special_host_delimiters = @@ -11518,6 +11605,53 @@ ada_really_inline size_t find_next_host_delimiter(std::string_view view, } return size_t(view.length()); } +#elif ADA_LSX +ada_really_inline size_t find_next_host_delimiter(std::string_view view, + size_t location) noexcept { + // first check for short strings in which case we do it naively. 
+ if (view.size() - location < 16) { // slow path + for (size_t i = location; i < view.size(); i++) { + if (view[i] == ':' || view[i] == '/' || view[i] == '?' || + view[i] == '[') { + return i; + } + } + return size_t(view.size()); + } + // fast path for long strings (expected to be common) + size_t i = location; + const __m128i mask1 = __lsx_vrepli_b(':'); + const __m128i mask2 = __lsx_vrepli_b('/'); + const __m128i mask4 = __lsx_vrepli_b('?'); + const __m128i mask5 = __lsx_vrepli_b('['); + + for (; i + 15 < view.size(); i += 16) { + __m128i word = __lsx_vld((const __m128i*)(view.data() + i), 0); + __m128i m1 = __lsx_vseq_b(word, mask1); + __m128i m2 = __lsx_vseq_b(word, mask2); + __m128i m4 = __lsx_vseq_b(word, mask4); + __m128i m5 = __lsx_vseq_b(word, mask5); + __m128i m = __lsx_vor_v(__lsx_vor_v(m1, m2), __lsx_vor_v(m4, m5)); + int mask = __lsx_vpickve2gr_hu(__lsx_vmsknz_b(m), 0); + if (mask != 0) { + return i + trailing_zeroes(mask); + } + } + if (i < view.size()) { + __m128i word = + __lsx_vld((const __m128i*)(view.data() + view.length() - 16), 0); + __m128i m1 = __lsx_vseq_b(word, mask1); + __m128i m2 = __lsx_vseq_b(word, mask2); + __m128i m4 = __lsx_vseq_b(word, mask4); + __m128i m5 = __lsx_vseq_b(word, mask5); + __m128i m = __lsx_vor_v(__lsx_vor_v(m1, m2), __lsx_vor_v(m4, m5)); + int mask = __lsx_vpickve2gr_hu(__lsx_vmsknz_b(m), 0); + if (mask != 0) { + return view.length() - 16 + trailing_zeroes(mask); + } + } + return size_t(view.length()); +} #else // : / [ ? static constexpr std::array host_delimiters = []() consteval { @@ -11762,8 +11896,8 @@ ada_really_inline void parse_prepared_path(std::string_view input, ? 
path_buffer_tmp : path_view; if (unicode::is_double_dot_path_segment(path_buffer)) { - if ((helpers::shorten_path(path, type) || special) && - location == std::string_view::npos) { + helpers::shorten_path(path, type); + if (location == std::string_view::npos) { path += '/'; } } else if (unicode::is_single_dot_path_segment(path_buffer) && @@ -15318,8 +15452,8 @@ inline void url_aggregator::consume_prepared_path(std::string_view input) { ? path_buffer_tmp : path_view; if (unicode::is_double_dot_path_segment(path_buffer)) { - if ((helpers::shorten_path(path, type) || special) && - location == std::string_view::npos) { + helpers::shorten_path(path, type); + if (location == std::string_view::npos) { path += '/'; } } else if (unicode::is_single_dot_path_segment(path_buffer) && diff --git a/ada_url/ada.h b/ada_url/ada.h index 5774be7..e0be62f 100644 --- a/ada_url/ada.h +++ b/ada_url/ada.h @@ -1,4 +1,4 @@ -/* auto-generated on 2025-06-30 19:51:09 -0400. Do not edit! */ +/* auto-generated on 2025-07-16 22:15:14 -0400. Do not edit! */ /* begin file include/ada.h */ /** * @file ada.h @@ -431,6 +431,10 @@ namespace ada { #define ADA_NEON 1 #endif +#if defined(__loongarch_sx) +#define ADA_LSX 1 +#endif + #ifndef __has_cpp_attribute #define ada_lifetime_bound #elif __has_cpp_attribute(msvc::lifetimebound) @@ -4204,6 +4208,7 @@ enum class errors : uint8_t { type_error }; #include #include #include +#include #if ADA_TESTING #include @@ -4233,6 +4238,17 @@ struct url_pattern_init { pattern, }; + friend std::ostream& operator<<(std::ostream& os, process_type type) { + switch (type) { + case process_type::url: + return os << "url"; + case process_type::pattern: + return os << "pattern"; + default: + return os << "unknown"; + } + } + // All strings must be valid UTF-8. 
// @see https://urlpattern.spec.whatwg.org/#process-a-urlpatterninit static tl::expected process( @@ -9410,7 +9426,7 @@ result> url_pattern::match( #if ADA_INCLUDE_URL_PATTERN namespace ada::url_pattern_helpers { -#ifdef ADA_TESTING +#if defined(ADA_TESTING) || defined(ADA_LOGGING) inline std::string to_string(token_type type) { switch (type) { case token_type::INVALID_CHAR: @@ -9437,7 +9453,7 @@ inline std::string to_string(token_type type) { ada::unreachable(); } } -#endif // ADA_TESTING +#endif // defined(ADA_TESTING) || defined(ADA_LOGGING) template constexpr void constructor_string_parser::rewind() { @@ -10498,14 +10514,14 @@ constructor_string_parser::parse(std::string_view input) { #ifndef ADA_ADA_VERSION_H #define ADA_ADA_VERSION_H -#define ADA_VERSION "3.2.5" +#define ADA_VERSION "3.2.6" namespace ada { enum { ADA_VERSION_MAJOR = 3, ADA_VERSION_MINOR = 2, - ADA_VERSION_REVISION = 5, + ADA_VERSION_REVISION = 6, }; } // namespace ada From 565aece78c9748e6d5378e31d122e161e2e54410 Mon Sep 17 00:00:00 2001 From: Bo Bayles Date: Thu, 17 Jul 2025 09:25:01 -0500 Subject: [PATCH 2/4] Updates to README.rst --- README.rst | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/README.rst b/README.rst index d1c59ae..a5f98cd 100644 --- a/README.rst +++ b/README.rst @@ -1,9 +1,6 @@ ada-url ======== -The `urlib.parse` module in Python does not follow the legacy RFC 3978 standard nor -does it follow the newer WHATWG URL specification. It is also relatively slow. - This is ``ada_url``, a fast standard-compliant Python library for working with URLs based on the ``Ada`` URL parser. @@ -127,7 +124,8 @@ that it properly encodes IDNs and resolves paths: >>> parsed_url.pathname '/path2/' -Contrast that with the Python standard library's ``urlib.parse`` module: +Contrast that with the Python standard library's ``urllib.parse`` module, which loosely +follows the older `RFC 3986 `__ standard: .. 
code-block:: python @@ -138,11 +136,13 @@ Contrast that with the Python standard library's ``urlib.parse`` module: >>> parsed_url.path '/./path/../path2/' -Alternative Python bindings ---------------------------- +Performance +----------- This package uses `CFFI `__ to call -the ``Ada`` library's functions, which has a performance cost. -The alternative `can_ada `__ (Canadian Ada) -package uses `pybind11 `__ to generate a -Python extension module, which is more performant. +the ``Ada`` C library's functions, which makes it faster than the Python standard +library's ``urllib.parse`` module for most applications. + +An alternative package, `can_ada `__, uses +`pybind11 `__ to interact with the ``Ada`` +C++ library functions, which is even faster. From 3675468df179be9437e8769576cc3b2657c864b7 Mon Sep 17 00:00:00 2001 From: Bo Bayles Date: Thu, 17 Jul 2025 09:25:44 -0500 Subject: [PATCH 3/4] Bump version to 1.25.0 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 2f81224..e56678e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "ada-url" -version = "1.24.0" +version = "1.25.0" authors = [ {name = "Bo Bayles", email = "bo@bbayles.com"}, ] From 4439e9c2ac81271dd0ada8d136bb64fe558f18e4 Mon Sep 17 00:00:00 2001 From: Bo Bayles Date: Thu, 17 Jul 2025 09:27:15 -0500 Subject: [PATCH 4/4] Drop extra punctuation --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index a5f98cd..905edf6 100644 --- a/README.rst +++ b/README.rst @@ -24,7 +24,7 @@ Parsing URLs ^^^^^^^^^^^^ The ``URL`` class is intended to match the one described in the -`WHATWG URL spec `_:. +`WHATWG URL spec `_. .. code-block:: python