|
| 1 | +// Copyright 2024-2025 Rimas Misevičius |
| 2 | +// Distributed under the BSD-style license that can be |
| 3 | +// found in the LICENSE file. |
| 4 | +// |
| 5 | +#include "public_suffix_list.h" |
| 6 | + |
| 7 | +// Copyright 2024-2025 Rimas Misevičius |
| 8 | +// Distributed under the BSD-style license that can be |
| 9 | +// found in the LICENSE file. |
| 10 | +// |
| 11 | +// Formal algorithm: |
| 12 | +// https://github.com/publicsuffix/list/wiki/Format#formal-algorithm |
| 13 | +// |
| 14 | +// #include "upa/public_suffix_list.h" |
| 15 | + |
| 16 | + |
| 17 | +namespace upa { |
| 18 | +namespace { |
| 19 | + |
| 20 | +// utilities |
| 21 | + |
// Splits a domain name into its dot-separated labels and iterates over them
// from the rightmost label to the leftmost one.
//
// The splitter only keeps a view of the text: the characters referenced by
// `domain` must stay valid for as long as the splitter, or any
// std::string_view label obtained from it, is in use.
class splitter {
public:
    // Records the start position of every label of `domain` and positions the
    // iteration before the rightmost label. An empty `domain` yields a single
    // empty label.
    explicit splitter(std::string_view domain);

    // Returns true if at least one label is empty (leading dot, trailing dot,
    // two consecutive dots or an empty domain).
    bool contains_empty() const;

    // Restarts the iteration at the rightmost label.
    void start();

    // Stores the next label (moving right to left) in `label` and returns
    // true. Returns false and leaves `label` untouched if all labels have
    // already been consumed.
    bool next(std::string& label);
    bool next(std::string_view& label);

    // Index (0 = leftmost label) of the label most recently returned by
    // next(); equals size() if next() was not called since construction or
    // the last start().
    std::size_t index() const {
        return label_ind_;
    }
    // True once the leftmost label has been returned by next().
    bool at_end() const {
        return label_ind_ == 0;
    }

    // Number of labels.
    std::size_t size() const {
        return label_pos_.size();
    }
    // Offset in the domain of the first character of the label with index
    // `ind`; `ind` must be less than size().
    std::size_t get_pos_by_index(std::size_t ind) const {
        return label_pos_[ind];
    }

private:
    std::string_view domain_;
    // Start offset of every label, leftmost label first
    std::vector<std::size_t> label_pos_;

    // Iteration state: end offset (exclusive) of the next label to return
    // and the number of labels not returned yet
    std::size_t label_end_ = 0;
    std::size_t label_ind_ = 0;
};

inline splitter::splitter(std::string_view domain)
    : domain_{ domain }
{
    label_pos_.reserve(16);
    label_pos_.push_back(0); // the first label always starts at offset 0
    for (std::size_t pos = domain_.find('.');
         pos != std::string_view::npos;
         pos = domain_.find('.', pos))
        label_pos_.push_back(++pos); // skip '.' and add pos
    start();
}

inline bool splitter::contains_empty() const {
    std::size_t label_end = domain_.length();
    // label_pos_ has at least one element
    for (std::size_t ind = label_pos_.size(); ; --ind) {
        const std::size_t label_start = label_pos_[ind - 1];
        if (label_start == label_end)
            return true;
        if (ind == 1)
            break;
        label_end = label_start - 1; // skip '.'
    }
    return false;
}

inline void splitter::start() {
    label_end_ = domain_.length();
    label_ind_ = label_pos_.size();
}

inline bool splitter::next(std::string& label) {
    // Share the iteration logic with the std::string_view overload
    std::string_view view;
    if (!next(view))
        return false;
    label = view;
    return true;
}

inline bool splitter::next(std::string_view& label) {
    if (label_ind_ == 0)
        return false;
    const auto pos = label_pos_[--label_ind_];
    label = domain_.substr(pos, label_end_ - pos);
    // The '.' in front of this label ends the next one. The leftmost label
    // has nothing in front of it, so do not step below zero.
    label_end_ = (pos != 0) ? pos - 1 : 0;
    return true;
}
| 101 | + |
| 102 | +} // namespace |
| 103 | + |
| 104 | +// class public_suffix_list |
| 105 | + |
| 106 | +bool public_suffix_list::load(std::istream& input_stream) { |
| 107 | + push_context ctx; |
| 108 | + |
| 109 | + std::string line; |
| 110 | + while (std::getline(input_stream, line)) |
| 111 | + push_line(ctx, line); |
| 112 | + return !input_stream.bad() && ctx.error == 0 && ctx.code_flags == 0; |
| 113 | +} |
| 114 | + |
| 115 | +void public_suffix_list::push_line(push_context& ctx, std::string_view line) { |
| 116 | + static constexpr auto insert = [](label_item& root, std::string_view input, std::uint8_t code) { |
| 117 | + // TODO: maybe only to Punycode |
| 118 | + const std::string domain = upa::url_host{ input }.to_string(); |
| 119 | + |
| 120 | + splitter labels(domain); |
| 121 | + label_item* pli = &root; |
| 122 | + std::string label; |
| 123 | + while (labels.next(label)) { |
| 124 | + if (!pli->children) |
| 125 | + pli->children = std::make_unique<label_item::map_type>(); |
| 126 | + if (labels.at_end()) |
| 127 | + (*pli->children)[label].code = code; |
| 128 | + else |
| 129 | + pli->children->emplace(label, label_item{}); |
| 130 | + pli = &(*pli->children)[label]; |
| 131 | + } |
| 132 | + }; |
| 133 | + |
| 134 | + try { |
| 135 | + if (line.empty()) |
| 136 | + return; |
| 137 | + if (line.length() >= 2) { |
| 138 | + if (line[0] == '/' && line[1] == '/') { |
| 139 | + if (line == "// ===BEGIN ICANN DOMAINS===") |
| 140 | + ctx.code_flags = IS_ICANN; |
| 141 | + else if (line == "// ===BEGIN PRIVATE DOMAINS===") |
| 142 | + ctx.code_flags = IS_PRIVATE; |
| 143 | + else if (line == "// ===END ICANN DOMAINS===" || |
| 144 | + line == "// ===END PRIVATE DOMAINS===") |
| 145 | + ctx.code_flags = 0; |
| 146 | + return; |
| 147 | + } |
| 148 | + if (line[0] == '*' && line[1] == '.') { |
| 149 | + insert(root_, line.substr(2), 3 | IS_RULE | ctx.code_flags); |
| 150 | + return; |
| 151 | + } |
| 152 | + } |
| 153 | + if (line[0] == '!') |
| 154 | + insert(root_, line.substr(1), 1 | IS_RULE | ctx.code_flags); |
| 155 | + else |
| 156 | + insert(root_, line, 2 | IS_RULE | ctx.code_flags); |
| 157 | + } |
| 158 | + catch (const upa::url_error&) { |
| 159 | + ctx.error |= 1; |
| 160 | + } |
| 161 | +} |
| 162 | + |
| 163 | +void public_suffix_list::push(push_context& ctx, std::string_view buff) { |
| 164 | + std::size_t sol = 0; |
| 165 | + if (!ctx.remaining.empty()) { |
| 166 | + const auto eol = buff.find('\n', 0); |
| 167 | + ctx.remaining += buff.substr(0, eol); |
| 168 | + if (eol == std::string_view::npos) |
| 169 | + return; |
| 170 | + push_line(ctx, ctx.remaining); |
| 171 | + ctx.remaining.clear(); |
| 172 | + sol = eol + 1; // skip '\n' |
| 173 | + } |
| 174 | + while (sol < buff.size()) { |
| 175 | + const auto eol = buff.find('\n', sol); |
| 176 | + if (eol == std::string_view::npos) { |
| 177 | + ctx.remaining = buff.substr(sol); |
| 178 | + return; |
| 179 | + } |
| 180 | + push_line(ctx, buff.substr(sol, eol - sol)); |
| 181 | + sol = eol + 1; // skip '\n' |
| 182 | + } |
| 183 | +} |
| 184 | + |
| 185 | +bool public_suffix_list::finalize(push_context& ctx) { |
| 186 | + if (!ctx.remaining.empty()) { |
| 187 | + push_line(ctx, ctx.remaining); |
| 188 | + ctx.remaining.clear(); |
| 189 | + } |
| 190 | + // free up memory |
| 191 | + ctx.remaining.shrink_to_fit(); |
| 192 | + return ctx.error == 0 && ctx.code_flags == 0; |
| 193 | +} |
| 194 | + |
| 195 | + |
| 196 | +public_suffix_list::result public_suffix_list::get_host_suffix_info( |
| 197 | + std::string_view hostname, option opt) const { |
| 198 | + if (hostname.empty()) |
| 199 | + return {}; |
| 200 | + |
| 201 | + if (hostname.back() == '.') |
| 202 | + hostname.remove_suffix(1); // remove trailing dot |
| 203 | + |
| 204 | + // Split to labels |
| 205 | + splitter labels(hostname); |
| 206 | + |
| 207 | + // Empty labels are not permitted, see: |
| 208 | + // https://github.com/publicsuffix/list/wiki/Format#definitions |
| 209 | + if (labels.contains_empty()) |
| 210 | + return {}; |
| 211 | + |
| 212 | + const label_item* pli = &root_; |
| 213 | + std::uint8_t latest_code = 0; |
| 214 | + std::size_t latest_ind = 0; |
| 215 | + std::string_view label; |
| 216 | + while (labels.next(label) && pli->children) { |
| 217 | +#ifdef __cpp_lib_generic_unordered_lookup |
| 218 | + auto it = pli->children->find(label); |
| 219 | +#else |
| 220 | + auto it = pli->children->find(std::string{ label }); |
| 221 | +#endif |
| 222 | + if (it == pli->children->end()) |
| 223 | + break; |
| 224 | + if (it->second.code && ( |
| 225 | + (it->second.code & DIFF_MASK) != 3 || !labels.at_end())) { |
| 226 | + latest_code = it->second.code; |
| 227 | + latest_ind = labels.index(); |
| 228 | + } |
| 229 | + pli = &it->second; |
| 230 | + } |
| 231 | + if (latest_code == 0) { |
| 232 | + // Unlisted TLD: If no rules match, the prevailing rule is "*" |
| 233 | + latest_code = 2; |
| 234 | + latest_ind = labels.size() - 1; // index of rightmost label |
| 235 | + } |
| 236 | + // Calculate result |
| 237 | + const int ind_diff = static_cast<int>(latest_code & DIFF_MASK) - 2 + |
| 238 | + static_cast<int>(opt & option::registrable_domain); |
| 239 | + if (ind_diff <= 0 || static_cast<std::size_t>(ind_diff) <= latest_ind) { |
| 240 | + const auto ind = latest_ind - ind_diff; |
| 241 | + if (ind < labels.size()) |
| 242 | + return { ind, labels.get_pos_by_index(ind), latest_code }; |
| 243 | + } |
| 244 | + return {}; |
| 245 | +} |
| 246 | + |
| 247 | +bool public_suffix_list::operator==(const public_suffix_list& other) const { |
| 248 | + return root_ == other.root_; |
| 249 | +} |
| 250 | + |
| 251 | +public_suffix_list::public_suffix_list() = default; |
| 252 | +public_suffix_list::~public_suffix_list() = default; |
| 253 | +public_suffix_list::public_suffix_list(public_suffix_list&&) noexcept = default; |
| 254 | +public_suffix_list& public_suffix_list::operator=(public_suffix_list&&) noexcept = default; |
| 255 | + |
| 256 | +} // namespace upa |
0 commit comments