Skip to content

Commit 79f92c6

Browse files
committed
Add the Public Suffix List (PSL) class
Provides functions to load the PSL from a file and to get the public suffix and the registrable domain of a host name.
1 parent 39b6798 commit 79f92c6

File tree

8 files changed

+738
-2
lines changed

8 files changed

+738
-2
lines changed

.gitattributes

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
# Source files
88
###############################################################################
99

10+
*.dat text eol=lf
1011
*.md text
1112
*.toml text
1213
*.txt text

CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,8 @@ nanobind_add_module(
4141

4242
# Source code
4343
src/upa_url_bind.cpp
44-
ext/upa/url.cpp)
44+
ext/upa/url.cpp
45+
ext/upa/public_suffix_list.cpp)
4546

4647
# Install directive for scikit-build-core
4748
install(TARGETS upa_url LIBRARY DESTINATION upa_url)

ext/upa/public_suffix_list.cpp

Lines changed: 256 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,256 @@
1+
// Copyright 2024-2025 Rimas Misevičius
2+
// Distributed under the BSD-style license that can be
3+
// found in the LICENSE file.
4+
//
5+
#include "public_suffix_list.h"
6+
7+
// Copyright 2024-2025 Rimas Misevičius
8+
// Distributed under the BSD-style license that can be
9+
// found in the LICENSE file.
10+
//
11+
// Formal algorithm:
12+
// https://github.com/publicsuffix/list/wiki/Format#formal-algorithm
13+
//
14+
// #include "upa/public_suffix_list.h"
15+
16+
17+
namespace upa {
18+
namespace {
19+
20+
// utilities
21+
22+
// Splits a domain name into its dot-separated labels and iterates over them
// from the rightmost label to the leftmost one.
//
// The splitter does not own the text: the characters referenced by `domain`
// must stay alive for as long as the splitter, or any std::string_view label
// obtained from it, is in use.
class splitter {
public:
    // Records the start position of every label in `domain`.
    // The iteration is positioned before the rightmost label.
    explicit splitter(std::string_view domain);

    // Returns true if at least one label is empty, i.e. the domain is empty,
    // starts or ends with a dot, or contains two consecutive dots.
    bool contains_empty() const;

    // Restarts the iteration before the rightmost label.
    void start();

    // Fetches the next label, moving from right to left.
    // Returns false (leaving `label` untouched) when no labels remain.
    bool next(std::string& label);
    bool next(std::string_view& label);

    // Index (counted from the left, zero based) of the label most recently
    // returned by next(); equals size() if next() has not been called yet.
    std::size_t index() const {
        return label_ind_;
    }
    // True once the leftmost label has been returned by next().
    bool at_end() const {
        return label_ind_ == 0;
    }

    // Number of labels; always at least one.
    std::size_t size() const {
        return label_pos_.size();
    }
    // Offset in the domain of the first character of the label with index
    // `ind`. Precondition: ind < size().
    std::size_t get_pos_by_index(std::size_t ind) const {
        return label_pos_[ind];
    }

private:
    std::string_view domain_;
    // Start offset of every label, ordered from left to right
    std::vector<std::size_t> label_pos_;

    // End offset (exclusive) of the label the next call of next() returns
    std::size_t label_end_ = 0;
    // One past the index of the label the next call of next() returns
    std::size_t label_ind_ = 0;
};

inline splitter::splitter(std::string_view domain)
    : domain_{ domain }
    , label_end_{ domain_.length() }
{
    label_pos_.reserve(16);
    label_pos_.push_back(0);
    std::size_t pos = 0;
    while ((pos = domain_.find('.', pos)) != std::string_view::npos)
        label_pos_.push_back(++pos); // skip '.' and add pos
    label_ind_ = label_pos_.size();
}

inline bool splitter::contains_empty() const {
    const std::size_t count = label_pos_.size();
    for (std::size_t ind = 0; ind < count; ++ind) {
        // A label ends just before the dot that precedes the next label;
        // the last label ends at the end of the domain.
        const std::size_t label_end = (ind + 1 < count)
            ? label_pos_[ind + 1] - 1
            : domain_.length();
        if (label_end == label_pos_[ind])
            return true;
    }
    return false;
}

inline void splitter::start() {
    label_end_ = domain_.length();
    label_ind_ = label_pos_.size();
}

inline bool splitter::next(std::string_view& label) {
    if (label_ind_ == 0)
        return false;
    const auto pos = label_pos_[--label_ind_];
    label = domain_.substr(pos, label_end_ - pos);
    // The next label ends just before the '.' that precedes this one. The
    // leftmost label (pos == 0) has no preceding dot, so do not let the
    // unsigned subtraction wrap around.
    label_end_ = (pos != 0) ? pos - 1 : 0;
    return true;
}

inline bool splitter::next(std::string& label) {
    // Shares the iteration logic with the std::string_view overload
    std::string_view view;
    if (!next(view))
        return false;
    label.assign(view);
    return true;
}
101+
102+
} // namespace
103+
104+
// class public_suffix_list
105+
106+
bool public_suffix_list::load(std::istream& input_stream) {
107+
push_context ctx;
108+
109+
std::string line;
110+
while (std::getline(input_stream, line))
111+
push_line(ctx, line);
112+
return !input_stream.bad() && ctx.error == 0 && ctx.code_flags == 0;
113+
}
114+
115+
// Parses one line of the Public Suffix List and adds the rule it contains
// (if any) to the label tree rooted at root_.
//
// Line handling:
//  - lines starting with "//" are comments; the "===BEGIN ... DOMAINS===" and
//    "===END ... DOMAINS===" markers switch ctx.code_flags between IS_ICANN,
//    IS_PRIVATE and 0. Trailing whitespace (for example the '\r' of a CRLF
//    line ending) is ignored when a marker is recognized;
//  - any other line is read only up to the first whitespace, as the PSL
//    format requires; if nothing remains, the line is ignored;
//  - "*.<domain>" is a wildcard rule (kind 3), "!<domain>" is an exception
//    rule (kind 1), anything else is a normal rule (kind 2).
//
// The code stored for a rule is its kind combined with IS_RULE and the
// current section flags. If parsing the domain throws upa::url_error, bit 1
// of ctx.error is set and the line is skipped.
void public_suffix_list::push_line(push_context& ctx, std::string_view line) {
    static constexpr std::string_view whitespace = " \t\n\v\f\r";

    static constexpr auto insert = [](label_item& root, std::string_view input, std::uint8_t code) {
        // TODO: maybe only to Punycode
        const std::string domain = upa::url_host{ input }.to_string();

        // Walk the labels from right to left, creating missing tree nodes
        splitter labels(domain);
        label_item* pli = &root;
        std::string label;
        while (labels.next(label)) {
            if (!pli->children)
                pli->children = std::make_unique<label_item::map_type>();
            // Single lookup: finds the existing node or creates an empty one
            label_item& child = pli->children->try_emplace(label).first->second;
            // Only the leftmost label of the rule carries the rule code
            if (labels.at_end())
                child.code = code;
            pli = &child;
        }
    };

    try {
        if (line.length() >= 2 && line[0] == '/' && line[1] == '/') {
            // A comment starts with "//", so there is always a
            // non-whitespace character and the +1 below cannot overflow
            const auto marker = line.substr(0, line.find_last_not_of(whitespace) + 1);
            if (marker == "// ===BEGIN ICANN DOMAINS===")
                ctx.code_flags = IS_ICANN;
            else if (marker == "// ===BEGIN PRIVATE DOMAINS===")
                ctx.code_flags = IS_PRIVATE;
            else if (marker == "// ===END ICANN DOMAINS===" ||
                marker == "// ===END PRIVATE DOMAINS===")
                ctx.code_flags = 0;
            return;
        }

        // A rule line is only read up to the first whitespace
        line = line.substr(0, line.find_first_of(whitespace));
        if (line.empty())
            return;

        if (line.length() >= 2 && line[0] == '*' && line[1] == '.')
            insert(root_, line.substr(2), 3 | IS_RULE | ctx.code_flags);
        else if (line[0] == '!')
            insert(root_, line.substr(1), 1 | IS_RULE | ctx.code_flags);
        else
            insert(root_, line, 2 | IS_RULE | ctx.code_flags);
    }
    catch (const upa::url_error&) {
        ctx.error |= 1;
    }
}
162+
163+
// Feeds one chunk of Public Suffix List text to the parser.
//
// The chunk may end in the middle of a line: every complete line (terminated
// by '\n') is passed to push_line(), and an unterminated tail is kept in
// ctx.remaining so that the next call of push() or finalize() completes it.
void public_suffix_list::push(push_context& ctx, std::string_view buff) {
    constexpr auto npos = std::string_view::npos;

    // First complete the partial line left over from the previous chunk
    if (!ctx.remaining.empty()) {
        const auto nl = buff.find('\n');
        ctx.remaining += buff.substr(0, nl);
        if (nl == npos)
            return; // still no end of line in this chunk
        push_line(ctx, ctx.remaining);
        ctx.remaining.clear();
        buff.remove_prefix(nl + 1); // drop the line and its '\n'
    }

    // Then process the lines fully contained in this chunk
    while (!buff.empty()) {
        const auto nl = buff.find('\n');
        if (nl == npos) {
            // unterminated tail: keep it for the next call
            ctx.remaining = buff;
            return;
        }
        push_line(ctx, buff.substr(0, nl));
        buff.remove_prefix(nl + 1); // drop the line and its '\n'
    }
}
184+
185+
// Completes a load performed with push(): parses the last line if it was not
// terminated by '\n' and releases the buffer that held it.
//
// Returns true if no line was rejected (ctx.error is 0) and the section flags
// are back to 0.
bool public_suffix_list::finalize(push_context& ctx) {
    auto& tail = ctx.remaining;
    if (!tail.empty()) {
        push_line(ctx, tail);
        tail.clear();
    }
    // the buffer is not needed anymore, ask to release its memory
    tail.shrink_to_fit();

    const bool parsed_ok = ctx.error == 0;
    const bool sections_closed = ctx.code_flags == 0;
    return parsed_ok && sections_closed;
}
194+
195+
196+
// Finds the public suffix (or, depending on `opt`, the registrable domain)
// of `hostname` using the loaded rules.
//
// One trailing dot of `hostname` is ignored. On success the result holds the
// index of the label where the answer starts, the offset of that label in
// `hostname` and the code of the prevailing rule. A default constructed
// result is returned if `hostname` is empty, contains an empty label, or is
// too short to contain the requested part.
public_suffix_list::result public_suffix_list::get_host_suffix_info(
    std::string_view hostname, option opt) const {
    if (hostname.empty())
        return {};

    // remove trailing dot
    if (hostname.back() == '.')
        hostname.remove_suffix(1);

    splitter labels(hostname);

    // Empty labels are not permitted, see:
    // https://github.com/publicsuffix/list/wiki/Format#definitions
    if (labels.contains_empty())
        return {};

    // Descend the rule tree label by label, starting from the rightmost
    // label, and remember the deepest rule that matches.
    const label_item* node = &root_;
    std::uint8_t matched_code = 0;
    std::size_t matched_ind = 0;
    std::string_view label;
    while (labels.next(label) && node->children) {
#ifdef __cpp_lib_generic_unordered_lookup
        const auto found = node->children->find(label);
#else
        const auto found = node->children->find(std::string{ label });
#endif
        if (found == node->children->end())
            break;

        const auto code = found->second.code;
        // A wildcard rule (kind 3) needs one more label to the left of the
        // current one; it cannot match if this is the leftmost label.
        const bool wildcard_without_label =
            (code & DIFF_MASK) == 3 && labels.at_end();
        if (code && !wildcard_without_label) {
            matched_code = code;
            matched_ind = labels.index();
        }
        node = &found->second;
    }

    if (matched_code == 0) {
        // Unlisted TLD: If no rules match, the prevailing rule is "*"
        matched_code = 2;
        matched_ind = labels.size() - 1; // index of rightmost label
    }

    // Number of labels to step to the left of the matched label: the rule
    // kind gives -1 (kind 1), 0 (kind 2) or +1 (kind 3), and the
    // registrable_domain option bit is added to it
    // (NOTE(review): presumably 1 when requested - confirm in the header).
    const int shift = static_cast<int>(matched_code & DIFF_MASK) - 2 +
        static_cast<int>(opt & option::registrable_domain);

    // Not enough labels to the left of the matched one
    if (shift > 0 && static_cast<std::size_t>(shift) > matched_ind)
        return {};

    // A negative shift relies on unsigned wrap-around to move to the right
    const auto ind = matched_ind - shift;
    if (ind < labels.size())
        return { ind, labels.get_pos_by_index(ind), matched_code };
    return {};
}
246+
247+
// Two lists are equal when their rule trees (root_) compare equal.
bool public_suffix_list::operator==(const public_suffix_list& other) const {
    return this->root_ == other.root_;
}
250+
251+
// Special member functions: all defaulted, but defined here rather than in
// the class definition.
// NOTE(review): presumably so the header can work with incomplete types -
// confirm against public_suffix_list.h.

// construction and destruction
public_suffix_list::public_suffix_list() = default;
public_suffix_list::~public_suffix_list() = default;

// move operations
public_suffix_list::public_suffix_list(public_suffix_list&&) noexcept = default;
public_suffix_list& public_suffix_list::operator=(public_suffix_list&&) noexcept = default;
255+
256+
} // namespace upa

0 commit comments

Comments
 (0)