@@ -32,7 +32,8 @@ namespace duckdb
3232 }
3333 catch (const std::exception &e)
3434 {
35- result_data[i] = " Error extracting domain: " + std::string (e.what ());
35+ // Set NULL on error
36+ FlatVector::SetNull (result, i, true );
3637 }
3738 }
3839 }
@@ -45,18 +46,26 @@ namespace duckdb
4546 Connection con (db);
4647
4748 // Extract the host from the URL
48- std::regex host_regex (R"( ^(?:(?:https?|ftp|rsync):\/\/|mailto:)?((?:[^\/\?:#@]+@)?([^\/\?:#]+)))" );
49+ // This regex captures the host, excluding protocol, path, query, fragment, and port.
50+ // It explicitly excludes '/', '\s', '#', '?', ':' from the host.
51+ std::regex host_regex (R"( ^(?:(?:https?|ftp|rsync):\/\/)?([^\/\s#?:]+))" );
4952 std::smatch host_match;
50- if (!std::regex_search (input, host_match, host_regex))
53+ std::string host_str; // Use a separate string for the matched host
54+
55+ // Search for the host in the input string
56+ // No need for searchable_input, regex_search can take input directly
57+ if (std::regex_search (input, host_match, host_regex) && host_match.size () > 1 )
5158 {
52- return " " ;
59+ host_str = host_match[1 ].str ();
60+ }
61+ else
62+ {
63+ return " " ; // No host found
5364 }
54-
55- auto host = host_match[host_match.size () - 1 ].str ();
5665
5766 // Split the host into parts
5867 std::vector<std::string> parts;
59- std::istringstream stream (host );
68+ std::istringstream stream (host_str );
6069 std::string part;
6170 while (std::getline (stream, part, ' .' ))
6271 {
@@ -65,8 +74,29 @@ namespace duckdb
6574
6675 // Find the longest matching public suffix
6776 std::string public_suffix;
68- int public_suffix_index = -1 ;
69-
77+ int public_suffix_index = -1 ; // Using -1 to indicate no valid public suffix part found yet
78+
79+ // Iterate through all possible suffix combinations, from shortest to longest.
80+ // The goal is to find the longest known public suffix.
81+ // For example, for 'a.b.c.co.uk', it will test:
82+ // uk, co.uk, c.co.uk, b.c.co.uk, a.b.c.co.uk
83+ // If 'co.uk' is a public suffix, it will be matched.
84+ // If 'c.co.uk' is also a public suffix (e.g. *.sch.uk), that would be matched.
85+ // NOTE(review): earlier wording of this comment disagreed with itself about
86+ // whether the first matching suffix or the longest matching suffix is chosen.
87+ // The stated intent is that the longest candidate that is an entry in the
88+ // public suffix list should be selected.
89+ // Which candidate is tested first, and whether the loop stops at the first
90+ // match, is determined by the loop body below - confirm against that code.
91+
92+ // Iterate through parts of the hostname from right to left to find the longest public suffix.
93+ // For 'a.b.c.co.uk', it will form candidates:
94+ // 1. uk
95+ // 2. co.uk
96+ // 3. c.co.uk
97+ // 4. b.c.co.uk
98+ // 5. a.b.c.co.uk
99+ // It stops at the first and longest valid suffix found in the public_suffix_list.
70100 for (size_t j = 0 ; j < parts.size (); j++)
71101 {
72102 // Build the candidate suffix
0 commit comments