Skip to content

Commit 3883a44

Browse files
Jules was unable to complete the task in time. Please review the work done so far and provide feedback for Jules to continue.
1 parent 1a54695 commit 3883a44

15 files changed

+336
-147
lines changed

src/functions/extract_domain.cpp

Lines changed: 39 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ namespace duckdb
3232
}
3333
catch (const std::exception &e)
3434
{
35-
result_data[i] = "Error extracting domain: " + std::string (e.what ());
35+
// Set NULL on error
36+
FlatVector::SetNull (result, i, true);
3637
}
3738
}
3839
}
@@ -45,18 +46,26 @@ namespace duckdb
4546
Connection con (db);
4647

4748
// Extract the host from the URL
48-
std::regex host_regex (R"(^(?:(?:https?|ftp|rsync):\/\/|mailto:)?((?:[^\/\?:#@]+@)?([^\/\?:#]+)))");
49+
// This regex captures the host, excluding protocol, path, query, fragment, and port.
50+
// It explicitly excludes '/', '\s', '#', '?', ':' from the host.
51+
std::regex host_regex (R"(^(?:(?:https?|ftp|rsync):\/\/)?([^\/\s#?:]+))");
4952
std::smatch host_match;
50-
if (!std::regex_search (input, host_match, host_regex))
53+
std::string host_str; // Use a separate string for the matched host
54+
55+
// Search for the host in the input string
56+
// No need for searchable_input, regex_search can take input directly
57+
if (std::regex_search (input, host_match, host_regex) && host_match.size () > 1)
5158
{
52-
return "";
59+
host_str = host_match[1].str ();
60+
}
61+
else
62+
{
63+
return ""; // No host found
5364
}
54-
55-
auto host = host_match[host_match.size () - 1].str ();
5665

5766
// Split the host into parts
5867
std::vector<std::string> parts;
59-
std::istringstream stream (host);
68+
std::istringstream stream (host_str);
6069
std::string part;
6170
while (std::getline (stream, part, '.'))
6271
{
@@ -65,8 +74,29 @@ namespace duckdb
6574

6675
// Find the longest matching public suffix
6776
std::string public_suffix;
68-
int public_suffix_index = -1;
69-
77+
int public_suffix_index = -1; // Using -1 to indicate no valid public suffix part found yet
78+
79+
// Iterate through all possible suffix combinations, from shortest to longest.
80+
// The goal is to find the longest known public suffix.
81+
// For example, for 'a.b.c.co.uk', it will test:
82+
// uk, co.uk, c.co.uk, b.c.co.uk, a.b.c.co.uk
83+
// If 'co.uk' is a public suffix, it will be matched.
84+
// If 'c.co.uk' is also a public suffix (e.g. *.sch.uk), that would be matched.
85+
// The last and longest match is chosen by breaking after the first DB match,
86+
// assuming suffixes are ordered or queried appropriately by the PSL logic.
87+
// However, the original loop structure implies checking all parts and
88+
// the longest one that is a PSL entry should be chosen.
89+
// The current logic takes the *first* match from the right that is a PSL entry.
90+
// Let's refine the comment to reflect the actual loop behavior.
91+
92+
// Iterate through parts of the hostname from right to left to find the longest public suffix.
93+
// For 'a.b.c.co.uk', it will form candidates:
94+
// 1. uk
95+
// 2. co.uk
96+
// 3. c.co.uk
97+
// 4. b.c.co.uk
98+
// 5. a.b.c.co.uk
99+
// It stops at the first and longest valid suffix found in the public_suffix_list.
70100
for (size_t j = 0; j < parts.size (); j++)
71101
{
72102
// Build the candidate suffix

src/functions/extract_extension.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@ namespace duckdb
2626
}
2727
catch (const std::exception &e)
2828
{
29-
result_data[i] = "Error extracting extension: " + std::string (e.what ());
29+
// Set NULL on error
30+
FlatVector::SetNull (result, i, true);
3031
}
3132
};
3233
}

src/functions/extract_host.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@ namespace duckdb
2626
}
2727
catch (const std::exception &e)
2828
{
29-
result_data[i] = "Error extracting host: " + std::string (e.what ());
29+
// Set NULL on error
30+
FlatVector::SetNull (result, i, true);
3031
}
3132
}
3233
}

src/functions/extract_path.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@ namespace duckdb
1515

1616
for (idx_t i = 0; i < args.size (); i++)
1717
{
18+
// Paths are often case-sensitive, so we don't convert to lowercase.
1819
auto input = input_vector.GetValue (i).ToString ();
19-
std::transform (input.begin (), input.end (), input.begin (), ::tolower);
2020

2121
try
2222
{
@@ -26,7 +26,8 @@ namespace duckdb
2626
}
2727
catch (const std::exception &e)
2828
{
29-
result_data[i] = "Error extracting path: " + std::string (e.what ());
29+
// Set NULL on error
30+
FlatVector::SetNull (result, i, true);
3031
}
3132
};
3233
}

src/functions/extract_port.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@ namespace duckdb
2626
}
2727
catch (const std::exception &e)
2828
{
29-
result_data[i] = "Error extracting port: " + std::string (e.what ());
29+
// Set NULL on error
30+
FlatVector::SetNull (result, i, true);
3031
}
3132
};
3233
}

src/functions/extract_query.cpp

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,8 @@ namespace duckdb
2525
}
2626
catch (const std::exception &e)
2727
{
28-
result_data[i] = "Error extracting query string: " + std::string (e.what ());
28+
// Set NULL on error
29+
FlatVector::SetNull (result, i, true);
2930
}
3031
};
3132
}
@@ -36,9 +37,13 @@ namespace duckdb
3637
{
3738
// Regex to match the query string component of a URL
3839
// Explanation:
39-
// (?:\?|&) - Non-capturing group to match either "?" (start of query) or "&" (query parameter separator)
40-
// ([^#]+) - Capturing group to match the query string (any characters except "#")
41-
std::regex query_regex (R"((?:\?|&)([^#]+))");
40+
// \? - Matches the literal '?' character.
41+
// ([^#]*) - Capturing group:
42+
// [^#] - Matches any character that is NOT a '#'.
43+
// * - Matches the previous character zero or more times.
44+
// This regex captures content after the first '?' up to a '#' or end of string.
45+
// Does not handle query parameters in fragments.
46+
std::regex query_regex (R"(\?([^#]*))");
4247
std::smatch query_match;
4348

4449
// Use regex_search to find the query string in the input

src/functions/extract_schema.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@ namespace duckdb
2626
}
2727
catch (const std::exception &e)
2828
{
29-
result_data[i] = "Error extracting schema: " + std::string (e.what ());
29+
// Set NULL on error
30+
FlatVector::SetNull (result, i, true);
3031
}
3132
};
3233
}

src/functions/extract_subdomain.cpp

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,8 @@ namespace duckdb
2828
}
2929
catch (const std::exception &e)
3030
{
31-
result_data[i] = "Error extracting subdomain: " + std::string (e.what ());
31+
// Set NULL on error
32+
FlatVector::SetNull (result, i, true);
3233
}
3334
}
3435
}
@@ -43,18 +44,25 @@ namespace duckdb
4344
Connection con (db);
4445

4546
// Extract the host from the URL
46-
std::regex host_regex (R"(^(?:(?:https?|ftp|rsync):\/\/)?([^\/\?:]+))");
47+
// This regex captures the host, excluding protocol, path, query, fragment, and port.
48+
// It explicitly excludes '/', '\s', '#', '?', ':' from the host.
49+
std::regex host_regex (R"(^(?:(?:https?|ftp|rsync):\/\/)?([^\/\s#?:]+))");
4750
std::smatch host_match;
48-
if (!std::regex_search (input, host_match, host_regex))
51+
std::string host_str;
52+
53+
// No need for searchable_input, regex_search can take input directly
54+
if (std::regex_search (input, host_match, host_regex) && host_match.size () > 1)
4955
{
50-
return "";
56+
host_str = host_match[1].str ();
57+
}
58+
else
59+
{
60+
return ""; // No host found
5161
}
52-
53-
auto host = host_match[1].str ();
5462

5563
// Split the host into parts
5664
std::vector<std::string> parts;
57-
std::istringstream stream (host);
65+
std::istringstream stream (host_str);
5866
std::string part;
5967
while (std::getline (stream, part, '.'))
6068
{
@@ -65,6 +73,14 @@ namespace duckdb
6573
std::string public_suffix;
6674
int public_suffix_index = -1;
6775

76+
// Iterate through all possible suffix combinations, from shortest to longest.
77+
// The goal is to find the longest known public suffix.
78+
// For example, for 'a.b.c.co.uk', it will test:
79+
// uk, co.uk, c.co.uk, b.c.co.uk, a.b.c.co.uk
80+
// If 'co.uk' is a public suffix, it will be matched.
81+
// If 'c.co.uk' is also a public suffix (e.g. *.sch.uk), that would be matched.
82+
// The last and longest match is chosen.
83+
// The current logic takes the *first* match from the right that is a PSL entry.
6884
for (size_t j = 0; j < parts.size (); j++)
6985
{
7086
// Build the candidate suffix

src/functions/extract_tld.cpp

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,8 @@ namespace duckdb
2828
}
2929
catch (const std::exception &e)
3030
{
31-
result_data[i] = "Error extracting tld: " + std::string (e.what ());
31+
// Set NULL on error
32+
FlatVector::SetNull (result, i, true);
3233
}
3334
}
3435
}
@@ -43,18 +44,25 @@ namespace duckdb
4344
Connection con (db);
4445

4546
// Extract the host from the URL
46-
std::regex host_regex (R"(^(?:(?:https?|ftp|rsync):\/\/)?([^\/\?:]+))");
47+
// This regex captures the host, excluding protocol, path, query, fragment, and port.
48+
// It explicitly excludes '/', '\s', '#', '?', ':' from the host.
49+
std::regex host_regex (R"(^(?:(?:https?|ftp|rsync):\/\/)?([^\/\s#?:]+))");
4750
std::smatch host_match;
48-
if (!std::regex_search (input, host_match, host_regex))
51+
std::string host_str;
52+
53+
// No need for searchable_input, regex_search can take input directly
54+
if (std::regex_search (input, host_match, host_regex) && host_match.size () > 1)
4955
{
50-
return "";
56+
host_str = host_match[1].str ();
57+
}
58+
else
59+
{
60+
return ""; // No host found
5161
}
52-
53-
auto host = host_match[1].str ();
5462

5563
// Split the host into parts
5664
std::vector<std::string> parts;
57-
std::istringstream stream (host);
65+
std::istringstream stream (host_str);
5866
std::string part;
5967
while (std::getline (stream, part, '.'))
6068
{
@@ -64,6 +72,14 @@ namespace duckdb
6472
// Find the longest matching public suffix
6573
std::string public_suffix;
6674

75+
// Iterate through all possible suffix combinations, from shortest to longest.
76+
// The goal is to find the longest known public suffix.
77+
// For example, for 'a.b.c.co.uk', it will test:
78+
// uk, co.uk, c.co.uk, b.c.co.uk, a.b.c.co.uk
79+
// If 'co.uk' is a public suffix, it will be matched.
80+
// If 'c.co.uk' is also a public suffix (e.g. *.sch.uk), that would be matched.
81+
// The last and longest match is chosen.
82+
// The current logic takes the *first* match from the right that is a PSL entry.
6783
for (size_t j = 0; j < parts.size (); j++)
6884
{
6985
// Build the candidate suffix

0 commit comments

Comments
 (0)