Skip to content

Commit dbfa135

Browse files
committed
fix: support more scenarios for extract_domain
1 parent 665e0fe commit dbfa135

File tree

2 files changed

+85
-2
lines changed

2 files changed

+85
-2
lines changed

src/functions/extract_domain.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,14 +41,14 @@ namespace duckdb
4141
Connection con (db);
4242

4343
// Extract the host from the URL
44-
std::regex host_regex (R"(^(?:(?:https?|ftp|rsync):\/\/)?([^\/\?:]+))");
44+
std::regex host_regex (R"(^(?:(?:https?|ftp|rsync):\/\/|mailto:)?((?:[^\/\?:#@]+@)?([^\/\?:#]+)))");
4545
std::smatch host_match;
4646
if (!std::regex_search (input, host_match, host_regex))
4747
{
4848
return "";
4949
}
5050

51-
auto host = host_match[1].str ();
51+
auto host = host_match[host_match.size () - 1].str ();
5252

5353
// Split the host into parts
5454
std::vector<std::string> parts;

test/sql/extract_domain.test

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,3 +194,86 @@ example.com.ac
194194
example.com
195195
example.com
196196
example.com.ac
197+
198+
# Test IP addresses
199+
query I
200+
SELECT extract_domain('http://192.168.1.1');
201+
----
202+
(empty)
203+
204+
query I
205+
SELECT extract_domain('http://192.168.1.1:8080');
206+
----
207+
(empty)
208+
209+
query I
210+
SELECT extract_domain('192.168.1.1');
211+
----
212+
(empty)
213+
214+
# Test IPv6 addresses
215+
query I
216+
SELECT extract_domain('http://[2001:0db8:85a3:0000:0000:8a2e:0370:7334]');
217+
----
218+
(empty)
219+
220+
query I
221+
SELECT extract_domain('[2001:0db8:85a3:0000:0000:8a2e:0370:7334]');
222+
----
223+
(empty)
224+
225+
# Test URLs with query parameters
226+
query I
227+
SELECT extract_domain('https://example.com?param=value');
228+
----
229+
example.com
230+
231+
query I
232+
SELECT extract_domain('https://example.com/path?param=value');
233+
----
234+
example.com
235+
236+
# Test URLs with fragments
237+
query I
238+
SELECT extract_domain('https://example.com#section');
239+
----
240+
example.com
241+
242+
query I
243+
SELECT extract_domain('https://example.com/path#section');
244+
----
245+
example.com
246+
247+
# Test combined query parameters and fragments
248+
query I
249+
SELECT extract_domain('https://example.com?param=value#section');
250+
----
251+
example.com
252+
253+
# Test special and edge cases
254+
query I
255+
SELECT extract_domain('');
256+
----
257+
(empty)
258+
259+
query I
260+
SELECT extract_domain(NULL);
261+
----
262+
NULL
263+
264+
query I
265+
SELECT extract_domain('localhost');
266+
----
267+
(empty)
268+
269+
# Test scheme with no authority
270+
query I
271+
SELECT extract_domain('file:///path/to/file');
272+
----
273+
(empty)
274+
275+
# Test mailto URLs
276+
query I
277+
SELECT extract_domain('mailto:user@example.com');
278+
----
279+
example.com

0 commit comments

Comments
 (0)