Skip to content

Commit 199c14a

Browse files
committed
fix: path extraction regex
1 parent d10b348 commit 199c14a

File tree

2 files changed

+10
-4
lines changed

2 files changed

+10
-4
lines changed

src/functions/extract_path.cpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,12 @@ namespace duckdb
3636
// Explanation:
3737
// ^ - Start of the string
3838
// (?: - Non-capturing group for the protocol and domain part
39-
// (?:(?:ftp|https?|rsync):\/\/)? - Optional ftp://, http://, https://, or rsync://
40-
// (?:[^\/\s]+) - Domain name (any characters except '/' or whitespace)
39+
// (?:(?:ftp|https?|rsync):\/\/)? - Optional protocol (ftp://, http://, https://, or rsync://)
40+
// (?:[^\/\s]+) - Domain name or IP address (any characters except '/' or whitespace)
4141
// )
42-
// (\/[^?#]*) - Capturing group for the path (starts with '/', followed by any characters except '?' or '#')
43-
std::regex path_regex (R"(^(?:(?:(?:ftp|https?|rsync):\/\/)?(?:[^\/\s]+))(\/[^?#]*))");
42+
// (\/[^?#]*)? - Optional capturing group for the path (starts with '/', followed by any characters except '?' or '#')
43+
// - The trailing '?' makes the path group optional, so the regex also matches URLs that have no path (the capture group is then left unmatched, i.e. empty)
44+
std::regex path_regex (R"(^(?:(?:(?:ftp|https?|rsync):\/\/)?(?:[^\/\s]+))(\/[^?#]*)?)");
4445
std::smatch path_match;
4546

4647
// Use regex_search to find the path component in the input string

test/sql/extract_path.test

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@ require netquack
77
statement ok
88
CREATE TABLE uri_list AS SELECT * FROM read_csv('test/data/examples.csv', header=false, columns={'uri': 'VARCHAR'});
99

10+
query I
11+
SELECT extract_path('http://example.com.ac');
12+
----
13+
/
14+
1015
query I
1116
SELECT extract_path('http://example.com.ac/path');
1217
----

0 commit comments

Comments
 (0)