diff --git a/.gitignore b/.gitignore index fee950c..90e2c07 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,4 @@ test/python/__pycache__/ .Rhistory *.log *.csv -!test/data/tranco.csv +!test/data/*.csv diff --git a/README.md b/README.md index 738005f..f134c5e 100644 --- a/README.md +++ b/README.md @@ -239,20 +239,20 @@ You can use this function to get the ranking of a domain: ```sql D SELECT get_tranco_rank('microsoft.com') as rank; -┌───────┐ -│ rank │ -│ int32 │ -├───────┤ -│ 2 │ -└───────┘ +┌─────────┐ +│ rank │ +│ varchar │ +├─────────┤ +│ 2 │ +└─────────┘ D SELECT get_tranco_rank('cloudflare.com') as rank; -┌───────┐ -│ rank │ -│ int32 │ -├───────┤ -│ 13 │ -└───────┘ +┌─────────┐ +│ rank │ +│ varchar │ +├─────────┤ +│ 13 │ +└─────────┘ ``` ### Get Extension Version diff --git a/src/functions/extract_domain.cpp b/src/functions/extract_domain.cpp index 67e5b82..f42aaed 100644 --- a/src/functions/extract_domain.cpp +++ b/src/functions/extract_domain.cpp @@ -11,18 +11,23 @@ namespace duckdb { // Extract the input from the arguments auto &input_vector = args.data[0]; - auto input = input_vector.GetValue (0).ToString (); + auto result_data = FlatVector::GetData<string_t> (result); - if (input.empty ()) + for (idx_t i = 0; i < args.size (); i++) { - result.SetValue (0, Value ("")); - return; - } - - // Extract the domain using the utility function - auto domain = netquack::ExtractDomain (state, input); + auto input = input_vector.GetValue (i).ToString (); - result.SetValue (0, Value (domain)); + try + { + // Extract the domain using the utility function + auto domain = netquack::ExtractDomain (state, input); + result_data[i] = StringVector::AddString (result, domain); + } + catch (const std::exception &e) + { + result_data[i] = StringVector::AddString (result, "Error extracting domain: " + std::string (e.what ())); + } + } } namespace netquack diff --git a/src/functions/extract_host.cpp b/src/functions/extract_host.cpp index c47fd8b..2d7678b 100644 --- a/src/functions/extract_host.cpp +++
b/src/functions/extract_host.cpp @@ -9,13 +9,23 @@ namespace duckdb { // Extract the input from the arguments auto &input_vector = args.data[0]; - auto input = input_vector.GetValue (0).ToString (); + auto result_data = FlatVector::GetData<string_t> (result); - // Extract the host using the utility function - auto host = netquack::ExtractHost (input); + for (idx_t i = 0; i < args.size (); i++) + { + auto input = input_vector.GetValue (i).ToString (); - // Set the result - result.SetValue (0, Value (host)); + try + { + // Extract the host using the utility function + auto host = netquack::ExtractHost (input); + result_data[i] = StringVector::AddString (result, host); + } + catch (const std::exception &e) + { + result_data[i] = StringVector::AddString (result, "Error extracting host: " + std::string (e.what ())); + } + } } namespace netquack diff --git a/src/functions/extract_path.cpp b/src/functions/extract_path.cpp index 1dcffe3..ac794e5 100644 --- a/src/functions/extract_path.cpp +++ b/src/functions/extract_path.cpp @@ -9,13 +9,23 @@ namespace duckdb { // Extract the input from the arguments auto &input_vector = args.data[0]; - auto input = input_vector.GetValue (0).ToString (); + auto result_data = FlatVector::GetData<string_t> (result); - // Extract the path using the utility function - auto path = netquack::ExtractPath (input); + for (idx_t i = 0; i < args.size (); i++) + { + auto input = input_vector.GetValue (i).ToString (); - // Set the result - result.SetValue (0, Value (path)); + try + { + // Extract the path using the utility function + auto path = netquack::ExtractPath (input); + result_data[i] = StringVector::AddString (result, path); + } + catch (const std::exception &e) + { + result_data[i] = StringVector::AddString (result, "Error extracting path: " + std::string (e.what ())); + } + } } namespace netquack @@ -26,11 +36,12 @@ namespace duckdb // Explanation: // ^ - Start of the string // (?: - Non-capturing group for the protocol and domain part - // (?:(?:ftp|https?|rsync):\/\/)?
- Optional ftp://, http://, https://, or rsync:// - // (?:[^\/\s]+) - Domain name (any characters except '/' or whitespace) + // (?:(?:ftp|https?|rsync):\/\/)? - Optional protocol (ftp://, http://, https://, or rsync://) + // (?:[^\/\s]+) - Domain name or IP address (any characters except '/' or whitespace) // ) - // (\/[^?#]*) - Capturing group for the path (starts with '/', followed by any characters except '?' or '#') - std::regex path_regex (R"(^(?:(?:(?:ftp|https?|rsync):\/\/)?(?:[^\/\s]+))(\/[^?#]*))"); + // (\/[^?#]*)? - Optional capturing group for the path (starts with '/', followed by any characters except '?' or '#') + // - The '?' at the end makes the path component optional, allowing the regex to match URLs with or without a path + std::regex path_regex (R"(^(?:(?:(?:ftp|https?|rsync):\/\/)?(?:[^\/\s]+))(\/[^?#]*)?)"); std::smatch path_match; // Use regex_search to find the path component in the input string diff --git a/src/functions/extract_query.cpp b/src/functions/extract_query.cpp index 0498ea4..14d66d9 100644 --- a/src/functions/extract_query.cpp +++ b/src/functions/extract_query.cpp @@ -7,15 +7,25 @@ namespace duckdb // Function to extract the query string from a URL void ExtractQueryStringFunction (DataChunk &args, ExpressionState &state, Vector &result) { - // Extract the URL from the input - auto &url_vector = args.data[0]; - auto url = url_vector.GetValue (0).ToString (); + // Extract the input from the arguments + auto &input_vector = args.data[0]; + auto result_data = FlatVector::GetData<string_t> (result); - // Extract the query string - auto query_string = netquack::ExtractQueryString (url); + for (idx_t i = 0; i < args.size (); i++) + { + auto input = input_vector.GetValue (i).ToString (); - // Set the result - result.SetValue (0, Value (query_string)); + try + { + // Extract the query string using the utility function + auto query_string = netquack::ExtractQueryString (input); + result_data[i] = StringVector::AddString (result, query_string); +
} + catch (const std::exception &e) + { + result_data[i] = StringVector::AddString (result, "Error extracting query string: " + std::string (e.what ())); + } + } } namespace netquack diff --git a/src/functions/extract_schema.cpp b/src/functions/extract_schema.cpp index afe744c..e40c732 100644 --- a/src/functions/extract_schema.cpp +++ b/src/functions/extract_schema.cpp @@ -9,13 +9,23 @@ namespace duckdb { // Extract the input from the arguments auto &input_vector = args.data[0]; - auto input = input_vector.GetValue (0).ToString (); + auto result_data = FlatVector::GetData<string_t> (result); - // Extract the schema using the utility function - auto schema = netquack::ExtractSchema (input); + for (idx_t i = 0; i < args.size (); i++) + { + auto input = input_vector.GetValue (i).ToString (); - // Set the result - result.SetValue (0, Value (schema)); + try + { + // Extract the schema using the utility function + auto schema = netquack::ExtractSchema (input); + result_data[i] = StringVector::AddString (result, schema); + } + catch (const std::exception &e) + { + result_data[i] = StringVector::AddString (result, "Error extracting schema: " + std::string (e.what ())); + } + } } namespace netquack diff --git a/src/functions/extract_subdomain.cpp b/src/functions/extract_subdomain.cpp index be59fa4..bd08795 100644 --- a/src/functions/extract_subdomain.cpp +++ b/src/functions/extract_subdomain.cpp @@ -11,12 +11,23 @@ namespace duckdb { // Extract the input from the arguments auto &input_vector = args.data[0]; - auto input = input_vector.GetValue (0).ToString (); + auto result_data = FlatVector::GetData<string_t> (result); - // Extract the sub-domain using the utility function - auto subdomain = netquack::ExtractSubDomain (state, input); + for (idx_t i = 0; i < args.size (); i++) + { + auto input = input_vector.GetValue (i).ToString (); - result.SetValue (0, Value (subdomain)); + try + { + // Extract the subdomain using the utility function + auto subdomain = netquack::ExtractSubDomain (state, input); + result_data[i] = StringVector::AddString (result,
subdomain); + } + catch (const std::exception &e) + { + result_data[i] = StringVector::AddString (result, "Error extracting subdomain: " + std::string (e.what ())); + } + } } namespace netquack diff --git a/src/functions/extract_tld.cpp b/src/functions/extract_tld.cpp index d6c0552..6240f2c 100644 --- a/src/functions/extract_tld.cpp +++ b/src/functions/extract_tld.cpp @@ -11,13 +11,23 @@ namespace duckdb { // Extract the input from the arguments auto &input_vector = args.data[0]; - auto input = input_vector.GetValue (0).ToString (); + auto result_data = FlatVector::GetData<string_t> (result); - // Extract the top-level domain using the utility function - auto tld = netquack::ExtractTLD (state, input); + for (idx_t i = 0; i < args.size (); i++) + { + auto input = input_vector.GetValue (i).ToString (); - // Set the result - result.SetValue (0, Value (tld)); + try + { + // Extract the top-level domain using the utility function + auto tld = netquack::ExtractTLD (state, input); + result_data[i] = StringVector::AddString (result, tld); + } + catch (const std::exception &e) + { + result_data[i] = StringVector::AddString (result, "Error extracting tld: " + std::string (e.what ())); + } + } } namespace netquack diff --git a/src/functions/get_tranco.cpp b/src/functions/get_tranco.cpp index c1cc8e6..a26c72e 100644 --- a/src/functions/get_tranco.cpp +++ b/src/functions/get_tranco.cpp @@ -148,13 +148,28 @@ namespace duckdb throw std::runtime_error ("Tranco table not found.
Download it first using `SELECT update_tranco(true);`"); } - auto &domain_vector = args.data[0]; - auto domain = domain_vector.GetValue (0).ToString (); + // Extract the input from the arguments + auto &input_vector = args.data[0]; + auto result_data = FlatVector::GetData<string_t> (result); - auto query = "SELECT rank FROM tranco_list WHERE domain = '" + domain + "'"; - auto query_result = con.Query (query); + for (idx_t i = 0; i < args.size (); i++) + { + auto input = input_vector.GetValue (i).ToString (); + + try + { + auto query = "SELECT rank FROM tranco_list WHERE domain = '" + StringUtil::Replace (input, "'", "''") + "'"; + + auto query_result = con.Query (query); + auto rank = query_result->RowCount () > 0 ? query_result->GetValue (0, 0) : Value (); - result.SetValue (0, query_result->RowCount () > 0 ? query_result->GetValue (0, 0) : Value ()); + result_data[i] = StringVector::AddString (result, rank.ToString ()); + } + catch (const std::exception &e) + { + result_data[i] = StringVector::AddString (result, "Error extracting tranco rank: " + std::string (e.what ())); + } + } } } // namespace netquack } // namespace duckdb diff --git a/src/netquack_extension.cpp b/src/netquack_extension.cpp index baab985..7b9fa31 100644 --- a/src/netquack_extension.cpp +++ b/src/netquack_extension.cpp @@ -95,7 +95,7 @@ namespace duckdb auto get_tranco_rank_function = ScalarFunction ( "get_tranco_rank", { LogicalType::VARCHAR }, - LogicalType::INTEGER, + LogicalType::VARCHAR, netquack::GetTrancoRankFunction); ExtensionUtil::RegisterFunction (instance, get_tranco_rank_function); diff --git a/test/data/examples.csv b/test/data/examples.csv new file mode 100644 index 0000000..a1f8b95 --- /dev/null +++ b/test/data/examples.csv @@ -0,0 +1,9 @@ +example.com +example.com.ac +example.com.co +a.example.com +example.com/a +example.com.ac/a +https://example.com +https://a.example.com +http://example.com.ac/path/?a=1&b=2& \ No newline at end of file diff --git a/test/data/examples_tranco.csv b/test/data/examples_tranco.csv new file mode 100644 index
0000000..75329e0 --- /dev/null +++ b/test/data/examples_tranco.csv @@ -0,0 +1,4 @@ +microsoft.com +googleapis.com +gstatic.com +apple.com \ No newline at end of file diff --git a/test/sql/extract_domain.test b/test/sql/extract_domain.test index bfb78ef..263b32e 100644 --- a/test/sql/extract_domain.test +++ b/test/sql/extract_domain.test @@ -4,6 +4,9 @@ require netquack +statement ok +CREATE TABLE uri_list AS SELECT * FROM read_csv('test/data/examples.csv', header=false, columns={'uri': 'VARCHAR'}); + query I SELECT extract_domain('example.com'); ---- @@ -172,4 +175,17 @@ SELECT extract_domain('http:/example.com.ac/path'); query I SELECT extract_domain('http:/example.com.ac:443/path'); ---- -(empty) \ No newline at end of file +(empty) + +query I +SELECT extract_domain(uri) from uri_list; +---- +example.com +example.com.ac +example.com.co +example.com +example.com +example.com.ac +example.com +example.com +example.com.ac diff --git a/test/sql/extract_host.test b/test/sql/extract_host.test index 865a8d6..f832420 100644 --- a/test/sql/extract_host.test +++ b/test/sql/extract_host.test @@ -4,6 +4,9 @@ require netquack +statement ok +CREATE TABLE uri_list AS SELECT * FROM read_csv('test/data/examples.csv', header=false, columns={'uri': 'VARCHAR'}); + query I SELECT extract_host('http://example.com.ac/path'); ---- @@ -63,3 +66,16 @@ query I SELECT extract_host('rsync://rpki.example.com/path'); ---- rpki.example.com + +query I +SELECT extract_host(uri) from uri_list; +---- +example.com +example.com.ac +example.com.co +a.example.com +example.com +example.com.ac +example.com +a.example.com +example.com.ac diff --git a/test/sql/extract_path.test b/test/sql/extract_path.test index 9724565..aaf3d0a 100644 --- a/test/sql/extract_path.test +++ b/test/sql/extract_path.test @@ -4,6 +4,14 @@ require netquack +statement ok +CREATE TABLE uri_list AS SELECT * FROM read_csv('test/data/examples.csv', header=false, columns={'uri': 'VARCHAR'}); + +query I +SELECT 
extract_path('http://example.com.ac'); +---- +/ + query I SELECT extract_path('http://example.com.ac/path'); ---- @@ -63,3 +71,16 @@ query I SELECT extract_path('rsync://rpki.exmple.com/path/path'); ---- /path/path + +query I +SELECT extract_path(uri) from uri_list; +---- +/ +/ +/ +/ +/a +/a +/ +/ +/path/ diff --git a/test/sql/extract_query.test b/test/sql/extract_query.test index 39838dc..7389316 100644 --- a/test/sql/extract_query.test +++ b/test/sql/extract_query.test @@ -4,6 +4,9 @@ require netquack +statement ok +CREATE TABLE uri_list AS SELECT * FROM read_csv('test/data/examples.csv', header=false, columns={'uri': 'VARCHAR'}); + query I SELECT extract_query_string('http://example.com.ac?a=1'); ---- @@ -28,3 +31,16 @@ query I SELECT extract_query_string('http://example.com.ac/path/?a=1'); ---- a=1 + +query I +SELECT extract_query_string(uri) from uri_list; +---- +(empty) +(empty) +(empty) +(empty) +(empty) +(empty) +(empty) +(empty) +a=1&b=2& diff --git a/test/sql/extract_schema.test b/test/sql/extract_schema.test index f0aa5c0..4524784 100644 --- a/test/sql/extract_schema.test +++ b/test/sql/extract_schema.test @@ -4,6 +4,9 @@ require netquack +statement ok +CREATE TABLE uri_list AS SELECT * FROM read_csv('test/data/examples.csv', header=false, columns={'uri': 'VARCHAR'}); + # ---------------------------- HTTP/S ---------------------------- query I @@ -195,3 +198,16 @@ query I SELECT extract_schema('sms:a'); ---- sms + +query I +SELECT extract_schema(uri) from uri_list; +---- +(empty) +(empty) +(empty) +(empty) +(empty) +(empty) +https +https +http diff --git a/test/sql/extract_subdomain.test b/test/sql/extract_subdomain.test index 2f1700f..3171710 100644 --- a/test/sql/extract_subdomain.test +++ b/test/sql/extract_subdomain.test @@ -4,6 +4,9 @@ require netquack +statement ok +CREATE TABLE uri_list AS SELECT * FROM read_csv('test/data/examples.csv', header=false, columns={'uri': 'VARCHAR'}); + query I SELECT extract_subdomain('a.example.com'); ---- @@ -123,3 
+126,16 @@ query I SELECT extract_subdomain('http:/example.com.ac:443/path'); ---- (empty) + +query I +SELECT extract_subdomain(uri) from uri_list; +---- +(empty) +(empty) +(empty) +a +(empty) +(empty) +(empty) +a +(empty) diff --git a/test/sql/extract_tld.test b/test/sql/extract_tld.test index 2663094..fe71633 100644 --- a/test/sql/extract_tld.test +++ b/test/sql/extract_tld.test @@ -4,6 +4,9 @@ require netquack +statement ok +CREATE TABLE uri_list AS SELECT * FROM read_csv('test/data/examples.csv', header=false, columns={'uri': 'VARCHAR'}); + query I SELECT extract_tld('example.com'); ---- @@ -174,3 +177,16 @@ query I SELECT extract_tld('http:/example.com.ac:443/path'); ---- (empty) + +query I +SELECT extract_tld(uri) from uri_list; +---- +com +com.ac +com.co +com +com +com.ac +com +com +com.ac diff --git a/test/sql/get_tranco_rank.test b/test/sql/get_tranco_rank.test index 2cb8869..52c6bbe 100644 --- a/test/sql/get_tranco_rank.test +++ b/test/sql/get_tranco_rank.test @@ -7,6 +7,9 @@ require netquack statement ok CREATE TABLE tranco_list AS SELECT * FROM read_csv('test/data/tranco.csv', header=false, columns={'rank': 'INTEGER', 'domain': 'VARCHAR'}); +statement ok +CREATE TABLE uri_list AS SELECT * FROM read_csv('test/data/examples_tranco.csv', header=false, columns={'uri': 'VARCHAR'}); + query I SELECT COUNT(*) FROM tranco_list; ---- @@ -26,3 +29,11 @@ query I SELECT get_tranco_rank('notfound.com'); ---- NULL + +query I +SELECT get_tranco_rank(uri) from uri_list; +---- +2 +10 +19 +7