Skip to content
Merged
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ test/python/__pycache__/
.Rhistory
*.log
*.csv
!test/data/tranco.csv
!test/data/*.csv
24 changes: 12 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -239,20 +239,20 @@ You can use this function to get the ranking of a domain:

```sql
D SELECT get_tranco_rank('microsoft.com') as rank;
┌───────┐
│ rank  │
│ int32 │
├───────┤
│     2 │
└───────┘
┌─────────┐
│  rank   │
│ varchar │
├─────────┤
│ 2       │
└─────────┘

D SELECT get_tranco_rank('cloudflare.com') as rank;
┌───────┐
│ rank  │
│ int32 │
├───────┤
│    13 │
└───────┘
┌─────────┐
│  rank   │
│ varchar │
├─────────┤
│ 13      │
└─────────┘
```

### Get Extension Version
Expand Down
23 changes: 14 additions & 9 deletions src/functions/extract_domain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,23 @@ namespace duckdb
{
// Extract the input from the arguments
auto &input_vector = args.data[0];
auto input = input_vector.GetValue (0).ToString ();
auto result_data = FlatVector::GetData<string_t> (result);

if (input.empty ())
for (idx_t i = 0; i < args.size (); i++)
{
result.SetValue (0, Value (""));
return;
}

// Extract the domain using the utility function
auto domain = netquack::ExtractDomain (state, input);
auto input = input_vector.GetValue (i).ToString ();

result.SetValue (0, Value (domain));
try
{
// Extract the domain using the utility function
auto domain = netquack::ExtractDomain (state, input);
result_data[i] = StringVector::AddString (result, domain);
}
catch (const std::exception &e)
{
result_data[i] = "Error extracting domain: " + std::string (e.what ());
}
}
}

namespace netquack
Expand Down
20 changes: 15 additions & 5 deletions src/functions/extract_host.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,23 @@ namespace duckdb
{
// Extract the input from the arguments
auto &input_vector = args.data[0];
auto input = input_vector.GetValue (0).ToString ();
auto result_data = FlatVector::GetData<string_t> (result);

// Extract the host using the utility function
auto host = netquack::ExtractHost (input);
for (idx_t i = 0; i < args.size (); i++)
{
auto input = input_vector.GetValue (i).ToString ();

// Set the result
result.SetValue (0, Value (host));
try
{
// Extract the host using the utility function
auto host = netquack::ExtractHost (input);
result_data[i] = StringVector::AddString (result, host);
}
catch (const std::exception &e)
{
result_data[i] = "Error extracting host: " + std::string (e.what ());
}
}
}

namespace netquack
Expand Down
29 changes: 20 additions & 9 deletions src/functions/extract_path.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,23 @@ namespace duckdb
{
// Extract the input from the arguments
auto &input_vector = args.data[0];
auto input = input_vector.GetValue (0).ToString ();
auto result_data = FlatVector::GetData<string_t> (result);

// Extract the path using the utility function
auto path = netquack::ExtractPath (input);
for (idx_t i = 0; i < args.size (); i++)
{
auto input = input_vector.GetValue (i).ToString ();

// Set the result
result.SetValue (0, Value (path));
try
{
// Extract the path using the utility function
auto path = netquack::ExtractPath (input);
result_data[i] = StringVector::AddString (result, path);
}
catch (const std::exception &e)
{
result_data[i] = "Error extracting path: " + std::string (e.what ());
}
};
}

namespace netquack
Expand All @@ -26,11 +36,12 @@ namespace duckdb
// Explanation:
// ^ - Start of the string
// (?: - Non-capturing group for the protocol and domain part
// (?:(?:ftp|https?|rsync):\/\/)? - Optional ftp://, http://, https://, or rsync://
// (?:[^\/\s]+) - Domain name (any characters except '/' or whitespace)
// (?:(?:ftp|https?|rsync):\/\/)? - Optional protocol (ftp://, http://, https://, or rsync://)
// (?:[^\/\s]+) - Domain name or IP address (any characters except '/' or whitespace)
// )
// (\/[^?#]*) - Capturing group for the path (starts with '/', followed by any characters except '?' or '#')
std::regex path_regex (R"(^(?:(?:(?:ftp|https?|rsync):\/\/)?(?:[^\/\s]+))(\/[^?#]*))");
// (\/[^?#]*)? - Optional capturing group for the path (starts with '/', followed by any characters except '?' or '#')
// - The '?' at the end makes the path component optional, allowing the regex to match URLs with or without a path
std::regex path_regex (R"(^(?:(?:(?:ftp|https?|rsync):\/\/)?(?:[^\/\s]+))(\/[^?#]*)?)");
std::smatch path_match;

// Use regex_search to find the path component in the input string
Expand Down
24 changes: 17 additions & 7 deletions src/functions/extract_query.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,25 @@ namespace duckdb
// Scalar-function callback that extracts the query string from a URL argument.
//
// args   - input chunk; only args.data[0] is read, and each value is converted
//          to a string before being handed to netquack::ExtractQueryString.
// state  - not referenced in the lines shown here.
// result - output vector; written through a string_t data pointer and through
//          Vector::SetValue.
//
// NOTE(review): this listing appears to be a rendered diff, so the row-0-only
// statements (GetValue (0) / SetValue (0, ...)) are shown together with the
// per-row loop — presumably the former are the removed lines. Confirm against
// the actual file before relying on the exact statement order below.
void ExtractQueryStringFunction (DataChunk &args, ExpressionState &state, Vector &result)
{
// Extract the URL from the input
auto &url_vector = args.data[0];
auto url = url_vector.GetValue (0).ToString (); // reads row 0 only
// Extract the input from the arguments
auto &input_vector = args.data[0];
// Typed pointer into the result vector, indexed by row in the loop below
auto result_data = FlatVector::GetData<string_t> (result);

// Extract the query string
auto query_string = netquack::ExtractQueryString (url);
// Process every row of the chunk
for (idx_t i = 0; i < args.size (); i++)
{
auto input = input_vector.GetValue (i).ToString ();

// Set the result
result.SetValue (0, Value (query_string)); // writes row 0 regardless of i
try
{
// Extract the query string using the utility function
// (this local query_string shadows the function-scope one above)
auto query_string = netquack::ExtractQueryString (input);
// AddString is called with the result vector, then the returned string_t is stored for row i
result_data[i] = StringVector::AddString (result, query_string);
}
catch (const std::exception &e)
{
// The error text is stored as the row's value instead of propagating the exception.
// NOTE(review): unlike the success path, this assigns the concatenated temporary
// directly rather than going through StringVector::AddString — confirm that
// string_t does not keep a pointer into the temporary (TODO verify).
result_data[i] = "Error extracting query string: " + std::string (e.what ());
}
};
}

namespace netquack
Expand Down
20 changes: 15 additions & 5 deletions src/functions/extract_schema.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,23 @@ namespace duckdb
{
// Extract the input from the arguments
auto &input_vector = args.data[0];
auto input = input_vector.GetValue (0).ToString ();
auto result_data = FlatVector::GetData<string_t> (result);

// Extract the schema using the utility function
auto schema = netquack::ExtractSchema (input);
for (idx_t i = 0; i < args.size (); i++)
{
auto input = input_vector.GetValue (i).ToString ();

// Set the result
result.SetValue (0, Value (schema));
try
{
// Extract the schema using the utility function
auto schema = netquack::ExtractSchema (input);
result_data[i] = StringVector::AddString (result, schema);
}
catch (const std::exception &e)
{
result_data[i] = "Error extracting schema: " + std::string (e.what ());
}
};
}

namespace netquack
Expand Down
19 changes: 15 additions & 4 deletions src/functions/extract_subdomain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,23 @@ namespace duckdb
{
// Extract the input from the arguments
auto &input_vector = args.data[0];
auto input = input_vector.GetValue (0).ToString ();
auto result_data = FlatVector::GetData<string_t> (result);

// Extract the sub-domain using the utility function
auto subdomain = netquack::ExtractSubDomain (state, input);
for (idx_t i = 0; i < args.size (); i++)
{
auto input = input_vector.GetValue (i).ToString ();

result.SetValue (0, Value (subdomain));
try
{
// Extract the subdomain using the utility function
auto subdomain = netquack::ExtractSubDomain (state, input);
result_data[i] = StringVector::AddString (result, subdomain);
}
catch (const std::exception &e)
{
result_data[i] = "Error extracting subdomain: " + std::string (e.what ());
}
}
}

namespace netquack
Expand Down
20 changes: 15 additions & 5 deletions src/functions/extract_tld.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,23 @@ namespace duckdb
{
// Extract the input from the arguments
auto &input_vector = args.data[0];
auto input = input_vector.GetValue (0).ToString ();
auto result_data = FlatVector::GetData<string_t> (result);

// Extract the top-level domain using the utility function
auto tld = netquack::ExtractTLD (state, input);
for (idx_t i = 0; i < args.size (); i++)
{
auto input = input_vector.GetValue (i).ToString ();

// Set the result
result.SetValue (0, Value (tld));
try
{
// Extract the top-level domain using the utility function
auto tld = netquack::ExtractTLD (state, input);
result_data[i] = StringVector::AddString (result, tld);
}
catch (const std::exception &e)
{
result_data[i] = "Error extracting tld: " + std::string (e.what ());
}
}
}

namespace netquack
Expand Down
25 changes: 20 additions & 5 deletions src/functions/get_tranco.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,13 +148,28 @@ namespace duckdb
throw std::runtime_error ("Tranco table not found. Download it first using `SELECT update_tranco(true);`");
}

auto &domain_vector = args.data[0];
auto domain = domain_vector.GetValue (0).ToString ();
// Extract the input from the arguments
auto &input_vector = args.data[0];
auto result_data = FlatVector::GetData<string_t> (result);

auto query = "SELECT rank FROM tranco_list WHERE domain = '" + domain + "'";
auto query_result = con.Query (query);
for (idx_t i = 0; i < args.size (); i++)
{
auto input = input_vector.GetValue (i).ToString ();

try
{
auto query = "SELECT rank FROM tranco_list WHERE domain = '" + input + "'";

auto query_result = con.Query (query);
auto rank = query_result->RowCount () > 0 ? query_result->GetValue (0, 0) : Value ();

result.SetValue (0, query_result->RowCount () > 0 ? query_result->GetValue (0, 0) : Value ());
result_data[i] = StringVector::AddString (result, rank.ToString ());
}
catch (const std::exception &e)
{
result_data[i] = "Error extracting tranco rank: " + std::string (e.what ());
}
}
}
} // namespace netquack
} // namespace duckdb
2 changes: 1 addition & 1 deletion src/netquack_extension.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ namespace duckdb
auto get_tranco_rank_function = ScalarFunction (
"get_tranco_rank",
{ LogicalType::VARCHAR },
LogicalType::INTEGER,
LogicalType::VARCHAR,
netquack::GetTrancoRankFunction);
ExtensionUtil::RegisterFunction (instance, get_tranco_rank_function);

Expand Down
9 changes: 9 additions & 0 deletions test/data/examples.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
example.com
example.com.ac
example.com.co
a.example.com
example.com/a
example.com.ac/a
https://example.com
https://a.example.com
http://example.com.ac/path/?a=1&b=2&
4 changes: 4 additions & 0 deletions test/data/examples_tranco.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
microsoft.com
googleapis.com
gstatic.com
apple.com
18 changes: 17 additions & 1 deletion test/sql/extract_domain.test
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@

require netquack

statement ok
CREATE TABLE uri_list AS SELECT * FROM read_csv('test/data/examples.csv', header=false, columns={'uri': 'VARCHAR'});

query I
SELECT extract_domain('example.com');
----
Expand Down Expand Up @@ -172,4 +175,17 @@ SELECT extract_domain('http:/example.com.ac/path');
query I
SELECT extract_domain('http:/example.com.ac:443/path');
----
(empty)
(empty)

query I
SELECT extract_domain(uri) from uri_list;
----
example.com
example.com.ac
example.com.co
example.com
example.com
example.com.ac
example.com
example.com
example.com.ac
16 changes: 16 additions & 0 deletions test/sql/extract_host.test
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@

require netquack

statement ok
CREATE TABLE uri_list AS SELECT * FROM read_csv('test/data/examples.csv', header=false, columns={'uri': 'VARCHAR'});

query I
SELECT extract_host('http://example.com.ac/path');
----
Expand Down Expand Up @@ -63,3 +66,16 @@ query I
SELECT extract_host('rsync://rpki.example.com/path');
----
rpki.example.com

query I
SELECT extract_host(uri) from uri_list;
----
example.com
example.com.ac
example.com.co
a.example.com
example.com
example.com.ac
example.com
a.example.com
example.com.ac
Loading
Loading