Skip to content

Commit 4b1f89f

Browse files
committed
feat: add extract_domain function
1 parent f73f9ee commit 4b1f89f

File tree

2 files changed

+181
-52
lines changed

2 files changed

+181
-52
lines changed

CMakeLists.txt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ set(TARGET_NAME netquack)
66
# DuckDB's extension distribution supports vcpkg. As such, dependencies can be added in ./vcpkg.json and then
77
# used in cmake with find_package. Feel free to remove or replace with other dependencies.
88
# Note that it should also be removed from vcpkg.json to prevent needlessly installing it.
9-
find_package(OpenSSL REQUIRED)
9+
find_package(CURL REQUIRED)
1010

1111
set(EXTENSION_NAME ${TARGET_NAME}_extension)
1212
set(LOADABLE_EXTENSION_NAME ${TARGET_NAME}_loadable_extension)
@@ -19,9 +19,9 @@ set(EXTENSION_SOURCES src/netquack_extension.cpp)
1919
build_static_extension(${TARGET_NAME} ${EXTENSION_SOURCES})
2020
build_loadable_extension(${TARGET_NAME} " " ${EXTENSION_SOURCES})
2121

22-
# Link OpenSSL in both the static library as the loadable extension
23-
target_link_libraries(${EXTENSION_NAME} OpenSSL::SSL OpenSSL::Crypto)
24-
target_link_libraries(${LOADABLE_EXTENSION_NAME} OpenSSL::SSL OpenSSL::Crypto)
22+
# Link libcurl into both the static library and the loadable extension
23+
target_link_libraries(${EXTENSION_NAME} CURL::libcurl)
24+
target_link_libraries(${LOADABLE_EXTENSION_NAME} CURL::libcurl)
2525

2626
install(
2727
TARGETS ${EXTENSION_NAME}

src/netquack_extension.cpp

Lines changed: 177 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -7,70 +7,199 @@
77
#include "duckdb/function/scalar_function.hpp"
88
#include "duckdb/main/extension_util.hpp"
99
#include <duckdb/parser/parsed_data/create_scalar_function_info.hpp>
10+
#include <regex>
11+
#include <fstream>
12+
#include <sstream>
13+
#include <curl/curl.h>
1014

11-
// OpenSSL linked through vcpkg
12-
#include <openssl/opensslv.h>
15+
namespace duckdb
16+
{
1317

14-
namespace duckdb {
// libcurl write callback: appends the bytes just received to the std::string
// passed in through `userp`.
//
// contents: pointer to the received bytes
// size:     size of one element
// nmemb:    number of elements; the chunk is size * nmemb bytes long
// userp:    pointer to the std::string that accumulates the response body
//
// Returns the number of bytes appended (size * nmemb).
static size_t WriteCallback(void *contents, size_t size, size_t nmemb, void *userp)
{
    const size_t total_bytes = size * nmemb;
    auto *buffer = static_cast<std::string *>(userp);
    buffer->append(static_cast<const char *>(contents), total_bytes);
    return total_bytes;
}
1524

16-
inline void NetquackScalarFun(DataChunk &args, ExpressionState &state, Vector &result) {
17-
auto &name_vector = args.data[0];
18-
UnaryExecutor::Execute<string_t, string_t>(
19-
name_vector, result, args.size(),
20-
[&](string_t name) {
21-
return StringVector::AddString(result, "Netquack "+name.GetString()+" 🐥");;
22-
});
23-
}
25+
static std::string DownloadPublicSuffixList()
26+
{
27+
CURL *curl;
28+
CURLcode res;
29+
std::string readBuffer;
2430

25-
inline void NetquackOpenSSLVersionScalarFun(DataChunk &args, ExpressionState &state, Vector &result) {
26-
auto &name_vector = args.data[0];
27-
UnaryExecutor::Execute<string_t, string_t>(
28-
name_vector, result, args.size(),
29-
[&](string_t name) {
30-
return StringVector::AddString(result, "Netquack " + name.GetString() +
31-
", my linked OpenSSL version is " +
32-
OPENSSL_VERSION_TEXT );;
33-
});
34-
}
31+
curl = curl_easy_init();
32+
if (curl)
33+
{
34+
curl_easy_setopt(curl, CURLOPT_URL, "https://publicsuffix.org/list/public_suffix_list.dat");
35+
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback);
36+
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &readBuffer);
37+
res = curl_easy_perform(curl);
38+
curl_easy_cleanup(curl);
3539

36-
static void LoadInternal(DatabaseInstance &instance) {
37-
// Register a scalar function
38-
auto netquack_scalar_function = ScalarFunction("netquack", {LogicalType::VARCHAR}, LogicalType::VARCHAR, NetquackScalarFun);
39-
ExtensionUtil::RegisterFunction(instance, netquack_scalar_function);
40+
if (res != CURLE_OK)
41+
{
42+
throw std::runtime_error("Failed to download public suffix list.");
43+
}
44+
}
4045

41-
// Register another scalar function
42-
auto netquack_openssl_version_scalar_function = ScalarFunction("netquack_openssl_version", {LogicalType::VARCHAR},
43-
LogicalType::VARCHAR, NetquackOpenSSLVersionScalarFun);
44-
ExtensionUtil::RegisterFunction(instance, netquack_openssl_version_scalar_function);
45-
}
46+
return readBuffer;
47+
}
4648

47-
void NetquackExtension::Load(DuckDB &db) {
48-
LoadInternal(*db.instance);
49-
}
50-
std::string NetquackExtension::Name() {
51-
return "netquack";
52-
}
49+
// Function to parse the public suffix list and store it in a table
50+
static void LoadPublicSuffixList(DatabaseInstance &db)
51+
{
52+
// Check if the table already exists
53+
Connection con(db);
54+
auto table_exists = con.Query("SELECT 1 FROM information_schema.tables WHERE table_name = 'public_suffix_list'");
55+
56+
if (table_exists->RowCount() == 0)
57+
{
58+
// Download the list
59+
auto list_data = DownloadPublicSuffixList();
60+
61+
// Parse the list and insert into a table
62+
std::istringstream stream(list_data);
63+
std::string line;
64+
con.Query("CREATE TABLE public_suffix_list (suffix VARCHAR)");
65+
66+
while (std::getline(stream, line))
67+
{
68+
// Skip comments and empty lines
69+
if (line.empty() || line[0] == '/' || line[0] == ' ')
70+
continue;
71+
72+
// Replace `*.` with an empty string
73+
size_t wildcard_pos = line.find("*.");
74+
if (wildcard_pos != std::string::npos)
75+
{
76+
line.replace(wildcard_pos, 2, "");
77+
}
78+
79+
// Insert the suffix into the table
80+
con.Query("INSERT INTO public_suffix_list (suffix) VALUES ('" + line + "')");
81+
}
82+
}
83+
}
84+
85+
// Function to extract the main domain from a URL
86+
static void ExtractDomainFunction(DataChunk &args, ExpressionState &state, Vector &result)
87+
{
88+
// Load the public suffix list if not already loaded
89+
auto &db = *state.GetContext().db;
90+
LoadPublicSuffixList(db);
91+
Connection con(db);
92+
93+
// Extract the URL from the input
94+
auto &url_vector = args.data[0];
95+
auto result_data = FlatVector::GetData<string_t>(result);
96+
97+
for (idx_t i = 0; i < args.size(); i++)
98+
{
99+
auto url = url_vector.GetValue(i).ToString();
100+
101+
// Extract the host from the URL
102+
std::regex host_regex(R"(^(?:https?:\/\/)?([^\/\?:]+))");
103+
std::smatch host_match;
104+
if (!std::regex_search(url, host_match, host_regex))
105+
{
106+
result_data[i] = StringVector::AddString(result, "");
107+
continue;
108+
}
53109

54-
std::string NetquackExtension::Version() const {
110+
auto host = host_match[1].str();
111+
112+
// Split the host into parts
113+
std::vector<std::string> parts;
114+
std::istringstream stream(host);
115+
std::string part;
116+
while (std::getline(stream, part, '.'))
117+
{
118+
parts.push_back(part);
119+
}
120+
121+
// Find the longest matching public suffix
122+
std::string public_suffix;
123+
int public_suffix_index = -1;
124+
125+
for (int j = 0; j < parts.size(); j++)
126+
{
127+
// Build the candidate suffix
128+
std::string candidate;
129+
for (int k = j; k < parts.size(); k++)
130+
{
131+
candidate += (k == j ? "" : ".") + parts[k];
132+
}
133+
134+
// Query the public suffix list
135+
auto query = "SELECT 1 FROM public_suffix_list WHERE suffix = '" + candidate + "'";
136+
auto query_result = con.Query(query);
137+
138+
if (query_result->RowCount() > 0)
139+
{
140+
public_suffix = candidate;
141+
public_suffix_index = j;
142+
break;
143+
}
144+
}
145+
146+
// Determine the main domain
147+
std::string domain;
148+
if (!public_suffix.empty() && public_suffix_index > 0)
149+
{
150+
// Combine the part before the public suffix with the public suffix
151+
domain = parts[public_suffix_index - 1] + "." + public_suffix;
152+
}
153+
else if (!public_suffix.empty())
154+
{
155+
// No part before the suffix, use the public suffix only
156+
domain = public_suffix;
157+
}
158+
159+
result_data[i] = StringVector::AddString(result, domain);
160+
}
161+
}
162+
163+
static void LoadInternal(DatabaseInstance &instance)
164+
{
165+
auto insight_extract_domain_function = ScalarFunction("extract_domain", {LogicalType::VARCHAR},
166+
LogicalType::VARCHAR, ExtractDomainFunction);
167+
ExtensionUtil::RegisterFunction(instance, insight_extract_domain_function);
168+
}
169+
170+
void NetquackExtension::Load(DuckDB &db)
171+
{
172+
LoadInternal(*db.instance);
173+
}
174+
std::string NetquackExtension::Name()
175+
{
176+
return "netquack";
177+
}
178+
179+
std::string NetquackExtension::Version() const
180+
{
55181
#ifdef EXT_VERSION_NETQUACK
56-
return EXT_VERSION_NETQUACK;
182+
return EXT_VERSION_NETQUACK;
57183
#else
58-
return "";
184+
return "";
59185
#endif
60-
}
186+
}
61187

62188
} // namespace duckdb
63189

64-
extern "C" {
190+
extern "C"
191+
{
65192

66-
DUCKDB_EXTENSION_API void netquack_init(duckdb::DatabaseInstance &db) {
67-
duckdb::DuckDB db_wrapper(db);
68-
db_wrapper.LoadExtension<duckdb::NetquackExtension>();
69-
}
193+
DUCKDB_EXTENSION_API void netquack_init(duckdb::DatabaseInstance &db)
194+
{
195+
duckdb::DuckDB db_wrapper(db);
196+
db_wrapper.LoadExtension<duckdb::NetquackExtension>();
197+
}
70198

71-
DUCKDB_EXTENSION_API const char *netquack_version() {
72-
return duckdb::DuckDB::LibraryVersion();
73-
}
199+
DUCKDB_EXTENSION_API const char *netquack_version()
200+
{
201+
return duckdb::DuckDB::LibraryVersion();
202+
}
74203
}
75204

76205
#ifndef DUCKDB_EXTENSION_MAIN

0 commit comments

Comments
 (0)