Skip to content

Commit 30d7602

Browse files
committed
style: configure .clang-format
[no ci]
1 parent ef75de7 commit 30d7602

23 files changed

+1280
-885
lines changed

.clang-format

Lines changed: 391 additions & 0 deletions
Large diffs are not rendered by default.

src/functions/extract_domain.cpp

Lines changed: 91 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -1,96 +1,97 @@
11
#include "extract_domain.hpp"
2-
#include "../utils/utils.hpp"
32

43
#include <regex>
54

5+
#include "../utils/utils.hpp"
6+
67
namespace duckdb
78
{
8-
// Function to extract the domain from a URL
9-
void ExtractDomainFunction(DataChunk &args, ExpressionState &state, Vector &result)
10-
{
11-
// Extract the input from the arguments
12-
auto &input_vector = args.data[0];
13-
auto input = input_vector.GetValue(0).ToString();
14-
15-
if (input.empty())
16-
{
17-
result.SetValue(0, Value(""));
18-
return;
19-
}
20-
21-
// Extract the domain using the utility function
22-
auto domain = netquack::ExtractDomain(state, input);
23-
24-
result.SetValue(0, Value(domain));
25-
}
26-
27-
namespace netquack
28-
{
29-
std::string ExtractDomain(ExpressionState &state, const std::string &input)
30-
{
31-
// Load the public suffix list if not already loaded
32-
auto &db = *state.GetContext().db;
33-
netquack::LoadPublicSuffixList(db, false);
34-
Connection con(db);
35-
36-
// Extract the host from the URL
37-
std::regex host_regex(R"(^(?:https?:\/\/)?([^\/\?:]+))");
38-
std::smatch host_match;
39-
if (!std::regex_search(input, host_match, host_regex))
40-
{
41-
return "";
42-
}
43-
44-
auto host = host_match[1].str();
45-
46-
// Split the host into parts
47-
std::vector<std::string> parts;
48-
std::istringstream stream(host);
49-
std::string part;
50-
while (std::getline(stream, part, '.'))
51-
{
52-
parts.push_back(part);
53-
}
54-
55-
// Find the longest matching public suffix
56-
std::string public_suffix;
57-
int public_suffix_index = -1;
58-
59-
for (int j = 0; j < parts.size(); j++)
60-
{
61-
// Build the candidate suffix
62-
std::string candidate;
63-
for (int k = j; k < parts.size(); k++)
64-
{
65-
candidate += (k == j ? "" : ".") + parts[k];
66-
}
67-
68-
// Query the public suffix list
69-
auto query = "SELECT 1 FROM public_suffix_list WHERE suffix = '" + candidate + "'";
70-
auto query_result = con.Query(query);
71-
72-
if (query_result->RowCount() > 0)
73-
{
74-
public_suffix = candidate;
75-
public_suffix_index = j;
76-
break;
77-
}
78-
}
79-
80-
// Determine the main domain
81-
std::string domain;
82-
if (!public_suffix.empty() && public_suffix_index > 0)
83-
{
84-
// Combine the part before the public suffix with the public suffix
85-
domain = parts[public_suffix_index - 1] + "." + public_suffix;
86-
}
87-
else if (!public_suffix.empty())
88-
{
89-
// No part before the suffix, use the public suffix only
90-
domain = public_suffix;
91-
}
92-
93-
return domain;
94-
}
95-
}
96-
}
9+
// Function to extract the domain from a URL
10+
void ExtractDomainFunction (DataChunk &args, ExpressionState &state, Vector &result)
11+
{
12+
// Extract the input from the arguments
13+
auto &input_vector = args.data[0];
14+
auto input = input_vector.GetValue (0).ToString ();
15+
16+
if (input.empty ())
17+
{
18+
result.SetValue (0, Value (""));
19+
return;
20+
}
21+
22+
// Extract the domain using the utility function
23+
auto domain = netquack::ExtractDomain (state, input);
24+
25+
result.SetValue (0, Value (domain));
26+
}
27+
28+
namespace netquack
29+
{
30+
std::string ExtractDomain (ExpressionState &state, const std::string &input)
31+
{
32+
// Load the public suffix list if not already loaded
33+
auto &db = *state.GetContext ().db;
34+
netquack::LoadPublicSuffixList (db, false);
35+
Connection con (db);
36+
37+
// Extract the host from the URL
38+
std::regex host_regex (R"(^(?:https?:\/\/)?([^\/\?:]+))");
39+
std::smatch host_match;
40+
if (!std::regex_search (input, host_match, host_regex))
41+
{
42+
return "";
43+
}
44+
45+
auto host = host_match[1].str ();
46+
47+
// Split the host into parts
48+
std::vector<std::string> parts;
49+
std::istringstream stream (host);
50+
std::string part;
51+
while (std::getline (stream, part, '.'))
52+
{
53+
parts.push_back (part);
54+
}
55+
56+
// Find the longest matching public suffix
57+
std::string public_suffix;
58+
int public_suffix_index = -1;
59+
60+
for (int j = 0; j < parts.size (); j++)
61+
{
62+
// Build the candidate suffix
63+
std::string candidate;
64+
for (int k = j; k < parts.size (); k++)
65+
{
66+
candidate += (k == j ? "" : ".") + parts[k];
67+
}
68+
69+
// Query the public suffix list
70+
auto query = "SELECT 1 FROM public_suffix_list WHERE suffix = '" + candidate + "'";
71+
auto query_result = con.Query (query);
72+
73+
if (query_result->RowCount () > 0)
74+
{
75+
public_suffix = candidate;
76+
public_suffix_index = j;
77+
break;
78+
}
79+
}
80+
81+
// Determine the main domain
82+
std::string domain;
83+
if (!public_suffix.empty () && public_suffix_index > 0)
84+
{
85+
// Combine the part before the public suffix with the public suffix
86+
domain = parts[public_suffix_index - 1] + "." + public_suffix;
87+
}
88+
else if (!public_suffix.empty ())
89+
{
90+
// No part before the suffix, use the public suffix only
91+
domain = public_suffix;
92+
}
93+
94+
return domain;
95+
}
96+
} // namespace netquack
97+
} // namespace duckdb

src/functions/extract_domain.hpp

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@
44

55
namespace duckdb
66
{
7-
void ExtractDomainFunction(DataChunk &args, ExpressionState &state, Vector &result);
7+
// Scalar-function callback: reads the URL from args.data[0], extracts its
// main domain via netquack::ExtractDomain, and stores the string in `result`
// (an empty input yields an empty string).
void ExtractDomainFunction (DataChunk &args, ExpressionState &state, Vector &result);
88

9-
namespace netquack
10-
{
11-
// Function to extract the main domain from a URL
12-
std::string ExtractDomain(ExpressionState &state, const std::string &input);
13-
}
14-
}
9+
namespace netquack
10+
{
11+
// Extract the main domain from a URL or bare host name.
// `state` is used to reach the database holding the public_suffix_list table.
// Returns the label left of the matched public suffix joined with that suffix,
// the suffix alone when nothing precedes it, or "" when no host/suffix is found.
12+
std::string ExtractDomain (ExpressionState &state, const std::string &input);
13+
} // namespace netquack
14+
} // namespace duckdb

src/functions/extract_host.cpp

Lines changed: 38 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -4,46 +4,46 @@
44

55
namespace duckdb
66
{
7-
// Function to extract the host from a URL
8-
void ExtractHostFunction(DataChunk &args, ExpressionState &state, Vector &result)
9-
{
10-
// Extract the input from the arguments
11-
auto &input_vector = args.data[0];
12-
auto input = input_vector.GetValue(0).ToString();
7+
// Function to extract the host from a URL
8+
void ExtractHostFunction (DataChunk &args, ExpressionState &state, Vector &result)
9+
{
10+
// Extract the input from the arguments
11+
auto &input_vector = args.data[0];
12+
auto input = input_vector.GetValue (0).ToString ();
1313

14-
// Extract the host using the utility function
15-
auto host = netquack::ExtractHost(input);
14+
// Extract the host using the utility function
15+
auto host = netquack::ExtractHost (input);
1616

17-
// Set the result
18-
result.SetValue(0, Value(host));
19-
}
17+
// Set the result
18+
result.SetValue (0, Value (host));
19+
}
2020

21-
namespace netquack
{
    /// @brief Extract the host component of a URL.
    ///
    /// An optional leading scheme (`scheme://`, any case, e.g. `https://`,
    /// `FTP://`) is skipped; the host is the following run of characters up to
    /// the first '/', ':', '?', '#' or whitespace.
    ///
    /// @param input URL or bare host name.
    /// @return The host, or an empty string when the input does not start with
    ///         a host (e.g. it is empty or begins with '/').
    std::string ExtractHost (const std::string &input)
    {
        // Pattern breakdown:
        //   ^                                - start of the string
        //   (?:[A-Za-z][A-Za-z0-9+.\-]*:\/\/)? - optional "scheme://" prefix
        //   ([^\/\s:?#]+)                    - host: anything except '/', ':',
        //                                      '?', '#' or whitespace
        // The pattern is constant, so it is compiled once instead of per call.
        static const std::regex host_regex (R"(^(?:[A-Za-z][A-Za-z0-9+.\-]*:\/\/)?([^\/\s:?#]+))");
        std::smatch host_match;

        // The host group is mandatory, so a successful search always fills it.
        if (std::regex_search (input, host_match, host_regex))
        {
            return host_match[1].str ();
        }

        // If no host is found, return an empty string
        return "";
    }
} // namespace netquack
49+
} // namespace duckdb

src/functions/extract_host.hpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,12 @@
44

55
namespace duckdb
66
{
7-
// Function to extract the host from a URL
8-
void ExtractHostFunction(DataChunk &args, ExpressionState &state, Vector &result);
7+
// Scalar-function callback: reads the URL from args.data[0], extracts its
// host via netquack::ExtractHost, and stores the string in `result`.
8+
void ExtractHostFunction (DataChunk &args, ExpressionState &state, Vector &result);
99

10-
namespace netquack
11-
{
12-
// Function to extract the host from a URL
13-
std::string ExtractHost(const std::string &input);
14-
}
15-
}
10+
namespace netquack
11+
{
12+
// Extract the host from a URL or bare host name. A leading "http://" or
// "https://" is skipped; the host ends at the first '/', ':', '?', '#' or
// whitespace. Returns "" when no host is found.
13+
std::string ExtractHost (const std::string &input);
14+
} // namespace netquack
15+
} // namespace duckdb

src/functions/extract_path.cpp

Lines changed: 39 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -4,47 +4,47 @@
44

55
namespace duckdb
66
{
7-
// Function to extract the path from a URL
8-
void ExtractPathFunction(DataChunk &args, ExpressionState &state, Vector &result)
9-
{
10-
// Extract the input from the arguments
11-
auto &input_vector = args.data[0];
12-
auto input = input_vector.GetValue(0).ToString();
7+
// Function to extract the path from a URL
8+
void ExtractPathFunction (DataChunk &args, ExpressionState &state, Vector &result)
9+
{
10+
// Extract the input from the arguments
11+
auto &input_vector = args.data[0];
12+
auto input = input_vector.GetValue (0).ToString ();
1313

14-
// Extract the path using the utility function
15-
auto path = netquack::ExtractPath(input);
14+
// Extract the path using the utility function
15+
auto path = netquack::ExtractPath (input);
1616

17-
// Set the result
18-
result.SetValue(0, Value(path));
19-
}
17+
// Set the result
18+
result.SetValue (0, Value (path));
19+
}
2020

21-
namespace netquack
{
    /// @brief Extract the path component of a URL.
    ///
    /// The input is read as `[scheme://]authority[/path][?query][#fragment]`.
    /// The path starts at the first '/' after the authority and ends before the
    /// first '?' or '#'.
    ///
    /// @param input URL or bare host name, optionally followed by a path.
    /// @return The path including its leading '/', or the default path "/" when
    ///         the input has no authority or no path after it.
    std::string ExtractPath (const std::string &input)
    {
        // Pattern breakdown:
        //   ^                                   - start of the string
        //   (?:[A-Za-z][A-Za-z0-9+.\-]*:\/\/)?  - optional "scheme://" prefix
        //   (?![A-Za-z][A-Za-z0-9+.\-]*:\/\/)   - the authority must not itself
        //                                         start with "scheme://"; without
        //                                         this, backtracking lets "http:"
        //                                         pass as the authority and
        //                                         "//host" as the path
        //   [^\/\s?#]+                          - authority: stops at '/', '?',
        //                                         '#' or whitespace, so a '/'
        //                                         inside the query is not a path
        //   (\/[^?#]*)                          - path: '/' followed by anything
        //                                         except '?' or '#'
        // The pattern is constant, so it is compiled once instead of per call.
        static const std::regex path_regex (
            R"(^(?:[A-Za-z][A-Za-z0-9+.\-]*:\/\/)?(?![A-Za-z][A-Za-z0-9+.\-]*:\/\/)[^\/\s?#]+(\/[^?#]*))");
        std::smatch path_match;

        // The path group is mandatory, so a successful search always fills it.
        if (std::regex_search (input, path_match, path_regex))
        {
            return path_match[1].str ();
        }

        // If no path is found, return the default path "/"
        return "/";
    }
} // namespace netquack
50+
} // namespace duckdb

0 commit comments

Comments
 (0)