Skip to content

Commit 625ec05

Browse files
committed
feat: add extract_extension function
1 parent eb0d4de commit 625ec05

File tree

3 files changed

+98
-0
lines changed

3 files changed

+98
-0
lines changed
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
#include "extract_extension.hpp"
2+
3+
#include <regex>
4+
5+
namespace duckdb
6+
{
7+
// Function to extract the extension from a URL
8+
// Scalar function body for extract_extension.
//
// For every row of the chunk, takes the textual form of the first argument,
// lower-cases it, and stores the extension returned by
// netquack::ExtractExtension in `result`.
//
// args   - input chunk; only column 0 is read.
// state  - not used by this function.
// result - output vector, written through FlatVector::GetData<string_t>.
//
// If extraction throws a std::exception, the row receives the text
// "Error extracting extension: <what()>" instead of an extension.
void ExtractExtensionFunction (DataChunk &args, ExpressionState &state, Vector &result)
{
    auto &input_vector = args.data[0];
    auto result_data   = FlatVector::GetData<string_t> (result);

    for (idx_t i = 0; i < args.size (); i++)
    {
        auto input = input_vector.GetValue (i).ToString ();

        // Go through unsigned char: handing tolower a negative char value
        // (any non-ASCII byte where char is signed) is undefined behaviour.
        std::transform (input.begin (), input.end (), input.begin (),
                        [] (unsigned char c) { return static_cast<char> (::tolower (c)); });

        try
        {
            auto ext       = netquack::ExtractExtension (input);
            result_data[i] = StringVector::AddString (result, ext);
        }
        catch (const std::exception &e)
        {
            // The message must be copied into storage owned by the result
            // vector. Assigning the temporary std::string directly would leave
            // the string_t pointing at memory that is freed at the end of the
            // statement.
            result_data[i] = StringVector::AddString (result, "Error extracting extension: " + std::string (e.what ()));
        }
    }
}
31+
32+
namespace netquack
33+
{
34+
// Returns the extension (without the dot) captured from `input`, or an empty
// string when the pattern below does not match.
//
// input - a path or URL. It is matched exactly as given; no case folding is
//         done here.
//
// Pattern, piece by piece:
//   ^                    - anchor at the start of the input
//   (?!.*\.\.)           - negative lookahead: reject any input containing ".."
//   (?:.*\/)?            - optionally skip everything up to a '/'
//   [^\/?#]+             - at least one file-name character before the dot
//   \.                   - literal dot (extension separator)
//   ([a-zA-Z0-9]{1,10})  - captured extension: 1-10 alphanumeric characters
//   (?=[?#]|$)           - must be followed by '?', '#', or the end of input
//
// Examples matched:
//   /path/image.jpg                    -> jpg
//   /doc.v12.pdf                       -> pdf
//   /archive.tar.gz                    -> gz
//   https://site.com/page.html?param=1 -> html
//
// Rejected cases (empty result):
//   /path..jpg           - contains ".."
//   /path.               - nothing after the dot
//   /.hidden_file        - nothing before the dot in the file name
//   /path.with.dots/file - the dotted part is not followed by '?', '#' or end
//
// NOTE(review): a bare host such as "site.com" matches and yields "com", and a
// '/' inside a query string can move the match into the query. Confirm whether
// callers rely on that before tightening the pattern.
std::string ExtractExtension (const std::string &input)
{
    // Compiled once and reused. Building a std::regex is far more expensive
    // than running it, and this function is invoked once per input row.
    static const std::regex ext_regex (R"(^(?!.*\.\.)(?:.*\/)?[^\/?#]+\.([a-zA-Z0-9]{1,10})(?=[?#]|$))");
    std::smatch ext_match;

    if (std::regex_search (input, ext_match, ext_regex) && ext_match.size () > 1 && ext_match[1].matched)
    {
        return ext_match[1].str ();
    }

    // No extension found.
    return "";
}
74+
} // namespace netquack
75+
} // namespace duckdb
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#pragma once
2+
3+
#include "duckdb.hpp"
4+
5+
namespace duckdb
6+
{
7+
// DuckDB scalar function callback for extract_extension: for each row of
// `args`, writes the extension extracted from the first argument column into
// `result`. `state` is not used.
8+
void ExtractExtensionFunction (DataChunk &args, ExpressionState &state, Vector &result);
9+
10+
namespace netquack
11+
{
12+
// Returns the extension (without the leading dot) extracted from a URL or
// path string, or an empty string when no extension is found.
13+
std::string ExtractExtension (const std::string &input);
14+
} // namespace netquack
15+
} // namespace duckdb

src/netquack_extension.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include "duckdb/main/extension_util.hpp"
1010
#include "duckdb/parser/parsed_data/create_scalar_function_info.hpp"
1111
#include "functions/extract_domain.hpp"
12+
#include "functions/extract_extension.hpp"
1213
#include "functions/extract_host.hpp"
1314
#include "functions/extract_path.hpp"
1415
#include "functions/extract_port.hpp"
@@ -94,6 +95,13 @@ namespace duckdb
9495
ExtractPortFunction);
9596
ExtensionUtil::RegisterFunction (instance, netquack_extract_port_function);
9697

98+
auto netquack_extract_extension_function = ScalarFunction (
99+
"extract_extension",
100+
{ LogicalType::VARCHAR },
101+
LogicalType::VARCHAR,
102+
ExtractExtensionFunction);
103+
ExtensionUtil::RegisterFunction (instance, netquack_extract_extension_function);
104+
97105
auto netquack_update_tranco_function = ScalarFunction (
98106
"update_tranco",
99107
{ LogicalType::BOOLEAN },

0 commit comments

Comments
 (0)