|
#include "extract_extension.hpp"

#include <algorithm>
#include <cctype>
#include <exception>
#include <regex>
#include <string>
| 4 | + |
| 5 | +namespace duckdb |
| 6 | +{ |
| 7 | + // Function to extract the extension from a URL |
| 8 | + void ExtractExtensionFunction (DataChunk &args, ExpressionState &state, Vector &result) |
| 9 | + { |
| 10 | + // Extract the input from the arguments |
| 11 | + auto &input_vector = args.data[0]; |
| 12 | + auto result_data = FlatVector::GetData<string_t> (result); |
| 13 | + |
| 14 | + for (idx_t i = 0; i < args.size (); i++) |
| 15 | + { |
| 16 | + auto input = input_vector.GetValue (i).ToString (); |
| 17 | + std::transform (input.begin (), input.end (), input.begin (), ::tolower); |
| 18 | + |
| 19 | + try |
| 20 | + { |
| 21 | + // Extract the extension using the utility function |
| 22 | + auto ext = netquack::ExtractExtension (input); |
| 23 | + result_data[i] = StringVector::AddString (result, ext); |
| 24 | + } |
| 25 | + catch (const std::exception &e) |
| 26 | + { |
| 27 | + result_data[i] = "Error extracting extension: " + std::string (e.what ()); |
| 28 | + } |
| 29 | + }; |
| 30 | + } |
| 31 | + |
namespace netquack
{
    /// Returns the file extension (without the dot) found in a path or URL,
    /// or an empty string when the input has no recognisable extension.
    ///
    /// @param input  Path or URL to inspect. It is matched as-is; no case
    ///               folding is performed here.
    /// @return       The captured extension, or "" if the pattern does not match.
    std::string ExtractExtension (const std::string &input)
    {
        // Pattern breakdown:
        //   ^(?!.*\.\.)            - anchored at the start; negative lookahead rejects
        //                            any input that contains ".." anywhere
        //   (?:.*\/)?              - optional, greedy prefix ending in '/'
        //   [^\/?#]+               - one or more characters other than '/', '?', '#'
        //                            (the name in front of the extension)
        //   \.                     - literal dot (extension separator)
        //   ([a-zA-Z0-9]{1,10})    - capture group 1: 1-10 alphanumeric characters
        //   (?=[?#]|$)             - lookahead: the extension must be followed by
        //                            '?', '#', or the end of the string
        //
        // Examples matched:
        //   /path/image.jpg                      -> jpg
        //   /doc.v12.pdf                         -> pdf
        //   /archive.tar.gz                      -> gz
        //   https://site.com/page.html?param=1   -> html
        //
        // Rejected cases (empty result):
        //   /path..jpg            (contains "..")
        //   /path.                (nothing after the dot)
        //   /.hidden_file         (no name in front of the dot)
        //   /path.with.dots/file  (dots are not in the final path segment)
        //
        // Compiling a std::regex is expensive and this function is called once
        // per row, so the pattern is built a single time. Initialisation of a
        // function-local static is thread-safe, and matching only uses the
        // regex through const access.
        static const std::regex ext_regex (R"(^(?!.*\.\.)(?:.*\/)?[^\/?#]+\.([a-zA-Z0-9]{1,10})(?=[?#]|$))");

        std::smatch ext_match;
        if (std::regex_search (input, ext_match, ext_regex) && ext_match.size () > 1 && ext_match[1].matched)
        {
            return ext_match[1].str ();
        }

        // No extension found.
        return "";
    }
} // namespace netquack
| 75 | +} // namespace duckdb |
0 commit comments