Skip to content

Commit 2da0c4a

Browse files
authored
Merge pull request #26 from hatamiarash7/url-encode-decode
Add `url_encode` / `url_decode` Functions
2 parents 7772b7c + fa1539c commit 2da0c4a

File tree

8 files changed

+766
-1
lines changed

8 files changed

+766
-1
lines changed

README.md

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ Table of Contents
4242
- [Validate URL](#validate-url)
4343
- [Validate Domain](#validate-domain)
4444
- [Extract Path Segments](#extract-path-segments)
45+
- [URL Encode / Decode](#url-encode--decode)
4546
- [Get Extension Version](#get-extension-version)
4647
- [Build Requirements](#build-requirements)
4748
- [Debugging](#debugging)
@@ -832,6 +833,58 @@ D SELECT u.url,
832833
└───────────────────────────┴───────────────┴─────────┘
833834
```
834835

836+
### URL Encode / Decode
837+
838+
The `url_encode` function percent-encodes a string per RFC 3986. Only unreserved characters (`A-Z`, `a-z`, `0-9`, `-`, `_`, `.`, `~`) are left as-is — everything else is encoded as `%XX` with uppercase hex digits.
839+
840+
The `url_decode` function decodes percent-encoded strings back to their original form. It also decodes `+` as a space (for `application/x-www-form-urlencoded` compatibility). Invalid percent sequences are passed through literally.
841+
842+
```sql
843+
D SELECT url_encode('hello world') AS encoded;
844+
┌───────────────┐
845+
│ encoded │
846+
varchar
847+
├───────────────┤
848+
│ hello%20world │
849+
└───────────────┘
850+
851+
D SELECT url_decode('hello%20world') AS decoded;
852+
┌─────────────┐
853+
│ decoded │
854+
varchar
855+
├─────────────┤
856+
│ hello world │
857+
└─────────────┘
858+
859+
D SELECT url_encode('https://www.google.com/search?client=firefox-b-d&q=url+encode') AS encoded;
860+
┌─────────────────────────────────────────────────────────────────────────────────┐
861+
│ encoded │
862+
varchar
863+
├─────────────────────────────────────────────────────────────────────────────────┤
864+
│ https%3A%2F%2Fwww.google.com%2Fsearch%3Fclient%3Dfirefox-b-d%26q%3Durl%2Bencode │
865+
└─────────────────────────────────────────────────────────────────────────────────┘
866+
867+
D SELECT url_decode(url_encode('café 🦆')) AS roundtrip;
868+
┌───────────┐
869+
│ roundtrip │
870+
varchar
871+
├───────────┤
872+
│ café 🦆 │
873+
└───────────┘
874+
```
875+
876+
`url_decode` also decodes `+` as a space:
877+
878+
```sql
879+
D SELECT url_decode('hello+world') AS decoded;
880+
┌─────────────┐
881+
│ decoded │
882+
varchar
883+
├─────────────┤
884+
│ hello world │
885+
└─────────────┘
886+
```
887+
835888
### Get Extension Version
836889

837890
You can use the `netquack_version` function to get the extension version.
@@ -874,7 +927,6 @@ Also, there will be stdout errors for background tasks like CURL.
874927
- [ ] Save Tranco data as Parquet
875928
- [ ] Implement GeoIP functionality
876929
- [ ] Return default value for `get_tranco_rank`
877-
- [ ] Implement `url_encode` / `url_decode` functions - Standalone percent-encoding and decoding
878930
- [ ] Implement `ip_in_range` function - Check if an IP falls within a given CIDR block
879931
- [ ] Support internationalized domain names (IDNs)
880932
- [ ] Implement `punycode_encode` / `punycode_decode` functions - Convert internationalized domain names to/from ASCII-compatible encoding

docs/SUMMARY.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
* [Validate URL](functions/is-valid-url.md)
2828
* [Validate Domain](functions/is-valid-domain.md)
2929
* [Extract Path Segments](functions/extract-path-segments.md)
30+
* [URL Encode / Decode](functions/url-encode-functions.md)
3031
* [Tranco](functions/tranco/README.md)
3132
* [Get Tranco Rank](functions/tranco/get-tranco-rank.md)
3233
* [Download / Update Tranco](functions/tranco/download-update-tranco.md)
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
---
2+
layout:
3+
title:
4+
visible: true
5+
description:
6+
visible: false
7+
tableOfContents:
8+
visible: true
9+
outline:
10+
visible: true
11+
pagination:
12+
visible: true
13+
---
14+
15+
# URL Encode / Decode
16+
17+
The `url_encode` function percent-encodes a string per RFC 3986. Only unreserved characters (`A-Z`, `a-z`, `0-9`, `-`, `_`, `.`, `~`) are left as-is — everything else is encoded as `%XX` with uppercase hex digits.
18+
19+
The `url_decode` function decodes percent-encoded strings back to their original form. It also decodes `+` as a space (for `application/x-www-form-urlencoded` compatibility). Invalid percent sequences (e.g., `%ZZ`, trailing `%`) are passed through literally.
20+
21+
## Encode
22+
23+
```sql
24+
D SELECT url_encode('hello world') AS encoded;
25+
┌───────────────┐
26+
│ encoded │
27+
varchar
28+
├───────────────┤
29+
│ hello%20world │
30+
└───────────────┘
31+
```
32+
33+
```sql
34+
D SELECT url_encode('café') AS encoded;
35+
┌───────────┐
36+
│ encoded │
37+
varchar
38+
├───────────┤
39+
│ caf%C3%A9 │
40+
└───────────┘
41+
```
42+
43+
```sql
44+
D SELECT url_encode('key=value&lang=en') AS encoded;
45+
┌───────────────────────────┐
46+
│ encoded │
47+
varchar
48+
├───────────────────────────┤
49+
│ key%3Dvalue%26lang%3Den │
50+
└───────────────────────────┘
51+
```
52+
53+
## Decode
54+
55+
```sql
56+
D SELECT url_decode('hello%20world') AS decoded;
57+
┌─────────────┐
58+
│ decoded │
59+
varchar
60+
├─────────────┤
61+
│ hello world │
62+
└─────────────┘
63+
```
64+
65+
```sql
66+
D SELECT url_decode('hello+world') AS decoded;
67+
┌─────────────┐
68+
│ decoded │
69+
varchar
70+
├─────────────┤
71+
│ hello world │
72+
└─────────────┘
73+
```
74+
75+
```sql
76+
D SELECT url_decode('caf%C3%A9') AS decoded;
77+
┌─────────┐
78+
│ decoded │
79+
varchar
80+
├─────────┤
81+
│ café │
82+
└─────────┘
83+
```
84+
85+
## Round-trip
86+
87+
```sql
88+
D SELECT url_decode(url_encode('https://example.com/path?q=hello world')) AS roundtrip;
89+
┌──────────────────────────────────────────┐
90+
│ roundtrip │
91+
varchar
92+
├──────────────────────────────────────────┤
93+
│ https://example.com/path?q=hello world │
94+
└──────────────────────────────────────────┘
95+
```
96+
97+
Both `url_encode` and `url_decode` return an empty string for empty input and `NULL` for `NULL` input.
Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
// Copyright 2026 Arash Hatami
2+
3+
#include "url_encode_functions.hpp"

#include <array>
#include <sstream>

#include "duckdb/common/vector_operations/unary_executor.hpp"
7+
8+
namespace duckdb {
9+
10+
// RFC 3986 unreserved characters: A-Z a-z 0-9 - _ . ~
11+
// These are the ONLY characters that are NOT percent-encoded.
12+
static constexpr std::array<bool, 256> BuildUnreservedTable() {
13+
std::array<bool, 256> table = {};
14+
for (auto &v : table) {
15+
v = false;
16+
}
17+
// A-Z
18+
for (int c = 'A'; c <= 'Z'; c++) {
19+
table[c] = true;
20+
}
21+
// a-z
22+
for (int c = 'a'; c <= 'z'; c++) {
23+
table[c] = true;
24+
}
25+
// 0-9
26+
for (int c = '0'; c <= '9'; c++) {
27+
table[c] = true;
28+
}
29+
// - _ . ~
30+
table[static_cast<uint8_t>('-')] = true;
31+
table[static_cast<uint8_t>('_')] = true;
32+
table[static_cast<uint8_t>('.')] = true;
33+
table[static_cast<uint8_t>('~')] = true;
34+
return table;
35+
}
36+
37+
static constexpr auto UNRESERVED_TABLE = BuildUnreservedTable();
38+
39+
// Uppercase hexadecimal alphabet, indexed by nibble value (0-15).
static const char HEX_DIGITS[] = "0123456789ABCDEF";

// Converts one hexadecimal digit (either case) to its numeric value 0-15.
// Any character that is not a hex digit yields -1.
static int HexVal(char c) {
	int value = -1;
	if (c >= '0' && c <= '9') {
		value = c - '0';
	} else if (c >= 'A' && c <= 'F') {
		value = 10 + (c - 'A');
	} else if (c >= 'a' && c <= 'f') {
		value = 10 + (c - 'a');
	}
	return value;
}
54+
55+
// Scalar-function callback for `url_encode`.
//
// Percent-encodes every row of the first argument column with
// netquack::UrlEncode and writes the result into `result`.
//
// UnaryExecutor replaces the previous per-row Vector::GetValue loop: it reads
// the string_t payload directly (no boxing of every row into a Value),
// handles any input vector layout, and propagates NULL rows to the result
// without invoking the callback.
//
// @param args   Input chunk; column 0 holds the VARCHAR values to encode.
// @param state  Expression state (unused).
// @param result Output VARCHAR vector that receives one value per input row.
void UrlEncodeFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	UnaryExecutor::Execute<string_t, string_t>(args.data[0], result, args.size(), [&](string_t input) {
		// AddString copies the encoded bytes into the result vector's string heap.
		return StringVector::AddString(result, netquack::UrlEncode(input.GetString()));
	});
}
72+
73+
// Scalar-function callback for `url_decode`.
//
// Percent-decodes every row of the first argument column with
// netquack::UrlDecode and writes the result into `result`.
//
// UnaryExecutor replaces the previous per-row Vector::GetValue loop: it reads
// the string_t payload directly (no boxing of every row into a Value),
// handles any input vector layout, and propagates NULL rows to the result
// without invoking the callback.
//
// @param args   Input chunk; column 0 holds the VARCHAR values to decode.
// @param state  Expression state (unused).
// @param result Output VARCHAR vector that receives one value per input row.
void UrlDecodeFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	UnaryExecutor::Execute<string_t, string_t>(args.data[0], result, args.size(), [&](string_t input) {
		// AddString copies the decoded bytes into the result vector's string heap.
		return StringVector::AddString(result, netquack::UrlDecode(input.GetString()));
	});
}
90+
91+
namespace netquack {
92+
93+
std::string UrlEncode(const std::string &input) {
94+
if (input.empty()) {
95+
return "";
96+
}
97+
98+
std::string result;
99+
// Worst case: every byte becomes %XX (3x expansion)
100+
result.reserve(input.size() * 3);
101+
102+
for (unsigned char c : input) {
103+
if (UNRESERVED_TABLE[c]) {
104+
result += static_cast<char>(c);
105+
} else {
106+
result += '%';
107+
result += HEX_DIGITS[(c >> 4) & 0x0F];
108+
result += HEX_DIGITS[c & 0x0F];
109+
}
110+
}
111+
112+
return result;
113+
}
114+
115+
std::string UrlDecode(const std::string &input) {
116+
if (input.empty()) {
117+
return "";
118+
}
119+
120+
std::string result;
121+
result.reserve(input.size());
122+
123+
for (size_t i = 0; i < input.size(); i++) {
124+
if (input[i] == '%' && i + 2 < input.size()) {
125+
int hi = HexVal(input[i + 1]);
126+
int lo = HexVal(input[i + 2]);
127+
if (hi >= 0 && lo >= 0) {
128+
result += static_cast<char>((hi << 4) | lo);
129+
i += 2;
130+
continue;
131+
}
132+
// Invalid hex digits — pass through the '%' literally
133+
result += '%';
134+
} else if (input[i] == '+') {
135+
// '+' is commonly used as space in application/x-www-form-urlencoded
136+
result += ' ';
137+
} else {
138+
result += input[i];
139+
}
140+
}
141+
142+
return result;
143+
}
144+
145+
} // namespace netquack
146+
} // namespace duckdb
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
// Copyright 2026 Arash Hatami
2+
3+
#pragma once
4+
5+
#include "duckdb.hpp"
6+
7+
namespace duckdb {
8+
// Scalar-function callback registered under the SQL name `url_encode`.
// Percent-encodes each row of args.data[0] into `result`; NULL rows stay NULL.
void UrlEncodeFunction(DataChunk &args, ExpressionState &state, Vector &result);
9+
// Scalar-function callback registered under the SQL name `url_decode`.
// Percent-decodes each row of args.data[0] into `result`; NULL rows stay NULL.
void UrlDecodeFunction(DataChunk &args, ExpressionState &state, Vector &result);
10+
11+
namespace netquack {
12+
// Returns `input` with every byte other than the RFC 3986 unreserved characters
// (A-Z, a-z, 0-9, '-', '_', '.', '~') replaced by "%XX" using uppercase hex digits.
// An empty input yields an empty string.
std::string UrlEncode(const std::string &input);
13+
// Returns `input` with valid "%XX" sequences replaced by the byte they encode and
// '+' replaced by a space. A '%' that is not followed by two hex digits is kept
// literally. An empty input yields an empty string.
std::string UrlDecode(const std::string &input);
14+
} // namespace netquack
15+
} // namespace duckdb

src/netquack_extension.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "functions/extract_host.hpp"
1515
#include "functions/extract_path.hpp"
1616
#include "functions/extract_path_segments.hpp"
17+
#include "functions/url_encode_functions.hpp"
1718
#include "functions/extract_port.hpp"
1819
#include "functions/extract_query.hpp"
1920
#include "functions/extract_schema.hpp"
@@ -145,6 +146,14 @@ static void LoadInternal(ExtensionLoader &loader) {
145146
extract_path_segments_function.in_out_function = netquack::ExtractPathSegmentsFunc::Function;
146147
loader.RegisterFunction(extract_path_segments_function);
147148

149+
auto url_encode_function =
150+
ScalarFunction("url_encode", {LogicalType::VARCHAR}, LogicalType::VARCHAR, UrlEncodeFunction);
151+
loader.RegisterFunction(url_encode_function);
152+
153+
auto url_decode_function =
154+
ScalarFunction("url_decode", {LogicalType::VARCHAR}, LogicalType::VARCHAR, UrlDecodeFunction);
155+
loader.RegisterFunction(url_decode_function);
156+
148157
auto version_function =
149158
TableFunction("netquack_version", {}, netquack::VersionFunc::Scan, netquack::VersionFunc::Bind,
150159
netquack::VersionFunc::InitGlobal, netquack::VersionFunc::InitLocal);

test/sql/null_handling.test

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,18 @@ SELECT is_valid_domain(NULL);
9494
----
9595
NULL
9696

97+
# Test NULL handling for url_encode
98+
query I
99+
SELECT url_encode(NULL);
100+
----
101+
NULL
102+
103+
# Test NULL handling for url_decode
104+
query I
105+
SELECT url_decode(NULL);
106+
----
107+
NULL
108+
97109
# Test NULL values in a table
98110
statement ok
99111
CREATE TABLE test_nulls (url VARCHAR);

0 commit comments

Comments
 (0)