Skip to content

Commit 7772b7c

Browse files
authored
Merge pull request #25 from hatamiarash7/extract-path-segments
Add `extract_path_segments` function
2 parents eec48de + 2f4ec25 commit 7772b7c

File tree

7 files changed

+515
-1
lines changed

7 files changed

+515
-1
lines changed

README.md

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ Table of Contents
4141
- [Base64 Encode / Decode](#base64-encode--decode)
4242
- [Validate URL](#validate-url)
4343
- [Validate Domain](#validate-domain)
44+
- [Extract Path Segments](#extract-path-segments)
4445
- [Get Extension Version](#get-extension-version)
4546
- [Build Requirements](#build-requirements)
4647
- [Debugging](#debugging)
@@ -793,6 +794,44 @@ D SELECT is_valid_domain('localhost') AS valid;
793794
└─────────┘
794795
```
795796

797+
### Extract Path Segments
798+
799+
The `extract_path_segments` table function splits a URL path into individual segment rows. Each row contains a 1-based `segment_index` and the `segment` string. Returns 0 rows for `NULL`, empty, or root-only paths.
800+
801+
```sql
802+
D SELECT * FROM extract_path_segments('https://example.com/path/to/page?q=1');
803+
┌───────────────┬─────────┐
804+
│ segment_index │ segment │
805+
│ int32 │ varchar │
806+
├───────────────┼─────────┤
807+
│ 1 │ path │
808+
2 │ to │
809+
3 │ page │
810+
└───────────────┴─────────┘
811+
```
812+
813+
Use with `LATERAL` to expand segments per row in a table:
814+
815+
```sql
816+
D SELECT u.url,
817+
s.segment_index,
818+
s.segment
819+
FROM urls u,
820+
LATERAL extract_path_segments(u.url) s
821+
ORDER BY u.url,
822+
s.segment_index;
823+
┌───────────────────────────┬───────────────┬─────────┐
824+
│ url │ segment_index │ segment │
825+
│ varchar │ int32 │ varchar │
826+
├───────────────────────────┼───────────────┼─────────┤
827+
│ https://example.com/a/b/c │ 1 │ a │
828+
│ https://example.com/a/b/c │ 2 │ b │
829+
│ https://example.com/a/b/c │ 3 │ c │
830+
│ https://test.org/x/y │ 1 │ x │
831+
│ https://test.org/x/y │ 2 │ y │
832+
└───────────────────────────┴───────────────┴─────────┘
833+
```
834+
796835
### Get Extension Version
797836

798837
You can use the `netquack_version` function to get the extension version.
@@ -839,7 +878,6 @@ Also, there will be stdout errors for background tasks like CURL.
839878
- [ ] Implement `ip_in_range` function - Check if an IP falls within a given CIDR block
840879
- [ ] Support internationalized domain names (IDNs)
841880
- [ ] Implement `punycode_encode` / `punycode_decode` functions - Convert internationalized domain names to/from ASCII-compatible encoding
842-
- [ ] Implement `extract_path_segments` table function - Split a URL path into individual segment rows
843881

844882
## Contributing 🤝
845883

docs/SUMMARY.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
* [Base64 Encode / Decode](functions/base64-functions.md)
2727
* [Validate URL](functions/is-valid-url.md)
2828
* [Validate Domain](functions/is-valid-domain.md)
29+
* [Extract Path Segments](functions/extract-path-segments.md)
2930
* [Tranco](functions/tranco/README.md)
3031
* [Get Tranco Rank](functions/tranco/get-tranco-rank.md)
3132
* [Download / Update Tranco](functions/tranco/download-update-tranco.md)
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
---
2+
layout:
3+
title:
4+
visible: true
5+
description:
6+
visible: false
7+
tableOfContents:
8+
visible: true
9+
outline:
10+
visible: true
11+
pagination:
12+
visible: true
13+
---
14+
15+
# Extract Path Segments
16+
17+
The `extract_path_segments` table function splits a URL path into individual segment rows. Each row contains a 1-based `segment_index` and the `segment` string. This is useful for analyzing URL structures, filtering by path depth, or joining path components.
18+
19+
```sql
20+
D SELECT * FROM extract_path_segments('https://example.com/path/to/page?q=1');
21+
┌───────────────┬─────────┐
22+
│ segment_index │ segment │
23+
│ int32 │ varchar │
24+
├───────────────┼─────────┤
25+
│ 1 │ path │
26+
2 │ to │
27+
3 │ page │
28+
└───────────────┴─────────┘
29+
```
30+
31+
```sql
32+
D SELECT * FROM extract_path_segments('https://api.example.com/v3/users/42/repos');
33+
┌───────────────┬─────────┐
34+
│ segment_index │ segment │
35+
│ int32 │ varchar │
36+
├───────────────┼─────────┤
37+
1 │ v3 │
38+
2 │ users │
39+
│ 3 │ 42 │
40+
4 │ repos │
41+
└───────────────┴─────────┘
42+
```
43+
44+
## LATERAL Join
45+
46+
Use with `LATERAL` to expand path segments per row in a table:
47+
48+
```sql
49+
D SELECT u.url,
50+
s.segment_index,
51+
s.segment
52+
FROM urls u,
53+
LATERAL extract_path_segments(u.url) s
54+
ORDER BY u.url,
55+
s.segment_index;
56+
┌───────────────────────────┬───────────────┬─────────┐
57+
│ url │ segment_index │ segment │
58+
│ varchar │ int32 │ varchar │
59+
├───────────────────────────┼───────────────┼─────────┤
60+
│ https://example.com/a/b/c │ 1 │ a │
61+
│ https://example.com/a/b/c │ 2 │ b │
62+
│ https://example.com/a/b/c │ 3 │ c │
63+
│ https://test.org/x/y │ 1 │ x │
64+
│ https://test.org/x/y │ 2 │ y │
65+
└───────────────────────────┴───────────────┴─────────┘
66+
```
67+
68+
Returns 0 rows for URLs with no path, root path (`/`), empty strings, and `NULL` input.
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
// Copyright 2026 Arash Hatami
2+
3+
#include "extract_path_segments.hpp"
4+
5+
#include "../utils/url_helpers.hpp"
6+
7+
#include <string>
8+
#include <vector>
9+
10+
namespace duckdb {
11+
namespace netquack {
12+
13+
// ---------------------------------------------------------------------------
14+
// Pure logic: extract path from URL, then split into non-empty segments
15+
// ---------------------------------------------------------------------------
16+
// Splits the path component of `input` into its non-empty segments.
//
// The path is taken to start at the first '/' that follows the optional
// "//authority" part and to end at the first '?' or '#' (or end of input).
// Empty segments produced by leading, trailing, or repeated slashes are
// dropped. Returns an empty vector for empty input, for input without a path,
// and for a root-only path ("/").
//
// NOTE(review): the previous version searched for '/' alone when locating the
// path (described there as "same logic as ExtractPath"), so a slash inside a
// query or fragment was mistaken for the path when the URL had none, e.g.
// "https://example.com?next=/a/b" produced {"a", "b"}. The search now also
// stops at '?' and '#'. Confirm whether ExtractPath wants the same treatment.
std::vector<std::string> ExtractPathSegments(const std::string &input) {
	std::vector<std::string> segments;
	const std::string::size_type npos = std::string::npos;

	// First delimiter of interest. If a query/fragment starts before any
	// slash, the URL has no path at all.
	std::string::size_type pos = input.find_first_of("/?#");
	if (pos == npos || input[pos] != '/') {
		return segments;
	}

	// A double slash introduces the authority; the path (if any) begins at the
	// next slash, provided no query/fragment delimiter comes first.
	const bool has_subsequent_slash = pos + 1 < input.size() && input[pos + 1] == '/';
	if (has_subsequent_slash) {
		pos = input.find_first_of("/?#", pos + 2);
		if (pos == npos || input[pos] != '/') {
			return segments;
		}
	}

	// The path ends at '?' or '#', or at the end of the input.
	std::string::size_type path_end = input.find_first_of("?#", pos);
	if (path_end == npos) {
		path_end = input.size();
	}

	// Walk the path one segment at a time, starting just past the leading '/'.
	std::string::size_type seg_start = pos + 1;
	while (seg_start < path_end) {
		std::string::size_type seg_end = input.find('/', seg_start);
		// A slash beyond the path (inside the query/fragment) does not count.
		if (seg_end == npos || seg_end > path_end) {
			seg_end = path_end;
		}
		if (seg_end > seg_start) {
			segments.emplace_back(input, seg_start, seg_end - seg_start);
		}
		seg_start = seg_end + 1;
	}

	return segments;
}
63+
64+
// ---------------------------------------------------------------------------
65+
// Table function callbacks
66+
// ---------------------------------------------------------------------------
67+
// Bind data returned by ExtractPathSegmentsFunc::Bind; it carries no fields.
struct ExtractPathSegmentsData : TableFunctionData {};
68+
69+
struct ExtractPathSegmentsLocalState : public LocalTableFunctionState {
70+
std::vector<std::string> segments;
71+
idx_t current_idx = 0;
72+
bool done = false;
73+
};
74+
75+
// Bind callback: declares the result schema of extract_path_segments as
// (segment_index INTEGER, segment VARCHAR) and returns an empty bind-data
// object. `context` and `input` are not inspected.
unique_ptr<FunctionData> ExtractPathSegmentsFunc::Bind(ClientContext &context, TableFunctionBindInput &input,
                                                       vector<LogicalType> &return_types, vector<string> &names) {
	// Column 0: 1-based position of the segment within the path.
	names.emplace_back("segment_index");
	return_types.emplace_back(LogicalTypeId::INTEGER);

	// Column 1: the segment text itself.
	names.emplace_back("segment");
	return_types.emplace_back(LogicalTypeId::VARCHAR);

	return make_uniq<ExtractPathSegmentsData>();
}
86+
87+
// Local-state initializer: hands back a fresh, default-initialized
// ExtractPathSegmentsLocalState. None of the arguments are inspected.
unique_ptr<LocalTableFunctionState> ExtractPathSegmentsFunc::InitLocal(ExecutionContext &context,
                                                                       TableFunctionInitInput &input,
                                                                       GlobalTableFunctionState *global_state_p) {
	auto fresh_state = make_uniq<ExtractPathSegmentsLocalState>();
	return std::move(fresh_state);
}
92+
93+
// Writes up to STANDARD_VECTOR_SIZE of the not-yet-emitted segments held in
// `state` into `output` (column 0: 1-based index, column 1: segment text),
// advances `state.current_idx`, and marks the state as done once the last
// segment has been written. Always reports HAVE_MORE_OUTPUT so that the caller
// invokes the function again; that next call either continues emitting or
// resets the state.
static OperatorResultType EmitPendingSegments(ExtractPathSegmentsLocalState &state, DataChunk &output) {
	idx_t count = 0;
	while (state.current_idx < state.segments.size() && count < STANDARD_VECTOR_SIZE) {
		output.data[0].SetValue(count, Value::INTEGER(static_cast<int32_t>(state.current_idx + 1)));
		output.data[1].SetValue(count, Value(state.segments[state.current_idx]));
		++state.current_idx;
		++count;
	}
	output.SetCardinality(count);

	if (state.current_idx >= state.segments.size()) {
		state.done = true;
	}
	return OperatorResultType::HAVE_MORE_OUTPUT;
}

// In-out callback of extract_path_segments.
//
// Each call does exactly one of the following:
//   1. the previous URL was fully emitted -> reset the local state and return
//      NEED_MORE_INPUT;
//   2. segments from a previous call are still pending -> emit the next batch;
//   3. otherwise read the URL from `input`, split it, and emit the first batch
//      (or return NEED_MORE_INPUT with zero rows for NULL / segment-less URLs).
//
// NOTE(review): only row 0 of `input` is read. This presumably relies on the
// engine handing the function one input row at a time; if a chunk with more
// than one row can arrive here, the remaining rows are ignored. Confirm.
OperatorResultType ExtractPathSegmentsFunc::Function(ExecutionContext &context, TableFunctionInput &data_p,
                                                     DataChunk &input, DataChunk &output) {
	auto &local_state = data_p.local_state->Cast<ExtractPathSegmentsLocalState>();

	// Already finished outputting for this input row: request the next input.
	if (local_state.done) {
		local_state.done = false;
		local_state.segments.clear();
		local_state.current_idx = 0;
		return OperatorResultType::NEED_MORE_INPUT;
	}

	// Continue outputting segments left over from a previous HAVE_MORE_OUTPUT.
	// (An empty segment list can never satisfy this comparison.)
	if (local_state.current_idx < local_state.segments.size()) {
		return EmitPendingSegments(local_state, output);
	}

	// Parse the URL from the input chunk; NULL produces no rows.
	auto url_value = input.data[0].GetValue(0);
	if (url_value.IsNull()) {
		output.SetCardinality(0);
		return OperatorResultType::NEED_MORE_INPUT;
	}

	local_state.segments = ExtractPathSegments(url_value.GetValue<string>());
	local_state.current_idx = 0;

	// No path segments (no path, root-only path, empty string): no rows.
	if (local_state.segments.empty()) {
		output.SetCardinality(0);
		return OperatorResultType::NEED_MORE_INPUT;
	}

	// Output as many segments as fit into one output chunk.
	return EmitPendingSegments(local_state, output);
}
153+
154+
} // namespace netquack
155+
} // namespace duckdb
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
// Copyright 2026 Arash Hatami
2+
3+
#pragma once
4+
5+
#include "duckdb.hpp"
6+
7+
namespace duckdb {
8+
namespace netquack {
9+
10+
// Table function to split a URL path into individual segment rows
11+
struct ExtractPathSegmentsFunc {
12+
static unique_ptr<FunctionData> Bind(ClientContext &context, TableFunctionBindInput &input,
13+
vector<LogicalType> &return_types, vector<string> &names);
14+
static unique_ptr<LocalTableFunctionState> InitLocal(ExecutionContext &context, TableFunctionInitInput &input,
15+
GlobalTableFunctionState *global_state_p);
16+
static OperatorResultType Function(ExecutionContext &context, TableFunctionInput &data_p, DataChunk &input,
17+
DataChunk &output);
18+
};
19+
20+
// Pure logic: extract path segments from a URL
21+
// Returns the non-empty '/'-separated segments of the path of `input`, in
// order of appearance. The result is empty when `input` is empty or contains
// no path segments.
std::vector<std::string> ExtractPathSegments(const std::string &input);
22+
23+
} // namespace netquack
24+
} // namespace duckdb

src/netquack_extension.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include "functions/extract_fragment.hpp"
1414
#include "functions/extract_host.hpp"
1515
#include "functions/extract_path.hpp"
16+
#include "functions/extract_path_segments.hpp"
1617
#include "functions/extract_port.hpp"
1718
#include "functions/extract_query.hpp"
1819
#include "functions/extract_schema.hpp"
@@ -138,6 +139,12 @@ static void LoadInternal(ExtensionLoader &loader) {
138139
ScalarFunction("is_valid_domain", {LogicalType::VARCHAR}, LogicalType::BOOLEAN, IsValidDomainFunction);
139140
loader.RegisterFunction(is_valid_domain_function);
140141

142+
auto extract_path_segments_function =
143+
TableFunction("extract_path_segments", {LogicalType::VARCHAR}, nullptr, netquack::ExtractPathSegmentsFunc::Bind,
144+
nullptr, netquack::ExtractPathSegmentsFunc::InitLocal);
145+
extract_path_segments_function.in_out_function = netquack::ExtractPathSegmentsFunc::Function;
146+
loader.RegisterFunction(extract_path_segments_function);
147+
141148
auto version_function =
142149
TableFunction("netquack_version", {}, netquack::VersionFunc::Scan, netquack::VersionFunc::Bind,
143150
netquack::VersionFunc::InitGlobal, netquack::VersionFunc::InitLocal);

0 commit comments

Comments
 (0)