Skip to content

Commit f656ca8

Browse files
committed
feat: add domain_depth function
1 parent b429653 commit f656ca8

File tree

8 files changed

+574
-1
lines changed

8 files changed

+574
-1
lines changed

README.md

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ Table of Contents
3737
- [IP Version](#ip-version)
3838
- [IP to Integer / Integer to IP](#ip-to-integer--integer-to-ip)
3939
- [Normalize URL](#normalize-url)
40+
- [Domain Depth](#domain-depth)
4041
- [Get Extension Version](#get-extension-version)
4142
- [Build Requirements](#build-requirements)
4243
- [Debugging](#debugging)
@@ -669,6 +670,36 @@ GROUP BY normalized
669670
HAVING cnt > 1;
670671
```
671672

673+
### Domain Depth
674+
675+
The `domain_depth` function returns the number of dot-separated levels (labels) in a domain. If the input is a URL, the host is extracted first and its labels are counted. It returns `0` for IP addresses, empty strings, and invalid input, and `NULL` for `NULL` input.
676+
677+
```sql
678+
D SELECT domain_depth('example.com') AS depth;
679+
┌───────┐
680+
│ depth │
681+
│ int32 │
682+
├───────┤
683+
│     2 │
684+
└───────┘
685+
686+
D SELECT domain_depth('https://www.example.com/page') AS depth;
687+
┌───────┐
688+
│ depth │
689+
│ int32 │
690+
├───────┤
691+
│     3 │
692+
└───────┘
693+
694+
D SELECT domain_depth('http://a.b.c.example.co.uk/page') AS depth;
695+
┌───────┐
696+
│ depth │
697+
│ int32 │
698+
├───────┤
699+
│     6 │
700+
└───────┘
701+
```
702+
672703
### Get Extension Version
673704

674705
You can use the `netquack_version` function to get the extension version.
@@ -717,7 +748,6 @@ Also, there will be stdout errors for background tasks like CURL.
717748
- [ ] Support internationalized domain names (IDNs)
718749
- [ ] Implement `punycode_encode` / `punycode_decode` functions - Convert internationalized domain names to/from ASCII-compatible encoding
719750
- [ ] Implement `is_valid_domain` function - Validate a domain name against RFC rules
720-
- [ ] Implement `domain_depth` function - Return the number of levels in a domain
721751
- [ ] Implement `base64_encode` / `base64_decode` functions - Encode and decode Base64 strings
722752
- [ ] Implement `extract_path_segments` table function - Split a URL path into individual segment rows
723753

docs/SUMMARY.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
* [Extract TLD](functions/extract-tld.md)
2323
* [Extract Fragment](functions/extract-fragment.md)
2424
* [Normalize URL](functions/normalize-url.md)
25+
* [Domain Depth](functions/domain-depth.md)
2526
* [Tranco](functions/tranco/README.md)
2627
* [Get Tranco Rank](functions/tranco/get-tranco-rank.md)
2728
* [Download / Update Tranco](functions/tranco/download-update-tranco.md)

docs/functions/domain-depth.md

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
---
2+
layout:
3+
title:
4+
visible: true
5+
description:
6+
visible: false
7+
tableOfContents:
8+
visible: true
9+
outline:
10+
visible: true
11+
pagination:
12+
visible: true
13+
---
14+
15+
# Domain Depth
16+
17+
This function returns the number of dot-separated levels in a domain. It extracts the host from a URL and counts the labels. Returns `0` for IP addresses, empty strings, and invalid input. Returns `NULL` for `NULL` input.
18+
19+
```sql
20+
D SELECT domain_depth('example.com') AS depth;
21+
┌───────┐
22+
│ depth │
23+
│ int32 │
24+
├───────┤
25+
│     2 │
26+
└───────┘
27+
28+
D SELECT domain_depth('https://www.example.com/page') AS depth;
29+
┌───────┐
30+
│ depth │
31+
│ int32 │
32+
├───────┤
33+
│     3 │
34+
└───────┘
35+
36+
D SELECT domain_depth('http://a.b.c.example.co.uk/page') AS depth;
37+
┌───────┐
38+
│ depth │
39+
│ int32 │
40+
├───────┤
41+
│     6 │
42+
└───────┘
43+
```
44+
45+
## Special Cases
46+
47+
- **IP addresses** return `0` (both IPv4 and IPv6 — they are not domains)
48+
- **Single-label names** like `localhost` return `1`
49+
- **Trailing dot**: a single trailing dot is stripped before counting (fully-qualified DNS form), so `example.com.` has the same depth as `example.com`
50+
51+
```sql
52+
D SELECT domain_depth('192.168.1.1') AS depth;
53+
┌───────┐
54+
│ depth │
55+
│ int32 │
56+
├───────┤
57+
│     0 │
58+
└───────┘
59+
60+
D SELECT domain_depth('localhost') AS depth;
61+
┌───────┐
62+
│ depth │
63+
│ int32 │
64+
├───────┤
65+
│     1 │
66+
└───────┘
67+
```

src/functions/domain_depth.cpp

Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
// Copyright 2026 Arash Hatami
2+
3+
#include "domain_depth.hpp"

#include <algorithm>
#include <cctype>
#include <cstdint>
#include <exception>
#include <string>
#include <string_view>

#include "../utils/url_helpers.hpp"
6+
7+
namespace duckdb {
8+
// DuckDB scalar callback for domain_depth(VARCHAR) -> INTEGER.
//
// Walks every row of the first argument column and writes the result of
// netquack::DomainDepth() into the int32 result buffer.
//   - A NULL input row is marked invalid (NULL) in the result.
//   - If DomainDepth() throws a std::exception, the row's result is 0.
void DomainDepthFunction(DataChunk &args, ExpressionState &state, Vector &result) {
    auto &source = args.data[0];
    auto depths = FlatVector::GetData<int32_t>(result);
    auto &validity = FlatVector::Validity(result);

    const idx_t row_count = args.size();
    for (idx_t row = 0; row < row_count; ++row) {
        auto cell = source.GetValue(row);

        if (!cell.IsNull()) {
            auto text = cell.ToString();
            try {
                depths[row] = netquack::DomainDepth(text);
            } catch (const std::exception &) {
                // Best effort: a failure while parsing one row yields depth 0 for that row.
                depths[row] = 0;
            }
        } else {
            // NULL in, NULL out.
            validity.SetInvalid(row);
        }
    }
}
29+
30+
namespace netquack {
31+
int32_t DomainDepth(const std::string &input) {
32+
if (input.empty()) {
33+
return 0;
34+
}
35+
36+
// Lowercase the input for consistent host extraction
37+
std::string lowered = input;
38+
std::transform(lowered.begin(), lowered.end(), lowered.begin(), ::tolower);
39+
40+
// Extract the host from the URL using the existing utility
41+
const char *data = lowered.data();
42+
size_t size = lowered.size();
43+
std::string_view host = getURLHost(data, size);
44+
45+
// If no host could be extracted, treat the whole input as a potential bare domain
46+
std::string host_str;
47+
if (host.empty()) {
48+
// If the input has a scheme (://), there's no valid host — return 0
49+
if (lowered.find("://") != std::string::npos) {
50+
return 0;
51+
}
52+
53+
// Strip any path/query/fragment from bare domain input
54+
std::string bare = lowered;
55+
auto slash_pos = bare.find('/');
56+
if (slash_pos != std::string::npos) {
57+
bare = bare.substr(0, slash_pos);
58+
}
59+
auto question_pos = bare.find('?');
60+
if (question_pos != std::string::npos) {
61+
bare = bare.substr(0, question_pos);
62+
}
63+
auto hash_pos = bare.find('#');
64+
if (hash_pos != std::string::npos) {
65+
bare = bare.substr(0, hash_pos);
66+
}
67+
// Strip port
68+
auto colon_pos = bare.rfind(':');
69+
if (colon_pos != std::string::npos) {
70+
bool all_digits = true;
71+
for (size_t k = colon_pos + 1; k < bare.size(); ++k) {
72+
if (!std::isdigit(static_cast<unsigned char>(bare[k]))) {
73+
all_digits = false;
74+
break;
75+
}
76+
}
77+
if (all_digits && colon_pos + 1 < bare.size()) {
78+
bare = bare.substr(0, colon_pos);
79+
}
80+
}
81+
82+
if (bare.empty()) {
83+
return 0;
84+
}
85+
86+
// Check if it contains at least one dot
87+
bool has_dot = false;
88+
for (char c : bare) {
89+
if (c == '.') {
90+
has_dot = true;
91+
}
92+
if (c == ' ' || c == '\t' || c == '<' || c == '>') {
93+
return 0;
94+
}
95+
}
96+
if (!has_dot) {
97+
// Single label like "localhost" — depth is 1
98+
// But only if it looks reasonable (alphanumeric + hyphens)
99+
for (char c : bare) {
100+
if (std::isalnum(static_cast<unsigned char>(c)) || c == '-' || c == '_') {
101+
continue;
102+
}
103+
return 0;
104+
}
105+
return 1;
106+
}
107+
host_str = bare;
108+
} else {
109+
host_str = std::string(host);
110+
}
111+
112+
if (host_str.empty()) {
113+
return 0;
114+
}
115+
116+
// Remove trailing dot if present (DNS canonical form)
117+
if (!host_str.empty() && host_str.back() == '.') {
118+
host_str.pop_back();
119+
}
120+
121+
if (host_str.empty()) {
122+
return 0;
123+
}
124+
125+
// Skip IPv6 addresses (e.g., [::1]) — they have no domain depth
126+
if (host_str.front() == '[') {
127+
return 0;
128+
}
129+
130+
// Skip IPv4 addresses (all parts are numeric)
131+
{
132+
bool all_numeric_parts = true;
133+
size_t part_start = 0;
134+
int dot_count = 0;
135+
for (size_t j = 0; j <= host_str.size(); ++j) {
136+
if (j == host_str.size() || host_str[j] == '.') {
137+
if (j == part_start) {
138+
all_numeric_parts = false;
139+
break;
140+
}
141+
for (size_t k = part_start; k < j; ++k) {
142+
if (!std::isdigit(static_cast<unsigned char>(host_str[k]))) {
143+
all_numeric_parts = false;
144+
break;
145+
}
146+
}
147+
if (!all_numeric_parts) {
148+
break;
149+
}
150+
if (host_str[j] == '.') {
151+
++dot_count;
152+
}
153+
part_start = j + 1;
154+
}
155+
}
156+
if (all_numeric_parts && dot_count == 3) {
157+
return 0; // IPv4 address, not a domain
158+
}
159+
}
160+
161+
// Count dot-separated levels
162+
int32_t depth = 1;
163+
for (char c : host_str) {
164+
if (c == '.') {
165+
++depth;
166+
}
167+
}
168+
169+
return depth;
170+
}
171+
} // namespace netquack
172+
} // namespace duckdb

src/functions/domain_depth.hpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
// Copyright 2026 Arash Hatami
2+
3+
#pragma once
4+
5+
#include "duckdb.hpp"
6+
7+
namespace duckdb {
8+
// Scalar function: domain_depth(VARCHAR) -> INTEGER
9+
// Fills `result` with one int32 depth per input row of args.data[0], computed by
// netquack::DomainDepth. NULL input rows produce NULL; a row whose computation throws
// a std::exception produces 0.
void DomainDepthFunction(DataChunk &args, ExpressionState &state, Vector &result);
10+
11+
namespace netquack {
12+
// Return the number of dot-separated levels in a domain/host
13+
// `input` may be a full URL or a scheme-less host. Returns 0 for empty input, for
// IPv4-shaped and bracketed IPv6 hosts, and when no host can be determined; a
// dot-less name such as "localhost" yields 1.
int32_t DomainDepth(const std::string &input);
14+
} // namespace netquack
15+
} // namespace duckdb

src/netquack_extension.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include "duckdb/common/exception.hpp"
88
#include "duckdb/function/scalar_function.hpp"
99
#include "functions/extract_domain.hpp"
10+
#include "functions/domain_depth.hpp"
1011
#include "functions/extract_extension.hpp"
1112
#include "functions/extract_fragment.hpp"
1213
#include "functions/extract_host.hpp"
@@ -111,6 +112,10 @@ static void LoadInternal(ExtensionLoader &loader) {
111112
ScalarFunction("extract_fragment", {LogicalType::VARCHAR}, LogicalType::VARCHAR, ExtractFragmentFunction);
112113
loader.RegisterFunction(netquack_extract_fragment_function);
113114

115+
auto domain_depth_function =
116+
ScalarFunction("domain_depth", {LogicalType::VARCHAR}, LogicalType::INTEGER, DomainDepthFunction);
117+
loader.RegisterFunction(domain_depth_function);
118+
114119
auto normalize_url_function =
115120
ScalarFunction("normalize_url", {LogicalType::VARCHAR}, LogicalType::VARCHAR, NormalizeURLFunction);
116121
loader.RegisterFunction(normalize_url_function);

0 commit comments

Comments
 (0)