Skip to content

Commit a587222

Browse files
committed
feat: add get_tranco_rank_category function
1 parent 6a6843e commit a587222

File tree

4 files changed

+62
-1
lines changed

4 files changed

+62
-1
lines changed

README.md

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,18 @@ D SELECT get_tranco_rank('cloudflare.com') as rank;
255255
└─────────┘
256256
```
257257

258+
You can use the `get_tranco_rank_category` function to retrieve a domain's rank category from the `category` utility column. The `category` value is on a log10 scale with half steps (e.g. top 1k, top 5k, top 10k, top 50k, top 100k, top 500k, top 1M, top 5M, etc.), with each category excluding the previous ones (e.g. top 5k actually contains 4k domains, because it excludes the top 1k).
259+
260+
```sql
261+
D SELECT get_tranco_rank_category('microsoft.com') as category;
262+
┌──────────┐
263+
│ category │
264+
│ varchar  │
265+
├──────────┤
266+
│ top1k │
267+
└──────────┘
268+
```
269+
258270
### Get Extension Version
259271

260272
You can use the `netquack_version` function to get the version of the extension.
@@ -273,7 +285,6 @@ D select * from netquack_version();
273285

274286
- [ ] Create a `TableFunction` for `extract_query_parameters` that returns each key-value pair as a row.
275287
- [ ] Save Tranco data as Parquet
276-
- [ ] Create Rank category for Tranco ( `top1k` , `top5k`, `top10k`, `top100k`, `top500k`, `top1m` )
277288
- [ ] Implement GeoIP functionality
278289
- [ ] Add new functions to work with IPs
279290
- [ ] Return default value for `get_tranco_rank`

src/functions/get_tranco.cpp

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@ namespace duckdb
124124
" WHEN rank <= 1000 THEN 'top1k'"
125125
" WHEN rank <= 5000 THEN 'top5k'"
126126
" WHEN rank <= 10000 THEN 'top10k'"
127+
" WHEN rank <= 50000 THEN 'top50k'"
127128
" WHEN rank <= 100000 THEN 'top100k'"
128129
" WHEN rank <= 500000 THEN 'top500k'"
129130
" WHEN rank <= 1000000 THEN 'top1m'"
@@ -192,5 +193,44 @@ namespace duckdb
192193
}
193194
}
194195
}
196+
197+
// Function to get the Tranco rank category of a domain
198+
void GetTrancoRankCategoryFunction (DataChunk &args, ExpressionState &state, Vector &result)
199+
{
200+
auto &db = *state.GetContext ().db;
201+
netquack::LoadPublicSuffixList (db, false);
202+
Connection con (db);
203+
204+
auto table_exists = con.Query ("SELECT 1 FROM information_schema.tables WHERE table_name = 'tranco_list'");
205+
206+
if (table_exists->RowCount () == 0)
207+
{
208+
throw std::runtime_error ("Tranco table not found. Download it first using `SELECT update_tranco(true);`");
209+
}
210+
211+
// Extract the input from the arguments
212+
auto &input_vector = args.data[0];
213+
auto result_data = FlatVector::GetData<string_t> (result);
214+
215+
for (idx_t i = 0; i < args.size (); i++)
216+
{
217+
auto input = input_vector.GetValue (i).ToString ();
218+
std::transform (input.begin (), input.end (), input.begin (), ::tolower);
219+
220+
try
221+
{
222+
auto query = "SELECT category FROM tranco_list WHERE domain = '" + input + "'";
223+
224+
auto query_result = con.Query (query);
225+
auto category = query_result->RowCount () > 0 ? query_result->GetValue (0, 0) : Value ();
226+
227+
result_data[i] = StringVector::AddString (result, category.ToString ());
228+
}
229+
catch (const std::exception &e)
230+
{
231+
result_data[i] = "Error extracting tranco category: " + std::string (e.what ());
232+
}
233+
}
234+
}
195235
} // namespace netquack
196236
} // namespace duckdb

src/functions/get_tranco.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,5 +11,8 @@ namespace duckdb
1111

1212
// Function to get the Tranco rank of a domain
1313
void GetTrancoRankFunction (DataChunk &args, ExpressionState &state, Vector &result);
14+
15+
// Scalar function callback that gets the Tranco rank category of a domain.
// Reads the domains from the first column of `args` and writes one string per row into `result`.
// Throws std::runtime_error if the `tranco_list` table has not been downloaded yet.
16+
void GetTrancoRankCategoryFunction (DataChunk &args, ExpressionState &state, Vector &result);
1417
} // namespace netquack
1518
} // namespace duckdb

src/netquack_extension.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,13 @@ namespace duckdb
9999
netquack::GetTrancoRankFunction);
100100
ExtensionUtil::RegisterFunction (instance, get_tranco_rank_function);
101101

102+
auto get_tranco_rank_category_function = ScalarFunction (
103+
"get_tranco_rank_category",
104+
{ LogicalType::VARCHAR },
105+
LogicalType::VARCHAR,
106+
netquack::GetTrancoRankCategoryFunction);
107+
ExtensionUtil::RegisterFunction (instance, get_tranco_rank_category_function);
108+
102109
auto version_function = TableFunction (
103110
"netquack_version",
104111
{},

0 commit comments

Comments
 (0)