Merged
1 change: 1 addition & 0 deletions .gitignore
@@ -8,4 +8,5 @@ test/python/__pycache__/
.Rhistory
*.log
*.csv
public_suffix_list.dat
!test/data/*.csv
13 changes: 13 additions & 0 deletions .vscode/settings.json
@@ -1,4 +1,17 @@
{
"cmake.generator": "Ninja",
"cmake.sourceDirectory": "${workspaceFolder}",
"cmake.buildDirectory": "${workspaceFolder}/build/release",
"cmake.configureOnOpen": false,
// Use compile_commands.json generated by Ninja for accurate include paths and flags
"C_Cpp.default.compileCommands": "${workspaceFolder}/build/release/compile_commands.json",

"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}/build/release",
"--background-index",
"--enable-config",
"--suggest-missing-includes"
],
"cSpell.words": [
"duckdb",
"Hostroute",
4 changes: 4 additions & 0 deletions CMakeLists.txt
@@ -7,6 +7,10 @@ set(LOADABLE_EXTENSION_NAME ${TARGET_NAME}_loadable_extension)

project(${TARGET_NAME})

# Force C++17 standard for string_view support
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

set(EXTENSION_SOURCES ${EXTENSION_SOURCES})
include_directories(src/include)
file(GLOB_RECURSE EXTENSION_SOURCES src/*.cpp)
32 changes: 10 additions & 22 deletions README.md
@@ -8,6 +8,8 @@ This extension is designed to simplify working with domains, URIs, and web paths

With Netquack, you can unlock deeper insights from your web-related datasets without the need for external tools or complex workflows.

NetQuack uses ClickHouse-inspired character-by-character parsing and gperf-generated perfect hash functions for optimal performance.

Table of Contents

- [DuckDB Netquack Extension](#duckdb-netquack-extension)
@@ -55,9 +57,7 @@ Once installed, the [macro functions](https://duckdb.org/community_extensions/ex

### Extracting The Main Domain

This function extracts the main domain from a URL. For this purpose, the extension will get all public suffixes from the [publicsuffix.org](https://publicsuffix.org/) list and extract the main domain from the URL.

The download process of the public suffix list is done automatically when the function is called for the first time. After that, the list is stored in the `public_suffix_list` table to avoid downloading it again.
This function extracts the main domain from a URL using an optimized static TLD lookup system. The extension uses Mozilla's Public Suffix List compiled into a gperf-generated perfect hash function for O(1) TLD lookups with zero collisions.

```sql
D SELECT extract_domain('a.example.com') AS domain;
@@ -77,23 +77,7 @@ D SELECT extract_domain('https://b.a.example.com/path') AS domain;
└─────────────┘
```

You can use the `update_suffixes` function to update the public suffix list manually.

```sql
D SELECT update_suffixes();
┌───────────────────┐
│ update_suffixes() │
│ varchar │
├───────────────────┤
│ updated │
└───────────────────┘
```

> [!WARNING]
> This is a public service with a limited number of requests. If you call the function too many times, you may get a 403 error.
> `<?xml version='1.0' encoding='UTF-8'?><Error><Code>AccessDenied</Code><Message>Access denied.</Message></Error>`
> The list usually changes a few times per week; more frequent downloading will cause rate limiting.
> In this case, you can download the list manually from [publicsuffix.org](https://publicsuffix.org/) and save it in the `public_suffix_list` table.
The TLD lookup is built into the extension at compile time using the latest Mozilla Public Suffix List. No runtime downloads or database operations are required.

### Extracting The Path

@@ -234,7 +218,7 @@ D SELECT extract_extension('http://example.com/image.jpg') AS ext;

### Extracting The TLD (Top-Level Domain)

This function extracts the top-level domain from a URL. This function will use the public suffix list to extract the TLD. Check the [Extracting The Main Domain](#extracting-the-main-domain) section for more information about the public suffix list.
This function extracts the top-level domain from a URL using the optimized gperf-based public suffix lookup system. The function correctly handles multi-part TLDs (like `com.au`) using the longest-match algorithm from Mozilla's Public Suffix List.

```sql
D SELECT extract_tld('https://example.com.ac/path/path') AS tld;
@@ -256,7 +240,7 @@ D SELECT extract_tld('a.example.com') AS tld;

### Extracting The Sub Domain

This function extracts the sub-domain from a URL. This function will use the public suffix list to extract the TLD. Check the [Extracting The Main Domain](#extracting-the-main-domain) section for more information about the public suffix list.
This function extracts the sub-domain from a URL using the optimized public suffix lookup system to correctly identify the domain boundary and extract everything before it.

```sql
D SELECT extract_subdomain('http://a.b.example.com/path') AS dns_record;
@@ -398,6 +382,10 @@ D SELECT * FROM netquack_version();
└─────────┘
```

### 🛠 **Build Requirements**

- **gperf required**: Perfect hash generation requires `gperf` (install via `brew install gperf` or `apt-get install gperf`)

## Debugging

Debugging DuckDB extensions is not easy. For Netquack, we have created a log file in the current directory. The log file is named `netquack.log` and contains all the logs for the extension. You can use this file to debug your code.
4 changes: 2 additions & 2 deletions docs/functions/extract-domain.md
@@ -14,9 +14,9 @@ layout:

# Extract Domain

This function extracts the main domain from a URL. For this purpose, the extension will get all public suffixes from the [publicsuffix.org](https://publicsuffix.org/) list and extract the main domain from the URL.
This function extracts the main domain from a URL using an optimized static TLD lookup system. The extension uses Mozilla's Public Suffix List compiled into a gperf-generated perfect hash function for O(1) TLD lookups with zero collisions.

The download process of the public suffix list is done automatically when the function is called for the first time. After that, the list is stored in the `public_suffix_list` table to avoid downloading it again.
The TLD lookup is built into the extension at compile time using the latest Mozilla Public Suffix List. No runtime downloads or database operations are required.

```sql
D SELECT extract_domain('a.example.com') AS domain;
126 changes: 126 additions & 0 deletions scripts/benchmark.sh
@@ -0,0 +1,126 @@
#!/usr/bin/env bash

echo "=== NetQuack Performance Comparison ==="
echo ""

# Change to the project root directory (parent of script location)
CWD="$(cd "$(dirname "$0")/.." && pwd)"
cd "$CWD"

# Create results directory in build (gitignored)
mkdir -p build/benchmark_results

# Function to create benchmark SQL with specified load command
create_benchmark_sql() {
local load_command="$1"
local version_label="$2"
local output_file="$3"

cat > "$output_file" << EOF
${load_command}

-- Create test data
CREATE TABLE benchmark_urls AS SELECT * FROM (VALUES
('https://www.example.com/path?query=value#fragment'),
('http://subdomain.example.co.uk:8080/long/path/to/file.html?param1=value1&param2=value2'),
('ftp://user:[email protected]:21/directory/file.zip'),
('https://blog.example.org/2023/12/article-title.html'),
('http://api.service.example.net:3000/v1/users/123?format=json'),
('https://cdn.assets.example.com/images/logo.png'),
('mailto:[email protected]'),
('http://localhost:8080/debug'),
('https://secure.payment.example.gov:8443/transaction?id=abc123'),
('https://example.com')
) AS t(url);

-- Expand dataset
CREATE TABLE large_benchmark_urls AS
WITH RECURSIVE series(i) AS (
SELECT 1
UNION ALL
SELECT i + 1 FROM series WHERE i <= 3000
)
SELECT url FROM benchmark_urls CROSS JOIN series;

.timer on
.print '=== ${version_label} BENCHMARKS ==='

.print 'extract_schema:'
SELECT extract_schema(url) FROM large_benchmark_urls;

.print 'extract_host:'
SELECT extract_host(url) FROM large_benchmark_urls;

.print 'extract_port:'
SELECT extract_port(url) FROM large_benchmark_urls;

.print 'extract_path:'
SELECT extract_path(url) FROM large_benchmark_urls;

.print 'extract_query_string:'
SELECT extract_query_string(url) FROM large_benchmark_urls;

.print 'extract_domain:'
SELECT extract_domain(url) FROM large_benchmark_urls;

.print 'extract_subdomain:'
SELECT extract_subdomain(url) FROM large_benchmark_urls;

.print 'extract_tld:'
SELECT extract_tld(url) FROM large_benchmark_urls;

.print 'extract_extension:'
SELECT extract_extension(url) FROM large_benchmark_urls;
EOF
}

# Create temporary directory
mkdir -p build/tmp

# Create benchmark SQL files using the DRY function
create_benchmark_sql "FORCE INSTALL netquack FROM community; LOAD netquack;" "PUBLISHED VERSION" "build/tmp/published_benchmark.sql"
create_benchmark_sql "LOAD './build/release/extension/netquack/netquack.duckdb_extension';" "LOCAL VERSION" "build/tmp/local_benchmark.sql"

echo "Step 1: Installing and benchmarking PUBLISHED NetQuack extension..."
duckdb < build/tmp/published_benchmark.sql > build/benchmark_results/published_full_output.txt 2>&1

echo "Step 2: Running benchmarks on LOCAL implementation..."
./build/release/duckdb < build/tmp/local_benchmark.sql > build/benchmark_results/local_full_output.txt 2>&1

echo "Step 3: Generating comparison analysis..."

# Extract times and calculate improvements
echo "📊 RESULTS SUMMARY:" > build/benchmark_results/analysis.txt
echo "" >> build/benchmark_results/analysis.txt

echo "| Function | Published Time | Local Time | Improvement |" >> build/benchmark_results/analysis.txt
echo "|----------|----------------|----------------|-------------|" >> build/benchmark_results/analysis.txt

# Extract timing data using a more robust approach
functions=("extract_schema" "extract_host" "extract_port" "extract_path" "extract_query_string" "extract_domain" "extract_subdomain" "extract_tld" "extract_extension")

# Extract all times into temporary files
grep "Run Time" build/benchmark_results/published_full_output.txt | grep -o "real [0-9.]*" | cut -d' ' -f2 > build/tmp/published_times.txt
grep "Run Time" build/benchmark_results/local_full_output.txt | grep -o "real [0-9.]*" | cut -d' ' -f2 > build/tmp/local_times.txt

# Process each function
for i in {0..8}; do # 9 functions, indices 0-8
func=${functions[$i]}
line_num=$((i + 1))

published_time=$(sed -n "${line_num}p" build/tmp/published_times.txt)
local_time=$(sed -n "${line_num}p" build/tmp/local_times.txt)

if [ -n "$published_time" ] && [ -n "$local_time" ]; then
improvement=$(echo "scale=1; $published_time / $local_time" | bc -l)
echo "| ${func} | ${published_time}s | ${local_time}s | ${improvement}x faster |" >> build/benchmark_results/analysis.txt
fi
done

# Clean up timing files
rm -f build/tmp/published_times.txt build/tmp/local_times.txt

echo ""
echo "✅ Benchmark comparison complete!"
echo ""
cat build/benchmark_results/analysis.txt