
Commit e6efc12

feat: first shot at catalog ATTACH
feat: add batch_size parameter to ATTACH, null validation across all functions, update docs
fix: add create schema support
1 parent 53518a5 commit e6efc12

31 files changed, +1897 −18 lines changed

CLAUDE.md

Lines changed: 33 additions & 0 deletions
@@ -100,6 +100,39 @@ A manifest file is a TOML file (e.g., `sqlite.toml`) containing driver metadata
 - `adbc_columns(handle, [table_name := ...])` - Returns column metadata (name, type, ordinal position, nullability).
 - `adbc_schema(handle, table_name)` - Returns the Arrow schema for a specific table (field names, Arrow types, nullability).
 
+### Storage Extension (ATTACH)
+
+The extension also provides a storage extension that allows attaching ADBC data sources as DuckDB databases. This enables querying remote tables using standard SQL syntax without explicit function calls.
+
+```sql
+-- Attach an ADBC data source
+ATTACH 'path/to/database.db' AS my_db (TYPE adbc, driver 'sqlite');
+
+-- Query tables directly
+SELECT * FROM my_db.my_table;
+```
+
+**ATTACH options:**
+- `driver` (required) - Driver name, path to shared library, or manifest name
+- `entrypoint` - Custom entry point function name
+- `search_paths` - Additional paths to search for driver manifests
+- `use_manifests` - Enable/disable manifest search (default: 'true')
+- `batch_size` - Hint for number of rows per batch when scanning tables (default: driver-specific). Larger batch sizes can reduce network round-trips for remote databases.
+- Other options are passed directly to the ADBC driver (e.g., `username`, `password`)
+
+**Examples:**
+```sql
+-- Attach SQLite database
+ATTACH '/path/to/mydb.sqlite' AS sqlite_db (TYPE adbc, driver 'sqlite');
+
+-- Attach with custom batch size (useful for network databases)
+ATTACH 'postgresql://localhost/mydb' AS pg_db (TYPE adbc, driver 'postgresql', batch_size 65536);
+
+-- Query attached tables
+SELECT * FROM pg_db.public.users WHERE id > 100;
+SELECT COUNT(*) FROM sqlite_db.main.orders;
+```
+
 ### Example Usage
 
 ```sql

CMakeLists.txt

Lines changed: 10 additions & 0 deletions
@@ -32,6 +32,16 @@ set(EXTENSION_SOURCES
     src/adbc_secrets.cpp
     src/adbc_filter_pushdown.cpp
     src/query_farm_telemetry.cpp
+    src/storage/adbc_catalog.cpp
+    src/storage/adbc_catalog_set.cpp
+    src/storage/adbc_schema_set.cpp
+    src/storage/adbc_schema_entry.cpp
+    src/storage/adbc_table_set.cpp
+    src/storage/adbc_table_entry.cpp
+    src/storage/adbc_transaction.cpp
+    src/storage/adbc_transaction_manager.cpp
+    src/storage/adbc_storage.cpp
+    src/storage/adbc_clear_cache.cpp
 )
 
 build_static_extension(${TARGET_NAME} ${EXTENSION_SOURCES})

docs/README.md

Lines changed: 77 additions & 0 deletions
@@ -726,6 +726,83 @@ DROP PERSISTENT SECRET my_postgres;
 - Persistent secrets are stored encrypted on disk
 - Secrets are scoped to the current DuckDB connection/session
 
+## Storage Extension (ATTACH)
+
+The ADBC Scanner extension provides a storage extension that allows you to attach ADBC data sources as DuckDB databases. This enables querying remote tables using standard SQL syntax without explicit function calls.
+
+### Basic Usage
+
+```sql
+-- Attach an ADBC data source
+ATTACH '/path/to/database.db' AS my_db (TYPE adbc, driver 'sqlite');
+
+-- Query tables directly using standard SQL
+SELECT * FROM my_db.my_table;
+SELECT * FROM my_db.main.users WHERE id > 100;
+
+-- Detach when done
+DETACH my_db;
+```
+
+### ATTACH Options
+
+| Option | Required | Description |
+|--------|----------|-------------|
+| `TYPE` | Yes | Must be `adbc` |
+| `driver` | Yes | Driver name (e.g., `'sqlite'`, `'postgresql'`), path to shared library, or manifest name |
+| `entrypoint` | No | Custom driver entry point function name |
+| `search_paths` | No | Additional paths to search for driver manifests |
+| `use_manifests` | No | Enable/disable manifest search (default: `'true'`) |
+| `batch_size` | No | Hint for number of rows per batch when scanning tables (default: driver-specific). Larger batch sizes can reduce network round-trips for remote databases. |
+
+Any other options are passed directly to the ADBC driver (e.g., `username`, `password`).
+
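
The `entrypoint` and `use_manifests` options from the table above are not covered by the examples below, so here is a rough sketch of a non-manifest setup; the library path and entry point name are placeholders, not values shipped with the extension:

```sql
-- Load a driver straight from a shared library, bypassing manifest lookup
ATTACH 'mydata.db' AS direct_db (
    TYPE adbc,
    driver '/opt/adbc/libadbc_driver_example.so',  -- placeholder path
    entrypoint 'AdbcDriverExampleInit',            -- placeholder entry point name
    use_manifests 'false'
);
```
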
+### Examples
+
+```sql
+-- Attach a SQLite database
+ATTACH '/path/to/mydb.sqlite' AS sqlite_db (TYPE adbc, driver 'sqlite');
+
+-- Attach PostgreSQL with credentials
+ATTACH 'postgresql://localhost/mydb' AS pg_db (
+    TYPE adbc,
+    driver 'postgresql',
+    username 'user',
+    password 'secret'
+);
+
+-- Attach with custom batch size (useful for network databases)
+ATTACH 'postgresql://localhost/mydb' AS pg_db (
+    TYPE adbc,
+    driver 'postgresql',
+    batch_size 65536
+);
+
+-- Query attached databases
+SELECT * FROM pg_db.public.users WHERE active = true;
+SELECT COUNT(*) FROM sqlite_db.main.orders;
+
+-- Join tables from different attached databases
+SELECT u.name, o.total
+FROM pg_db.public.users u
+JOIN sqlite_db.main.orders o ON u.id = o.user_id;
+```
+
+### Features
+
+When querying attached ADBC tables, the following optimizations are automatically applied:
+
+- **Projection pushdown**: Only requested columns are fetched from the remote database
+- **Filter pushdown**: WHERE clauses are pushed to the remote database with parameter binding
+- **Cardinality estimation**: Row count statistics are used for query planning
+- **Progress reporting**: Scan progress is reported based on estimated row counts
+
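
To make the first two bullets concrete, a sketch of a query that benefits from both, using the `pg_db` attachment from the examples above:

```sql
-- Only `name` is in the projection and the `id` predicate is pushable,
-- so the remote scan should transfer a single filtered column rather
-- than the whole users table.
SELECT name
FROM pg_db.public.users
WHERE id > 100;
```
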
+### Limitations
+
+- Attached ADBC databases are read-only; INSERT, UPDATE, and DELETE operations are not supported through the ATTACH interface (use `adbc_execute` instead)
+- Schema creation and modification are not supported
+- The connection remains open while the database is attached
+
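
For writes, the limitation above points at `adbc_execute`; a rough sketch of that path (the literal `1` stands in for a previously created ADBC connection handle, which ATTACH does not provide):

```sql
-- Runs the statement on the remote database and returns the number of
-- affected rows; replace 1 with a real connection handle.
SELECT adbc_execute(1, 'UPDATE orders SET status = ''shipped'' WHERE id = 42');
```
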
 ## ADBC Drivers
 
 ADBC drivers are available for many databases. When using driver manifests (see below), you can reference drivers by their short name:

src/adbc_catalog.cpp

Lines changed: 31 additions & 0 deletions
@@ -60,6 +60,11 @@ static unique_ptr<FunctionData> AdbcInfoBind(ClientContext &context, TableFuncti
         vector<LogicalType> &return_types, vector<string> &names) {
     auto bind_data = make_uniq<AdbcInfoBindData>();
 
+    // Check for NULL connection handle
+    if (input.inputs[0].IsNull()) {
+        throw InvalidInputException("adbc_info: Connection handle cannot be NULL");
+    }
+
     bind_data->connection_id = input.inputs[0].GetValue<int64_t>();
 
     auto &registry = ConnectionRegistry::Get();
@@ -344,6 +349,11 @@ static unique_ptr<FunctionData> AdbcTablesBind(ClientContext &context, TableFunc
         vector<LogicalType> &return_types, vector<string> &names) {
     auto bind_data = make_uniq<AdbcTablesBindData>();
 
+    // Check for NULL connection handle
+    if (input.inputs[0].IsNull()) {
+        throw InvalidInputException("adbc_tables: Connection handle cannot be NULL");
+    }
+
     bind_data->connection_id = input.inputs[0].GetValue<int64_t>();
 
     // Check for optional filter parameters
@@ -497,6 +507,11 @@ static unique_ptr<FunctionData> AdbcTableTypesBind(ClientContext &context, Table
         vector<LogicalType> &return_types, vector<string> &names) {
     auto bind_data = make_uniq<AdbcTableTypesBindData>();
 
+    // Check for NULL connection handle
+    if (input.inputs[0].IsNull()) {
+        throw InvalidInputException("adbc_table_types: Connection handle cannot be NULL");
+    }
+
     bind_data->connection_id = input.inputs[0].GetValue<int64_t>();
 
     auto &registry = ConnectionRegistry::Get();
@@ -809,6 +824,11 @@ static unique_ptr<FunctionData> AdbcColumnsBind(ClientContext &context, TableFun
         vector<LogicalType> &return_types, vector<string> &names) {
     auto bind_data = make_uniq<AdbcColumnsBindData>();
 
+    // Check for NULL connection handle
+    if (input.inputs[0].IsNull()) {
+        throw InvalidInputException("adbc_columns: Connection handle cannot be NULL");
+    }
+
     bind_data->connection_id = input.inputs[0].GetValue<int64_t>();
 
     // Check for optional filter parameters
@@ -1131,7 +1151,18 @@ static unique_ptr<FunctionData> AdbcSchemaBind(ClientContext &context, TableFunc
         vector<LogicalType> &return_types, vector<string> &names) {
     auto bind_data = make_uniq<AdbcSchemaBindData>();
 
+    // Check for NULL connection handle
+    if (input.inputs[0].IsNull()) {
+        throw InvalidInputException("adbc_schema: Connection handle cannot be NULL");
+    }
+
     bind_data->connection_id = input.inputs[0].GetValue<int64_t>();
+
+    // Check for NULL table name
+    if (input.inputs[1].IsNull()) {
+        throw InvalidInputException("adbc_schema: Table name cannot be NULL");
+    }
+
     bind_data->table_name = input.inputs[1].GetValue<string>();
 
     // Check for optional filter parameters
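
The user-visible effect of these bind-time checks is an immediate error instead of a silent bind against a NULL handle; roughly:

```sql
-- Each catalog function now rejects a NULL connection handle at bind time
SELECT * FROM adbc_tables(NULL);
-- raises InvalidInputException: "adbc_tables: Connection handle cannot be NULL"
```
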

src/adbc_scan.cpp

Lines changed: 77 additions & 18 deletions
@@ -362,9 +362,19 @@ static unique_ptr<FunctionData> AdbcScanBind(ClientContext &context, TableFuncti
         vector<LogicalType> &return_types, vector<string> &names) {
     auto bind_data = make_uniq<AdbcScanBindData>();
 
+    // Check for NULL connection handle
+    if (input.inputs[0].IsNull()) {
+        throw InvalidInputException("adbc_scan: Connection handle cannot be NULL");
+    }
+
     // Get connection ID from first argument
     bind_data->connection_id = input.inputs[0].GetValue<int64_t>();
 
+    // Check for NULL query
+    if (input.inputs[1].IsNull()) {
+        throw InvalidInputException("adbc_scan: Query cannot be NULL");
+    }
+
     // Get SQL query from second argument
     bind_data->query = input.inputs[1].GetValue<string>();
 
@@ -724,9 +734,19 @@ static unique_ptr<FunctionData> AdbcScanTableBind(ClientContext &context, TableF
         vector<LogicalType> &return_types, vector<string> &names) {
     auto bind_data = make_uniq<AdbcScanBindData>();
 
+    // Check for NULL connection handle
+    if (input.inputs[0].IsNull()) {
+        throw InvalidInputException("adbc_scan_table: Connection handle cannot be NULL");
+    }
+
     // Get connection ID from first argument
     bind_data->connection_id = input.inputs[0].GetValue<int64_t>();
 
+    // Check for NULL table name
+    if (input.inputs[1].IsNull()) {
+        throw InvalidInputException("adbc_scan_table: Table name cannot be NULL");
+    }
+
     // Get table name from second argument
     bind_data->table_name = input.inputs[1].GetValue<string>();
 
@@ -861,17 +881,30 @@ static unique_ptr<GlobalTableFunctionState> AdbcScanTableInitGlobal(ClientContex
     // If we have column_ids and they're a subset of all columns, build a projected query
     string query;
     bool needs_projection = false;
-    if (!bind_data.table_name.empty() && !input.column_ids.empty()) {
+
+    // Count how many valid column IDs we have (excluding special IDs like COLUMN_IDENTIFIER_ROW_ID)
+    idx_t valid_column_count = 0;
+    for (auto col_id : input.column_ids) {
+        if (col_id < bind_data.all_column_names.size()) {
+            valid_column_count++;
+        }
+    }
+
+    if (!bind_data.table_name.empty() && valid_column_count > 0) {
         // Check if we need all columns or just a subset
-        needs_projection = input.column_ids.size() < bind_data.all_column_names.size();
+        needs_projection = valid_column_count < bind_data.all_column_names.size();
 
         // Also check if columns are in order and consecutive from 0
         // If column_ids = [0, 1, 2, ...] matching all_column_names size, no projection needed
        if (!needs_projection) {
-            for (idx_t i = 0; i < input.column_ids.size(); i++) {
-                if (input.column_ids[i] != i) {
-                    needs_projection = true;
-                    break;
+            idx_t expected_idx = 0;
+            for (auto col_id : input.column_ids) {
+                if (col_id < bind_data.all_column_names.size()) {
+                    if (col_id != expected_idx) {
+                        needs_projection = true;
+                        break;
+                    }
+                    expected_idx++;
                }
            }
        }
@@ -898,7 +931,8 @@ static unique_ptr<GlobalTableFunctionState> AdbcScanTableInitGlobal(ClientContex
             query = bind_data.query;
         }
     } else {
-        // Fall back to original query
+        // No valid columns requested (e.g., count(*)) or empty column_ids - use SELECT *
+        // We need to select something, so fall back to original query
         query = bind_data.query;
     }
 
@@ -1359,26 +1393,38 @@ static void AdbcExecuteFunction(DataChunk &args, ExpressionState &state, Vector
     // Handle constant input (for constant folding optimization)
     if (conn_vector.GetVectorType() == VectorType::CONSTANT_VECTOR &&
         query_vector.GetVectorType() == VectorType::CONSTANT_VECTOR) {
-        if (ConstantVector::IsNull(conn_vector) || ConstantVector::IsNull(query_vector)) {
-            result.SetVectorType(VectorType::CONSTANT_VECTOR);
-            ConstantVector::SetNull(result, true);
-        } else {
-            auto connection_id = conn_vector.GetValue(0).GetValue<int64_t>();
-            auto query = query_vector.GetValue(0).GetValue<string>();
-            auto rows_affected = ExecuteStatement(connection_id, query);
-            result.SetVectorType(VectorType::CONSTANT_VECTOR);
-            ConstantVector::GetData<int64_t>(result)[0] = rows_affected;
+        if (ConstantVector::IsNull(conn_vector)) {
+            throw InvalidInputException("adbc_execute: Connection handle cannot be NULL");
+        }
+        if (ConstantVector::IsNull(query_vector)) {
+            throw InvalidInputException("adbc_execute: Query cannot be NULL");
         }
+        auto connection_id = conn_vector.GetValue(0).GetValue<int64_t>();
+        auto query = query_vector.GetValue(0).GetValue<string>();
+        auto rows_affected = ExecuteStatement(connection_id, query);
+        result.SetVectorType(VectorType::CONSTANT_VECTOR);
+        ConstantVector::GetData<int64_t>(result)[0] = rows_affected;
        return;
    }
 
     // Handle flat/dictionary vectors
     result.SetVectorType(VectorType::FLAT_VECTOR);
     auto result_data = FlatVector::GetData<int64_t>(result);
+    auto &validity = FlatVector::Validity(result);
 
     for (idx_t row_idx = 0; row_idx < count; row_idx++) {
-        auto connection_id = conn_vector.GetValue(row_idx).GetValue<int64_t>();
-        auto query = query_vector.GetValue(row_idx).GetValue<string>();
+        auto conn_value = conn_vector.GetValue(row_idx);
+        auto query_value = query_vector.GetValue(row_idx);
+
+        if (conn_value.IsNull()) {
+            throw InvalidInputException("adbc_execute: Connection handle cannot be NULL");
+        }
+        if (query_value.IsNull()) {
+            throw InvalidInputException("adbc_execute: Query cannot be NULL");
+        }
+
+        auto connection_id = conn_value.GetValue<int64_t>();
+        auto query = query_value.GetValue<string>();
         result_data[row_idx] = ExecuteStatement(connection_id, query);
     }
 }
@@ -1394,6 +1440,8 @@ void RegisterAdbcExecuteFunction(DatabaseInstance &db) {
         AdbcExecuteFunction,
         AdbcExecuteBind
     );
+    // Disable automatic NULL propagation so we can throw a meaningful error
+    adbc_execute_function.null_handling = FunctionNullHandling::SPECIAL_HANDLING;
 
     CreateScalarFunctionInfo info(adbc_execute_function);
     FunctionDescription desc;
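
With `SPECIAL_HANDLING` set, a NULL argument no longer propagates to a NULL result; the function still runs and the explicit checks above throw, along the lines of:

```sql
-- Previously this quietly returned NULL; now it errors out
SELECT adbc_execute(NULL, 'DELETE FROM staging_orders');
-- raises InvalidInputException: "adbc_execute: Connection handle cannot be NULL"
```
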
@@ -1537,8 +1585,19 @@ static unique_ptr<FunctionData> AdbcInsertBind(ClientContext &context, TableFunc
         vector<LogicalType> &return_types, vector<string> &names) {
     auto bind_data = make_uniq<AdbcInsertBindData>();
 
+    // Check for NULL connection handle
+    if (input.inputs[0].IsNull()) {
+        throw InvalidInputException("adbc_insert: Connection handle cannot be NULL");
+    }
+
     // First argument is connection handle
     bind_data->connection_id = input.inputs[0].GetValue<int64_t>();
+
+    // Check for NULL table name
+    if (input.inputs[1].IsNull()) {
+        throw InvalidInputException("adbc_insert: Target table name cannot be NULL");
+    }
+
     // Second argument is target table name
     bind_data->target_table = input.inputs[1].GetValue<string>();
 
src/adbc_scanner_extension.cpp

Lines changed: 9 additions & 0 deletions
@@ -3,8 +3,10 @@
 #include "adbc_scanner_extension.hpp"
 #include "adbc_functions.hpp"
 #include "adbc_secrets.hpp"
+#include "storage/adbc_storage.hpp"
 #include "duckdb.hpp"
 #include "duckdb/common/exception.hpp"
+#include "duckdb/main/config.hpp"
 #include "query_farm_telemetry.hpp"
 
 namespace duckdb {
@@ -28,6 +30,13 @@ static void LoadInternal(ExtensionLoader &loader) {
     // Register ADBC insert function (adbc_insert for bulk ingestion)
     adbc::RegisterAdbcInsertFunction(loader.GetDatabaseInstance());
 
+    // Register ADBC clear cache function
+    RegisterAdbcClearCacheFunction(loader.GetDatabaseInstance());
+
+    // Register ADBC storage extension for ATTACH support
+    auto &config = DBConfig::GetConfig(loader.GetDatabaseInstance());
+    config.storage_extensions["adbc"] = make_uniq<AdbcStorageExtension>();
+
     QueryFarmSendTelemetry(loader, "adbc", "2025120801");
 }
