Skip to content

Commit 82310dd

Browse files
authored
Merge pull request #129 from hafenkran/fix_scan_projection_ordering
Fix bigquery_scan Projection Ordering
2 parents 121634e + 95e339d commit 82310dd

File tree

3 files changed

+268
-19
lines changed

3 files changed

+268
-19
lines changed

src/bigquery_arrow_scan.cpp

Lines changed: 55 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#include "duckdb/common/arrow/arrow.hpp"
33
#include "duckdb/common/arrow/arrow_wrapper.hpp"
44
#include "duckdb/common/atomic.hpp"
5+
#include "duckdb/common/limits.hpp"
56
#include "duckdb/function/table/arrow.hpp"
67
#include "duckdb/main/attached_database.hpp"
78
#include "duckdb/main/database_manager.hpp"
@@ -14,8 +15,7 @@
1415

1516
#include <arrow/c/bridge.h>
1617
#include <arrow/util/iterator.h>
17-
#include <iostream>
18-
#include <limits>
18+
#include <unordered_map>
1919

2020
namespace duckdb {
2121
namespace bigquery {
@@ -58,7 +58,6 @@ unique_ptr<FunctionData> BigqueryArrowScanFunction::BigqueryArrowScanBind(Client
5858
if (!status.ok()) {
5959
throw BinderException("Arrow schema export failed: " + status.ToString());
6060
}
61-
6261
// Convert Arrow schema to DuckDB types and names
6362
vector<LogicalType> mapped_bq_types; // original physical types if casts required
6463
BigqueryUtils::PopulateAndMapArrowTableTypes(context,
@@ -161,30 +160,68 @@ unique_ptr<GlobalTableFunctionState> BigqueryArrowScanFunction::BigqueryArrowSca
161160
gstate->scanned_types.emplace_back(bind_data.mapped_bq_types[col_id]);
162161
}
163162
}
164-
165-
bool needs_projection = false;
166-
for (idx_t out_idx = 0; out_idx < input.projection_ids.size(); ++out_idx) {
167-
idx_t proj_id = input.projection_ids[out_idx];
168-
idx_t col_id = input.column_ids[proj_id];
163+
// Map each requested column (in column_ids order) to its physical index in the Arrow schema
164+
vector<idx_t> column_physical_positions;
165+
column_physical_positions.reserve(input.column_ids.size());
166+
const idx_t invalid_phys_idx = NumericLimits<idx_t>::Maximum();
167+
bool contains_rowid = false;
168+
for (idx_t col_pos = 0; col_pos < input.column_ids.size(); ++col_pos) {
169+
idx_t col_id = input.column_ids[col_pos];
169170
if (col_id == COLUMN_IDENTIFIER_ROW_ID || col_id < 0) {
171+
contains_rowid = true;
172+
column_physical_positions.push_back(invalid_phys_idx);
170173
continue;
171174
}
172-
173175
const string &name = bind_data.names[col_id];
174176
idx_t phys_idx = static_cast<idx_t>(arrow_schema->GetFieldIndex(name));
175177
if (phys_idx == static_cast<idx_t>(-1)) {
176178
throw InternalException("Column '" + name + "' not found in Arrow schema");
177179
}
178-
179-
gstate->projection_ids.push_back(phys_idx);
180-
if (phys_idx != out_idx) {
181-
needs_projection = true;
182-
}
180+
column_physical_positions.push_back(phys_idx);
183181
}
184182

185-
// Clear projection IDs if no reordering is needed
186-
if (!needs_projection) {
187-
gstate->projection_ids.clear();
183+
bool requires_physical_reorder = false;
184+
vector<idx_t> projection_mapping;
185+
if (!input.projection_ids.empty()) {
186+
projection_mapping.reserve(input.projection_ids.size());
187+
for (idx_t out_idx = 0; out_idx < input.projection_ids.size(); ++out_idx) {
188+
idx_t col_pos = input.projection_ids[out_idx];
189+
D_ASSERT(col_pos < column_physical_positions.size());
190+
idx_t phys_idx = column_physical_positions[col_pos];
191+
if (phys_idx == invalid_phys_idx) {
192+
contains_rowid = true;
193+
requires_physical_reorder = true;
194+
break;
195+
}
196+
projection_mapping.push_back(phys_idx);
197+
if (phys_idx != col_pos) {
198+
requires_physical_reorder = true;
199+
}
200+
}
201+
if (!contains_rowid) {
202+
gstate->projection_ids = std::move(projection_mapping);
203+
} else {
204+
gstate->projection_ids.clear();
205+
}
206+
} else {
207+
projection_mapping.reserve(column_physical_positions.size());
208+
for (idx_t out_idx = 0; out_idx < column_physical_positions.size(); ++out_idx) {
209+
idx_t phys_idx = column_physical_positions[out_idx];
210+
if (phys_idx == invalid_phys_idx) {
211+
contains_rowid = true;
212+
requires_physical_reorder = true;
213+
break;
214+
}
215+
projection_mapping.push_back(phys_idx);
216+
if (phys_idx != out_idx && phys_idx != COLUMN_IDENTIFIER_ROW_ID && phys_idx >= 0) {
217+
requires_physical_reorder = true;
218+
}
219+
}
220+
if (!contains_rowid && requires_physical_reorder) {
221+
gstate->projection_ids = std::move(projection_mapping);
222+
} else {
223+
gstate->projection_ids.clear();
224+
}
188225
}
189226

190227
// Create the Arrow scan stream
@@ -208,10 +245,9 @@ unique_ptr<LocalTableFunctionState> BigqueryArrowScanFunction::BigqueryArrowScan
208245

209246
result->column_ids = sorted_column_ids;
210247
result->filters = input.filters.get();
211-
212248
if (!bind_data.projection_pushdown_enabled) {
213249
result->column_ids.clear();
214-
} else if (!input.projection_ids.empty() || bind_data.requires_cast) {
250+
} else if (!input.projection_ids.empty() || bind_data.requires_cast || !global_state.projection_ids.empty()) {
215251
auto &asgs = global_state_p->Cast<ArrowScanGlobalState>();
216252
result->all_columns.Initialize(client_context, asgs.scanned_types);
217253
}
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
# name: test/sql/bigquery/function_bigquery_attach.test
2+
# description: Validate BigQuery ATTACH-based scans respect projection ordering, filtering, and legacy mode
3+
# group: [bigquery]
4+
5+
require bigquery
6+
7+
require-env BQ_TEST_PROJECT
8+
9+
require-env BQ_TEST_DATASET
10+
11+
statement ok
12+
FROM bigquery_execute('${BQ_TEST_PROJECT}', '
13+
CREATE OR REPLACE TABLE `${BQ_TEST_PROJECT}.${BQ_TEST_DATASET}.function_scan_test` AS
14+
SELECT 1 AS a, 2 AS b, "alpha" AS c
15+
UNION ALL
16+
SELECT 3 AS a, 4 AS b, "beta" AS c
17+
');
18+
19+
statement ok
20+
ATTACH 'project=${BQ_TEST_PROJECT}' AS bq (TYPE bigquery);
21+
22+
# Default (Arrow) path through attached catalog
23+
query III
24+
SELECT a, b, c FROM bq.${BQ_TEST_DATASET}.function_scan_test ORDER BY a;
25+
----
26+
1 2 alpha
27+
3 4 beta
28+
29+
query III
30+
SELECT c, a, b FROM bq.${BQ_TEST_DATASET}.function_scan_test ORDER BY a;
31+
----
32+
alpha 1 2
33+
beta 3 4
34+
35+
query I
36+
SELECT b FROM bq.${BQ_TEST_DATASET}.function_scan_test ORDER BY b;
37+
----
38+
2
39+
4
40+
41+
query II
42+
SELECT a AS first_a, a AS second_a FROM bq.${BQ_TEST_DATASET}.function_scan_test ORDER BY a;
43+
----
44+
1 1
45+
3 3
46+
47+
query I
48+
SELECT COUNT(*) FROM bq.${BQ_TEST_DATASET}.function_scan_test;
49+
----
50+
2
51+
52+
query II
53+
SELECT a, c FROM bq.${BQ_TEST_DATASET}.function_scan_test WHERE b = 4;
54+
----
55+
3 beta
56+
57+
statement ok
58+
SET bq_use_legacy_scan = true;
59+
60+
# Legacy mode should match Arrow behaviour for attached catalog reads
61+
query III
62+
SELECT a, b, c FROM bq.${BQ_TEST_DATASET}.function_scan_test ORDER BY a;
63+
----
64+
1 2 alpha
65+
3 4 beta
66+
67+
query III
68+
SELECT c, a, b FROM bq.${BQ_TEST_DATASET}.function_scan_test ORDER BY a;
69+
----
70+
alpha 1 2
71+
beta 3 4
72+
73+
query I
74+
SELECT b FROM bq.${BQ_TEST_DATASET}.function_scan_test ORDER BY b;
75+
----
76+
2
77+
4
78+
79+
query II
80+
SELECT a AS first_a, a AS second_a FROM bq.${BQ_TEST_DATASET}.function_scan_test ORDER BY a;
81+
----
82+
1 1
83+
3 3
84+
85+
query I
86+
SELECT COUNT(*) FROM bq.${BQ_TEST_DATASET}.function_scan_test;
87+
----
88+
2
89+
90+
query II
91+
SELECT a, c FROM bq.${BQ_TEST_DATASET}.function_scan_test WHERE b = 4;
92+
----
93+
3 beta
94+
95+
statement ok
96+
SET bq_use_legacy_scan = false;
97+
98+
statement ok
99+
DETACH bq;
100+
101+
statement ok
102+
FROM bigquery_execute('${BQ_TEST_PROJECT}', 'DROP TABLE `${BQ_TEST_DATASET}.function_scan_test`');
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
# name: test/sql/bigquery/function_bigquery_scan.test
2+
# description: Comprehensive coverage for bigquery_scan column projection, ordering, and filtering
3+
# group: [bigquery]
4+
5+
require bigquery
6+
7+
require-env BQ_TEST_PROJECT
8+
9+
require-env BQ_TEST_DATASET
10+
11+
statement ok
12+
FROM bigquery_execute('${BQ_TEST_PROJECT}', '
13+
CREATE OR REPLACE TABLE `${BQ_TEST_PROJECT}.${BQ_TEST_DATASET}.function_scan_test` AS
14+
SELECT 1 AS a, 2 AS b, "alpha" AS c
15+
UNION ALL
16+
SELECT 3 AS a, 4 AS b, "beta" AS c
17+
');
18+
19+
# Default scan should preserve natural ordering and column layout
20+
query III
21+
SELECT a, b, c FROM bigquery_scan('${BQ_TEST_PROJECT}.${BQ_TEST_DATASET}.function_scan_test') ORDER BY a;
22+
----
23+
1 2 alpha
24+
3 4 beta
25+
26+
# Reordered projection must follow requested order
27+
query III
28+
SELECT c, a, b FROM bigquery_scan('${BQ_TEST_PROJECT}.${BQ_TEST_DATASET}.function_scan_test') ORDER BY a;
29+
----
30+
alpha 1 2
31+
beta 3 4
32+
33+
# Subset of columns should match requested names and values
34+
query I
35+
SELECT b FROM bigquery_scan('${BQ_TEST_PROJECT}.${BQ_TEST_DATASET}.function_scan_test') ORDER BY b;
36+
----
37+
2
38+
4
39+
40+
# Duplicate projections must be consistent
41+
query II
42+
SELECT a AS first_a, a AS second_a FROM bigquery_scan('${BQ_TEST_PROJECT}.${BQ_TEST_DATASET}.function_scan_test') ORDER BY a;
43+
----
44+
1 1
45+
3 3
46+
47+
# Aggregations rely on implicit ROWID handling
48+
query I
49+
SELECT COUNT(*) FROM bigquery_scan('${BQ_TEST_PROJECT}.${BQ_TEST_DATASET}.function_scan_test');
50+
----
51+
2
52+
53+
# Filters should round-trip expected rows
54+
query II
55+
SELECT a, c FROM bigquery_scan('${BQ_TEST_PROJECT}.${BQ_TEST_DATASET}.function_scan_test') WHERE b = 4;
56+
----
57+
3 beta
58+
59+
statement ok
60+
SET bq_use_legacy_scan = true;
61+
62+
# Legacy scan should mirror Arrow-based behaviour
63+
query III
64+
SELECT a, b, c FROM bigquery_scan('${BQ_TEST_PROJECT}.${BQ_TEST_DATASET}.function_scan_test') ORDER BY a;
65+
----
66+
1 2 alpha
67+
3 4 beta
68+
69+
query III
70+
SELECT c, a, b FROM bigquery_scan('${BQ_TEST_PROJECT}.${BQ_TEST_DATASET}.function_scan_test') ORDER BY a;
71+
----
72+
alpha 1 2
73+
beta 3 4
74+
75+
query I
76+
SELECT b FROM bigquery_scan('${BQ_TEST_PROJECT}.${BQ_TEST_DATASET}.function_scan_test') ORDER BY b;
77+
----
78+
2
79+
4
80+
81+
query II
82+
SELECT a AS first_a, a AS second_a FROM bigquery_scan('${BQ_TEST_PROJECT}.${BQ_TEST_DATASET}.function_scan_test') ORDER BY a;
83+
----
84+
1 1
85+
3 3
86+
87+
query I
88+
SELECT COUNT(*) FROM bigquery_scan('${BQ_TEST_PROJECT}.${BQ_TEST_DATASET}.function_scan_test');
89+
----
90+
2
91+
92+
query II
93+
SELECT a, c FROM bigquery_scan('${BQ_TEST_PROJECT}.${BQ_TEST_DATASET}.function_scan_test') WHERE b = 4;
94+
----
95+
3 beta
96+
97+
statement ok
98+
SET bq_use_legacy_scan = false;
99+
100+
statement ok
101+
ATTACH 'project=${BQ_TEST_PROJECT}' AS bq (TYPE bigquery);
102+
103+
# Catalog scan goes through same projection handling
104+
query III
105+
SELECT c, a, b FROM bq.${BQ_TEST_DATASET}.function_scan_test ORDER BY a;
106+
----
107+
alpha 1 2
108+
beta 3 4
109+
110+
statement ok
111+
FROM bigquery_execute('${BQ_TEST_PROJECT}', 'DROP TABLE `${BQ_TEST_DATASET}.function_scan_test`');

0 commit comments

Comments
 (0)