-
Notifications
You must be signed in to change notification settings - Fork 1.7k
Support data source sampling with TABLESAMPLE #16325
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 3 commits
c68cce9
dc1d326
af54350
43e66f7
123f4bb
9d5c681
e4cba7f
98e205a
a133380
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4714,3 +4714,115 @@ fn test_using_join_wildcard_schema() { | |
| ] | ||
| ); | ||
| } | ||
|
|
||
| #[test] | ||
|
||
| fn select_tablesample_value() { | ||
| let sql = "SELECT count(*) | ||
| FROM person | ||
| TABLESAMPLE 42 | ||
| WHERE id > 5"; | ||
| let plan = logical_plan(sql).unwrap(); | ||
| assert_snapshot!( | ||
| plan, | ||
| @r#" | ||
| Projection: count(*) | ||
| Aggregate: groupBy=[[]], aggr=[[count(*)]] | ||
| Filter: random() < Int64(42) / Float64(100) | ||
| Filter: person.id > Int64(5) | ||
| TableScan: person | ||
| "# | ||
| ); | ||
| } | ||
|
|
||
| #[test] | ||
| fn select_tablesample_value_float() { | ||
| let sql = "SELECT count(*) | ||
| FROM person | ||
| TABLESAMPLE 42.3 | ||
| WHERE id > 5"; | ||
| let plan = logical_plan(sql).unwrap(); | ||
| assert_snapshot!( | ||
| plan, | ||
| @r#" | ||
| Projection: count(*) | ||
| Aggregate: groupBy=[[]], aggr=[[count(*)]] | ||
| Filter: random() < Float64(42.3) / Float64(100) | ||
| Filter: person.id > Int64(5) | ||
| TableScan: person | ||
| "# | ||
| ); | ||
| } | ||
|
|
||
| #[test] | ||
| fn select_tablesample_percent() { | ||
| let sql = "SELECT count(*) | ||
| FROM person | ||
| TABLESAMPLE SYSTEM (42 PERCENT) | ||
| WHERE id > 5"; | ||
| let plan = logical_plan(sql).unwrap(); | ||
| assert_snapshot!( | ||
| plan, | ||
| @r#" | ||
| Projection: count(*) | ||
| Aggregate: groupBy=[[]], aggr=[[count(*)]] | ||
| Filter: random() < Int64(42) / Float64(100) | ||
| Filter: person.id > Int64(5) | ||
| TableScan: person | ||
| "# | ||
| ); | ||
| } | ||
|
|
||
| #[test] | ||
| fn select_sample() { | ||
| let sql = "SELECT count(*) | ||
| FROM person | ||
| SAMPLE 0.42 | ||
| WHERE id > 5"; | ||
| let plan = logical_plan(sql).unwrap(); | ||
| assert_snapshot!( | ||
| plan, | ||
| @r#" | ||
| Projection: count(*) | ||
| Aggregate: groupBy=[[]], aggr=[[count(*)]] | ||
| Filter: random() < Float64(0.42) | ||
| Filter: person.id > Int64(5) | ||
| TableScan: person | ||
| "# | ||
| ); | ||
| } | ||
|
|
||
| #[test] | ||
| fn select_sample_rows_unsupported() { | ||
| let sql = "SELECT count(*) | ||
| FROM person | ||
| TABLESAMPLE (5 ROWS)"; | ||
| let err = logical_plan(sql); | ||
| assert_contains!( | ||
| err.unwrap_err().to_string(), | ||
| "Table sample with rows unit is not supported" | ||
| ); | ||
| } | ||
|
|
||
| #[test] | ||
| fn select_sample_bucket_unsupported() { | ||
| let sql = "SELECT count(*) | ||
| FROM person | ||
| TABLESAMPLE (BUCKET 3 OUT OF 16 ON id)"; | ||
| let err = logical_plan(sql); | ||
| assert_contains!( | ||
| err.unwrap_err().to_string(), | ||
| "Table sample bucket is not supported" | ||
| ); | ||
| } | ||
|
|
||
| #[test] | ||
| fn select_sample_seed_unsupported() { | ||
| let sql = "SELECT count(*) | ||
| FROM person | ||
| TABLESAMPLE SYSTEM (3) REPEATABLE (82)"; | ||
| let err = logical_plan(sql); | ||
| assert_contains!( | ||
| err.unwrap_err().to_string(), | ||
| "Table sample seed is not supported" | ||
| ); | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it would be better to do this rewrite in a separate logical optimizer rule, to keep the planning code clean. That can be done in a follow-up PR, before more functionality is added to scan sampling.
(Unless there's a specific reason to do this during the planning phase — I did notice some rewrites happening during planning, but I'm not sure why.)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A "logical optimizer rule" is mainly focused on optimization, so I think it's fair to do this rewrite during the planning phase.