Skip to content

Commit cbcc31e

Browse files
authored
Merge branch 'main' into bump-deps
2 parents 8cfb462 + 251cd2f commit cbcc31e

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

54 files changed

+1755
-407
lines changed

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,8 @@ docker run --net=host datafuselabs/databend
127127
- [How to Load Data from Remote Files](https://databend.rs/doc/load-data/http)
128128
- [How to Load Data from Amazon S3](https://databend.rs/doc/load-data/s3)
129129
- [How to Load Data from Databend Stages](https://databend.rs/doc/load-data/stage)
130-
130+
* [Querying Data in Staged Files](https://databend.rs/doc/load-data/querying-stage)
131+
* [Transforming Data During a Load](https://databend.rs/doc/load-data/data-load-transform)
131132

132133
### Unloading Data from Databend
133134

docs/doc/01-guides/index.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@ These tutorials are intended to help you get started with Databend:
3030
* [How to Load Data from Amazon S3](../12-load-data/01-s3.md)
3131
* [How to Load Data from Local File System](../12-load-data/02-local.md)
3232
* [How to Load Data from Remote Files](../12-load-data/04-http.md)
33-
33+
* [Querying Data in Staged Files](../12-load-data/05-querying-stage.md)
34+
* [Transforming Data During a Load](../12-load-data/06-data-load-transform.md)
3435

3536
## Unloading Data from Databend
3637

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
---
2+
title: Querying Data in Staged Files
3+
description: Learn how to use standard SQL to query data files in internal or external storage stages with Databend
4+
---
5+
6+
Databend supports using standard SQL to query data files located in an internal stage or named external stage (Amazon S3, Google Cloud Storage, or Microsoft Azure). This can be useful for inspecting or viewing the contents of the staged files, particularly before loading or after unloading data.
7+
8+
The schema is detected automatically, in the same way as with the [infer_schema](../15-sql-functions/112-table-functions/infer_schema.md) table function.
9+
10+
## Query Syntax and Parameters
11+
12+
```sql
13+
SELECT <columns> FROM
14+
{@<stage_name>[/<path>] | '<uri>'} [(
15+
[ PATTERN => '<regex_pattern>']
16+
[ FILE_FORMAT => '<format_name>']
17+
[ FILES => ( 'file_name' [ , 'file_name' ... ] ) ]
18+
[ ENDPOINT_URL => <'url'> ]
19+
[ AWS_KEY_ID => <'aws_key_id'> ]
20+
[ AWS_SECRET_KEY => <'aws_secret_key'> ]
21+
[ ACCESS_KEY_ID => <'access_key_id'> ]
22+
[ ACCESS_KEY_SECRET => <'access_key_secret'> ]
23+
[ SECRET_ACCESS_KEY => <'secret_access_key'> ]
24+
[ SESSION_TOKEN => <'session_token'> ]
25+
[ REGION => <'region'> ]
26+
[ ENABLE_VIRTUAL_HOST_STYLE => true|false ]
27+
)]
28+
```
29+
30+
The function parameters are as follows:
31+
32+
### FILE_FORMAT = '<format_name>'
33+
34+
`<format_name>` should be one of the following:
35+
36+
1. A built-in file format (see [Input & Output File Formats](../13-sql-reference/50-file-format-options.md)).
37+
2. A named file format created by [CREATE FILE FORMAT](../14-sql-commands/00-ddl/100-file-format/01-ddl-create-file-format.md).
38+
39+
If FILE_FORMAT is not specified for a named stage, the file format of the stage is used.
40+
41+
:::caution
42+
43+
Only the Parquet file format is currently supported.
44+
45+
:::
46+
47+
### PATTERN = '<regex_pattern>'
48+
49+
A [PCRE2](https://www.pcre.org/current/doc/html/)-based regular expression pattern string, enclosed in single quotes, specifying the file names to match. See [Example 2](#example-2-querying-data-using-pattern-matching) for a usage example. For PCRE2 syntax, see http://www.pcre.org/current/doc/html/pcre2syntax.html.
50+
51+
52+
### FILES = ( 'file1' [ , 'file2' ... ] )
53+
54+
Specifies a list of one or more file names (separated by commas) to be read.
55+
56+
### Connection Options for `<uri>` only
57+
58+
These include:
59+
60+
- ENDPOINT_URL
61+
- AWS_KEY_ID
62+
- AWS_SECRET_KEY
63+
- ACCESS_KEY_ID
64+
- ACCESS_KEY_SECRET
65+
- SECRET_ACCESS_KEY
66+
- SESSION_TOKEN
67+
- REGION
68+
- ENABLE_VIRTUAL_HOST_STYLE
69+
70+
They are explained in [Create Stage](../14-sql-commands/00-ddl/40-stage/01-ddl-create-stage.md).
71+
72+
## Query Examples
73+
74+
### Example 1: Querying Columns in a Parquet File
75+
76+
Let's assume you have a Parquet file called "example.parquet" with the following data:
77+
78+
```text
79+
80+
| name | age | city |
81+
|-------|-----|--------|
82+
| Alice | 28 | London |
83+
| Bob | 35 | Berlin |
84+
| Carol | 42 | Paris |
85+
```
86+
87+
You can query the "name" and "age" columns from this file using the following query:
88+
```sql
89+
SELECT name, age FROM @internal_stage/example.parquet;
90+
```
91+
92+
### Example 2: Querying Data using Pattern Matching
93+
94+
Suppose you have a directory with several Parquet files, and you only want to query the files that end with ".parquet". You can do this using the PATTERN parameter.
95+
96+
Assuming the following file structure:
97+
98+
```text
99+
data/
100+
├── 2022-01-01.parquet
101+
├── 2022-01-02.parquet
102+
├── 2022-01-03.parquet
103+
├── 2022-01-04.parquet
104+
└── 2022-01-05.parquet
105+
```
106+
107+
You can query the "name" and "age" columns from all files in the "data" folder using the following query:
108+
109+
```sql
110+
SELECT name, age FROM @internal_stage/data/
111+
(file_format => 'parquet', pattern => '.*parquet');
112+
```
113+
114+
### Example 3: Querying Data in an External Stage
115+
116+
Let's assume you have data in an Amazon S3 bucket and you want to query it using Databend. You can use the following query:
117+
```sql
118+
SELECT * FROM 's3://bucket/'
119+
(
120+
access_key_id => 'your-access-key-id',
121+
secret_access_key => 'your-secret-access-key',
122+
endpoint_url => 'your-object-storage-endpoint',
123+
file_format => 'parquet',
124+
pattern => '.*parquet'
125+
);
126+
```
127+
128+
### Example 4: Querying Data from a URI
129+
130+
You can also use a URI to query data files from a remote location, like this:
131+
132+
```sql
133+
SELECT count(*), author FROM 'https://datafuse-1253727613.cos.ap-hongkong.myqcloud.com/data/books.parquet'
134+
(file_format => 'parquet')
135+
GROUP BY author;
136+
```
137+
138+
## Conclusion
139+
140+
We hope this document has provided you with a better understanding of how to use standard SQL to query data files in an internal or external storage stage with Databend. By using the query function, you can easily inspect or view the contents of staged files, making it easier to load and unload data. The examples we provided should help you get started with using this powerful feature.
File renamed without changes.

src/query/ast/src/ast/format/ast_format.rs

Lines changed: 39 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2074,19 +2074,47 @@ impl<'ast> Visitor<'ast> for AstFormatVisitor {
20742074
FormatTreeNode::with_children(selection_format_ctx, vec![selection_child]);
20752075
children.push(selection_node);
20762076
}
2077-
if !stmt.group_by.is_empty() {
2078-
let mut group_by_list_children = Vec::with_capacity(stmt.group_by.len());
2079-
for group_by in stmt.group_by.iter() {
2080-
self.visit_expr(group_by);
2081-
group_by_list_children.push(self.children.pop().unwrap());
2077+
match &stmt.group_by {
2078+
Some(GroupBy::Normal(exprs)) => {
2079+
let mut group_by_list_children = Vec::with_capacity(exprs.len());
2080+
for group_by in exprs.iter() {
2081+
self.visit_expr(group_by);
2082+
group_by_list_children.push(self.children.pop().unwrap());
2083+
}
2084+
let group_by_list_name = "GroupByList".to_string();
2085+
let group_by_list_format_ctx = AstFormatContext::with_children(
2086+
group_by_list_name,
2087+
group_by_list_children.len(),
2088+
);
2089+
let group_by_list_node =
2090+
FormatTreeNode::with_children(group_by_list_format_ctx, group_by_list_children);
2091+
children.push(group_by_list_node);
20822092
}
2083-
let group_by_list_name = "GroupByList".to_string();
2084-
let group_by_list_format_ctx =
2085-
AstFormatContext::with_children(group_by_list_name, group_by_list_children.len());
2086-
let group_by_list_node =
2087-
FormatTreeNode::with_children(group_by_list_format_ctx, group_by_list_children);
2088-
children.push(group_by_list_node);
2093+
Some(GroupBy::GroupingSets(sets)) => {
2094+
let mut grouping_sets = Vec::with_capacity(sets.len());
2095+
for set in sets.iter() {
2096+
let mut grouping_set = Vec::with_capacity(set.len());
2097+
for expr in set.iter() {
2098+
self.visit_expr(expr);
2099+
grouping_set.push(self.children.pop().unwrap());
2100+
}
2101+
let name = "GroupingSet".to_string();
2102+
let grouping_set_format_ctx =
2103+
AstFormatContext::with_children(name, grouping_set.len());
2104+
let grouping_set_node =
2105+
FormatTreeNode::with_children(grouping_set_format_ctx, grouping_set);
2106+
grouping_sets.push(grouping_set_node);
2107+
}
2108+
let group_by_list_name = "GroupByList".to_string();
2109+
let group_by_list_format_ctx =
2110+
AstFormatContext::with_children(group_by_list_name, grouping_sets.len());
2111+
let group_by_list_node =
2112+
FormatTreeNode::with_children(group_by_list_format_ctx, grouping_sets);
2113+
children.push(group_by_list_node);
2114+
}
2115+
_ => {}
20892116
}
2117+
20902118
if let Some(having) = &stmt.having {
20912119
self.visit_expr(having);
20922120
let having_child = self.children.pop().unwrap();

src/query/ast/src/ast/format/syntax/query.rs

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ use crate::ast::format::syntax::interweave_comma;
2121
use crate::ast::format::syntax::parenthenized;
2222
use crate::ast::format::syntax::NEST_FACTOR;
2323
use crate::ast::Expr;
24+
use crate::ast::GroupBy;
2425
use crate::ast::JoinCondition;
2526
use crate::ast::JoinOperator;
2627
use crate::ast::OrderByExpr;
@@ -194,12 +195,19 @@ fn pretty_selection(selection: Option<Expr>) -> RcDoc<'static> {
194195
}
195196
}
196197

197-
fn pretty_group_by(group_by: Vec<Expr>) -> RcDoc<'static> {
198-
if !group_by.is_empty() {
199-
RcDoc::line()
198+
fn pretty_group_set(set: Vec<Expr>) -> RcDoc<'static> {
199+
RcDoc::nil()
200+
.append(RcDoc::text("("))
201+
.append(inline_comma(set.into_iter().map(pretty_expr)))
202+
.append(RcDoc::text(")"))
203+
}
204+
205+
fn pretty_group_by(group_by: Option<GroupBy>) -> RcDoc<'static> {
206+
match group_by {
207+
Some(GroupBy::Normal(exprs)) => RcDoc::line()
200208
.append(
201209
RcDoc::text("GROUP BY").append(
202-
if group_by.len() > 1 {
210+
if exprs.len() > 1 {
203211
RcDoc::line()
204212
} else {
205213
RcDoc::space()
@@ -208,12 +216,20 @@ fn pretty_group_by(group_by: Vec<Expr>) -> RcDoc<'static> {
208216
),
209217
)
210218
.append(
211-
interweave_comma(group_by.into_iter().map(pretty_expr))
219+
interweave_comma(exprs.into_iter().map(pretty_expr))
220+
.nest(NEST_FACTOR)
221+
.group(),
222+
),
223+
Some(GroupBy::GroupingSets(sets)) => RcDoc::line()
224+
.append(RcDoc::text("GROUP BY GROUPING SETS (").append(RcDoc::line().nest(NEST_FACTOR)))
225+
.append(
226+
interweave_comma(sets.into_iter().map(pretty_group_set))
212227
.nest(NEST_FACTOR)
213228
.group(),
214229
)
215-
} else {
216-
RcDoc::nil()
230+
.append(RcDoc::line())
231+
.append(RcDoc::text(")")),
232+
_ => RcDoc::nil(),
217233
}
218234
}
219235

src/query/ast/src/ast/query.rs

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -82,11 +82,22 @@ pub struct SelectStmt {
8282
// `WHERE` clause
8383
pub selection: Option<Expr>,
8484
// `GROUP BY` clause
85-
pub group_by: Vec<Expr>,
85+
pub group_by: Option<GroupBy>,
8686
// `HAVING` clause
8787
pub having: Option<Expr>,
8888
}
8989

90+
/// Group by Clause.
91+
#[derive(Debug, Clone, PartialEq)]
92+
pub enum GroupBy {
93+
/// GROUP BY expr [, expr]*
94+
Normal(Vec<Expr>),
95+
/// GROUP BY GROUPING SETS ( GroupSet [, GroupSet]* )
96+
///
97+
/// GroupSet := (expr [, expr]*) | expr
98+
GroupingSets(Vec<Vec<Expr>>),
99+
}
100+
90101
/// A relational set expression, like `SELECT ... FROM ... {UNION|EXCEPT|INTERSECT} SELECT ... FROM ...`
91102
#[derive(Debug, Clone, PartialEq)]
92103
pub enum SetExpr {
@@ -442,9 +453,25 @@ impl Display for SelectStmt {
442453
}
443454

444455
// GROUP BY clause
445-
if !self.group_by.is_empty() {
456+
if self.group_by.is_some() {
446457
write!(f, " GROUP BY ")?;
447-
write_comma_separated_list(f, &self.group_by)?;
458+
match self.group_by.as_ref().unwrap() {
459+
GroupBy::Normal(exprs) => {
460+
write_comma_separated_list(f, exprs)?;
461+
}
462+
GroupBy::GroupingSets(sets) => {
463+
write!(f, "GROUPING SETS (")?;
464+
for (i, set) in sets.iter().enumerate() {
465+
if i > 0 {
466+
write!(f, ", ")?;
467+
}
468+
write!(f, "(")?;
469+
write_comma_separated_list(f, set)?;
470+
write!(f, ")")?;
471+
}
472+
write!(f, ")")?;
473+
}
474+
}
448475
}
449476

450477
// HAVING clause

src/query/ast/src/parser/query.rs

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -555,7 +555,7 @@ pub enum SetOperationElement {
555555
select_list: Box<Vec<SelectTarget>>,
556556
from: Box<Vec<TableReference>>,
557557
selection: Box<Option<Expr>>,
558-
group_by: Box<Vec<Expr>>,
558+
group_by: Option<GroupBy>,
559559
having: Box<Option<Expr>>,
560560
},
561561
SetOperation {
@@ -565,6 +565,25 @@ pub enum SetOperationElement {
565565
Group(SetExpr),
566566
}
567567
568+
pub fn group_by_items(i: Input) -> IResult<GroupBy> {
569+
let normal = map(rule! { ^#comma_separated_list1(expr) }, |groups| {
570+
GroupBy::Normal(groups)
571+
});
572+
let group_set = alt((
573+
map(rule! {"(" ~ ")"}, |(_, _)| vec![]), // empty grouping set
574+
map(
575+
rule! {"(" ~ #comma_separated_list1(expr) ~ ")"},
576+
|(_, sets, _)| sets,
577+
),
578+
map(rule! { #expr }, |e| vec![e]),
579+
));
580+
let group_sets = map(
581+
rule! { GROUPING ~ SETS ~ "(" ~ ^#comma_separated_list1(group_set) ~ ")" },
582+
|(_, _, _, sets, _)| GroupBy::GroupingSets(sets),
583+
);
584+
rule!(#group_sets | #normal)(i)
585+
}
586+
568587
pub fn set_operation_element(i: Input) -> IResult<WithSpan<SetOperationElement>> {
569588
let set_operator = map(
570589
rule! {
@@ -588,7 +607,7 @@ pub fn set_operation_element(i: Input) -> IResult<WithSpan<SetOperationElement>>
588607
SELECT ~ DISTINCT? ~ ^#comma_separated_list1(select_target)
589608
~ ( FROM ~ ^#comma_separated_list1(table_reference) )?
590609
~ ( WHERE ~ ^#expr )?
591-
~ ( GROUP ~ ^BY ~ ^#comma_separated_list1(expr) )?
610+
~ ( GROUP ~ ^BY ~ ^#group_by_items )?
592611
~ ( HAVING ~ ^#expr )?
593612
},
594613
|(
@@ -609,11 +628,7 @@ pub fn set_operation_element(i: Input) -> IResult<WithSpan<SetOperationElement>>
609628
.unwrap_or_default(),
610629
),
611630
selection: Box::new(opt_where_block.map(|(_, selection)| selection)),
612-
group_by: Box::new(
613-
opt_group_by_block
614-
.map(|(_, _, group_by)| group_by)
615-
.unwrap_or_default(),
616-
),
631+
group_by: opt_group_by_block.map(|(_, _, group_by)| group_by),
617632
having: Box::new(opt_having_block.map(|(_, having)| having)),
618633
}
619634
},
@@ -667,7 +682,7 @@ impl<'a, I: Iterator<Item = WithSpan<'a, SetOperationElement>>> PrattParser<I>
667682
select_list: *select_list,
668683
from: *from,
669684
selection: *selection,
670-
group_by: *group_by,
685+
group_by,
671686
having: *having,
672687
})),
673688
_ => unreachable!(),

0 commit comments

Comments
 (0)