Commit ffa31eb

Merge remote-tracking branch 'origin/main' into bump-deps
2 parents 305d399 + 35f38d5 commit ffa31eb

File tree

81 files changed, +1588 −143 lines changed

Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default.

docs/doc/12-load-data/00-stage.md

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 ---
-title: Tutorial - Load from an Internal Stage
-sidebar_label: Tutorial - Load from an Internal Stage
+title: Load from an Internal Stage
+sidebar_label: Load from an Internal Stage
 description:
   Load data from Databend stages.
 ---

docs/doc/12-load-data/01-s3.md

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 ---
-title: Tutorial - Load from an Amazon S3 Bucket
-sidebar_label: Tutorial - Load from an Amazon S3 Bucket
+title: Load from an Amazon S3 Bucket
+sidebar_label: Load from an Amazon S3 Bucket
 description:
   Load data from Amazon S3.
 ---

docs/doc/12-load-data/02-local.md

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 ---
-title: Tutorial - Load from a Local File
-sidebar_label: Tutorial - Load from a Local File
+title: Load from a Local File
+sidebar_label: Load from a Local File
 description:
   Load data from local file system.
 ---

docs/doc/12-load-data/04-http.md

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 ---
-title: Tutorial - Load from Remote File
-sidebar_label: Tutorial - Load from a Remote File
+title: Load from Remote File
+sidebar_label: Load from a Remote File
 description:
   Load data from remote files.
 ---
Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
---
title: Transforming Data During a Load
sidebar_label: Transforming Data During a Load
description: Learn how to use Databend to transform data while loading it into a table using the COPY INTO <table> command.
---

Databend supports transforming data while loading it into a table using the `COPY INTO <table>` command, which simplifies your ETL pipeline for basic transformations.

This feature helps you avoid the use of temporary tables to store pre-transformed data when reordering columns during a data load.

The `COPY` command supports:

- Column reordering, column omission, and casts using a SELECT statement. There is no requirement for your data files to have the same number and ordering of columns as your target table.

:::note
Transforming data during a load is currently supported only for staged files in Parquet format.
:::

## Load a Subset of Table Data

Load a subset of data into a table. The following example loads data from the `id` and `name` columns of a staged Parquet file:

**Sample Data**
```text
id | name       | age
---|------------|----
1  | John Doe   | 35
2  | Jane Smith | 28
```

**Example**
```sql
-- Create a table
CREATE TABLE my_table(id int, name string);

COPY INTO my_table
FROM (SELECT t.id, t.name FROM @mystage t)
FILE_FORMAT = (type = parquet) PATTERN='.*parquet';
```

## Reorder Columns During a Load

To reorder the columns from a staged Parquet file before loading it into a table, use the `COPY INTO` command with a `SELECT` statement. The following example reorders the columns `name` and `id`:

**Sample Data**
```text
id | name       | age
---|------------|----
1  | John Doe   | 35
2  | Jane Smith | 28
```

**Example**
```sql
CREATE TABLE my_table(name string, id int);

COPY INTO my_table
FROM (SELECT t.name, t.id FROM @mystage t)
FILE_FORMAT = (type = parquet) PATTERN='.*parquet';
```

## Convert Data Types During a Load

To convert staged data into other data types during a load, use the appropriate conversion function in your `SELECT` statement.

The following example converts a timestamp into a date:

**Sample Data**
```text
id | name     | timestamp
---|----------|--------------------
1  | John Doe | 2022-03-15 10:30:00
2  | Jane Doe | 2022-03-14 09:00:00
```

**Example**
```sql
CREATE TABLE my_table(id int, name string, time date);

COPY INTO my_table
FROM (SELECT t.id, t.name, to_date(t.timestamp) FROM @mystage t)
FILE_FORMAT = (type = parquet) PATTERN='.*parquet';
```

## Conclusion

Transforming data during a load is a powerful Databend feature that simplifies your ETL pipeline and avoids the use of temporary tables. It lets you focus on analyzing your data rather than on the mechanics of moving it around.
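
The examples in this new page assume a named stage `@mystage` that already holds the Parquet files. A minimal sketch of that setup is shown below; the stage name and the upload step are assumptions, not part of this commit:

```sql
-- Create the internal stage that the COPY INTO examples read from
-- (the stage name "mystage" is an assumption).
CREATE STAGE mystage;

-- After uploading the Parquet files to the stage, confirm they are visible.
LIST @mystage;
```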

src/common/exception/src/exception_code.rs

Lines changed: 3 additions & 0 deletions
@@ -135,6 +135,8 @@ build_exceptions! {
     ReadTableDataError(1107),
     AddColumnExistError(1108),
     DropColumnEmptyError(1109),
+    // create table or alter table add column with internal column name
+    TableWithInternalColumnName(1110),

     // Data Related Errors

@@ -207,6 +209,7 @@
     /// - and without `IF EXISTS`
     CatalogNotFound(2320),

+
     // Cluster error codes.
     ClusterUnknownNode(2401),
     ClusterNodeAlreadyExists(2402),
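
Per the comment on the new error code, `TableWithInternalColumnName(1110)` guards `CREATE TABLE` and `ALTER TABLE ... ADD COLUMN` against reusing a reserved internal column name. A hedged SQL sketch of statements that would presumably be rejected (the exact error message text is an assumption):

```sql
-- `_row_id` is one of the internal column names reserved by this commit,
-- so both statements below would presumably fail with error 1110.
CREATE TABLE t(_row_id INT, c INT);

ALTER TABLE t ADD COLUMN _row_id INT;
```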

src/query/catalog/src/plan/datasource/datasource_plan.rs

Lines changed: 1 addition & 0 deletions
@@ -35,6 +35,7 @@ pub struct DataSourcePlan {

     pub tbl_args: Option<TableArgs>,
     pub push_downs: Option<PushDownInfo>,
+    pub query_internal_columns: bool,
 }

 impl DataSourcePlan {
Lines changed: 146 additions & 0 deletions
@@ -0,0 +1,146 @@
// Copyright 2023 Datafuse Labs.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use common_expression::types::string::StringColumnBuilder;
use common_expression::types::DataType;
use common_expression::types::NumberDataType;
use common_expression::types::UInt64Type;
use common_expression::BlockEntry;
use common_expression::ColumnId;
use common_expression::FromData;
use common_expression::Scalar;
use common_expression::TableDataType;
use common_expression::Value;

// Segment and block id bit widths used when generating the internal column `_row_id`.
// Since `DEFAULT_BLOCK_PER_SEGMENT` is 1000, 10 bits are enough for `block_id`.
const NUM_BLOCK_ID_BITS: usize = 10;
const NUM_SEGMENT_ID_BITS: usize = 22;

pub const ROW_ID: &str = "_row_id";
pub const SNAPSHOT_NAME: &str = "_snapshot_name";
pub const SEGMENT_NAME: &str = "_segment_name";
pub const BLOCK_NAME: &str = "_block_name";

// Metadata used to generate internal columns.
#[derive(Debug)]
pub struct InternalColumnMeta {
    pub segment_id: usize,
    pub block_id: usize,
    pub block_location: String,
    pub segment_location: String,
    pub snapshot_location: String,
}

#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq)]
pub enum InternalColumnType {
    RowId,
    BlockName,
    SegmentName,
    SnapshotName,
}

#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq)]
pub struct InternalColumn {
    pub column_name: String,
    pub column_type: InternalColumnType,
}

impl InternalColumn {
    pub fn new(name: &str, column_type: InternalColumnType) -> Self {
        InternalColumn {
            column_name: name.to_string(),
            column_type,
        }
    }

    pub fn column_type(&self) -> &InternalColumnType {
        &self.column_type
    }

    pub fn table_data_type(&self) -> TableDataType {
        match &self.column_type {
            InternalColumnType::RowId => TableDataType::Number(NumberDataType::UInt64),
            InternalColumnType::BlockName => TableDataType::String,
            InternalColumnType::SegmentName => TableDataType::String,
            InternalColumnType::SnapshotName => TableDataType::String,
        }
    }

    pub fn data_type(&self) -> DataType {
        let t = &self.table_data_type();
        t.into()
    }

    pub fn column_name(&self) -> &String {
        &self.column_name
    }

    pub fn column_id(&self) -> ColumnId {
        match &self.column_type {
            InternalColumnType::RowId => u32::MAX,
            InternalColumnType::BlockName => u32::MAX - 1,
            InternalColumnType::SegmentName => u32::MAX - 2,
            InternalColumnType::SnapshotName => u32::MAX - 3,
        }
    }

    pub fn generate_column_values(&self, meta: &InternalColumnMeta, num_rows: usize) -> BlockEntry {
        match &self.column_type {
            InternalColumnType::RowId => {
                let block_id = meta.block_id as u64;
                let seg_id = meta.segment_id as u64;
                let high_32bit = (seg_id << NUM_SEGMENT_ID_BITS) + (block_id << NUM_BLOCK_ID_BITS);
                let mut row_ids = Vec::with_capacity(num_rows);
                for i in 0..num_rows {
                    let row_id = high_32bit + i as u64;
                    row_ids.push(row_id);
                }
                BlockEntry {
                    data_type: DataType::Number(NumberDataType::UInt64),
                    value: Value::Column(UInt64Type::from_data(row_ids)),
                }
            }
            InternalColumnType::BlockName => {
                let mut builder = StringColumnBuilder::with_capacity(1, meta.block_location.len());
                builder.put_str(&meta.block_location);
                builder.commit_row();
                BlockEntry {
                    data_type: DataType::String,
                    value: Value::Scalar(Scalar::String(builder.build_scalar())),
                }
            }
            InternalColumnType::SegmentName => {
                let mut builder =
                    StringColumnBuilder::with_capacity(1, meta.segment_location.len());
                builder.put_str(&meta.segment_location);
                builder.commit_row();
                BlockEntry {
                    data_type: DataType::String,
                    value: Value::Scalar(Scalar::String(builder.build_scalar())),
                }
            }
            InternalColumnType::SnapshotName => {
                let mut builder =
                    StringColumnBuilder::with_capacity(1, meta.snapshot_location.len());
                builder.put_str(&meta.snapshot_location);
                builder.commit_row();
                BlockEntry {
                    data_type: DataType::String,
                    value: Value::Scalar(Scalar::String(builder.build_scalar())),
                }
            }
        }
    }
}
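
This new file (presumably src/query/catalog/src/plan/internal_column.rs, given the `mod internal_column;` addition in the mod.rs diff below) defines the internal columns `_row_id`, `_snapshot_name`, `_segment_name`, and `_block_name`, with `_row_id` packed from the segment id, block id, and row offset using the bit widths above. A hedged sketch of how these columns might be selected once the feature is wired up end to end (the table name and the exact SQL surface are assumptions):

```sql
-- Internal columns defined by this commit; whether they are exposed to SQL
-- in exactly this form is an assumption.
SELECT _row_id, _block_name, _segment_name, _snapshot_name
FROM my_table
LIMIT 5;
```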

src/query/catalog/src/plan/mod.rs

Lines changed: 2 additions & 0 deletions
@@ -13,13 +13,15 @@
 // limitations under the License.

 mod datasource;
+mod internal_column;
 mod partition;
 mod partition_statistics;
 mod projection;
 mod pruning_statistics;
 mod pushdown;

 pub use datasource::*;
+pub use internal_column::*;
 pub use partition::*;
 pub use partition_statistics::PartStatistics;
 pub use projection::Projection;
