
Commit b6a5dbd

refine: COPY INTO <table> (#1951)

* refine: copy into table
* refine the syntax

1 parent 252814f commit b6a5dbd

File tree: 1 file changed (+182, -133 lines)


docs/en/sql-reference/10-sql-commands/10-dml/dml-copy-into-table.md

Lines changed: 182 additions & 133 deletions
@@ -4,6 +4,8 @@ sidebar_label: "COPY INTO <table>"
---

import FunctionDescription from '@site/src/components/FunctionDescription';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

<FunctionDescription description="Introduced or updated: v1.2.704"/>

@@ -12,188 +14,239 @@ COPY INTO allows you to load data from files located in one of the following loc

- User / Internal / External stages: See [What is Stage?](/guides/load-data/stage/what-is-stage) to learn about stages in Databend.
- Buckets or containers created in a storage service.
- Remote servers from where you can access the files by their URL (starting with "https://...").
- [IPFS](https://ipfs.tech) and Hugging Face repositories.

See also: [`COPY INTO <location>`](dml-copy-into-location.md)

## Syntax

```sql
/* Standard data load */
COPY INTO [<database_name>.]<table_name> [ ( <col_name> [ , <col_name> ... ] ) ]
FROM { userStage | internalStage | externalStage | externalLocation }
[ FILES = ( '<file_name>' [ , '<file_name>' ] [ , ... ] ) ]
[ PATTERN = '<regex_pattern>' ]
[ FILE_FORMAT = (
    FORMAT_NAME = '<your-custom-format>'
    | TYPE = { CSV | TSV | NDJSON | PARQUET | ORC | AVRO } [ formatTypeOptions ]
) ]
[ copyOptions ]

/* Data load with transformation */
COPY INTO [<database_name>.]<table_name> [ ( <col_name> [ , <col_name> ... ] ) ]
FROM ( SELECT [<alias>.]$<file_col_num>[.<element>] [ , [<alias>.]$<file_col_num>[.<element>] ... ]
       FROM { userStage | internalStage | externalStage } )
[ FILES = ( '<file_name>' [ , '<file_name>' ] [ , ... ] ) ]
[ PATTERN = '<regex_pattern>' ]
[ FILE_FORMAT = (
    FORMAT_NAME = '<your-custom-format>'
    | TYPE = { CSV | TSV | NDJSON | PARQUET | ORC | AVRO } [ formatTypeOptions ]
) ]
[ copyOptions ]
```
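As a quick illustration, the two forms above might be used as follows (table, stage, and column names are hypothetical):

```sql
-- Standard load: copy every staged CSV file into mytable
COPY INTO mytable
    FROM @my_stage
    FILE_FORMAT = (TYPE = CSV SKIP_HEADER = 1);

-- Load with transformation: pick positional columns $1 and $3 from each file
COPY INTO mytable (id, name)
    FROM ( SELECT t.$1, t.$3 FROM @my_stage t )
    FILE_FORMAT = (TYPE = CSV);
```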

Where:

```sql
userStage ::= @~[/<path>]

internalStage ::= @<internal_stage_name>[/<path>]

externalStage ::= @<external_stage_name>[/<path>]

externalLocation ::=
  /* Amazon S3-like Storage */
    's3://<bucket>[/<path>]'
    CONNECTION = (
        [ ENDPOINT_URL = '<endpoint-url>' ]
        [ ACCESS_KEY_ID = '<your-access-key-ID>' ]
        [ SECRET_ACCESS_KEY = '<your-secret-access-key>' ]
        [ ENABLE_VIRTUAL_HOST_STYLE = TRUE | FALSE ]
        [ MASTER_KEY = '<your-master-key>' ]
        [ REGION = '<region>' ]
        [ SECURITY_TOKEN = '<security-token>' ]
        [ ROLE_ARN = '<role-arn>' ]
        [ EXTERNAL_ID = '<external-id>' ]
    )

  /* Azure Blob Storage */
  | 'azblob://<container>[/<path>]'
    CONNECTION = (
        ENDPOINT_URL = '<endpoint-url>'
        ACCOUNT_NAME = '<account-name>'
        ACCOUNT_KEY = '<account-key>'
    )

  /* Google Cloud Storage */
  | 'gcs://<bucket>[/<path>]'
    CONNECTION = (
        CREDENTIAL = '<your-base64-encoded-credential>'
    )

  /* Alibaba Cloud OSS */
  | 'oss://<bucket>[/<path>]'
    CONNECTION = (
        ACCESS_KEY_ID = '<your-ak>'
        ACCESS_KEY_SECRET = '<your-sk>'
        ENDPOINT_URL = '<endpoint-url>'
        [ PRESIGN_ENDPOINT_URL = '<presign-endpoint-url>' ]
    )

  /* Tencent Cloud Object Storage */
  | 'cos://<bucket>[/<path>]'
    CONNECTION = (
        SECRET_ID = '<your-secret-id>'
        SECRET_KEY = '<your-secret-key>'
        ENDPOINT_URL = '<endpoint-url>'
    )

  /* Remote Files */
  | 'https://<url>'

  /* IPFS */
  | 'ipfs://<your-ipfs-hash>'
    CONNECTION = ( ENDPOINT_URL = 'https://<your-ipfs-gateway>' )

  /* Hugging Face */
  | 'hf://<repo-id>[/<path>]'
    CONNECTION = (
        [ REPO_TYPE = 'dataset' | 'model' ]
        [ REVISION = '<revision>' ]
        [ TOKEN = '<your-api-token>' ]
    )

formatTypeOptions ::=
  /* Common options for all formats */
  [ COMPRESSION = AUTO | GZIP | BZ2 | BROTLI | ZSTD | DEFLATE | RAW_DEFLATE | XZ | NONE ]

  /* CSV-specific options */
  [ RECORD_DELIMITER = '<character>' ]
  [ FIELD_DELIMITER = '<character>' ]
  [ SKIP_HEADER = <integer> ]
  [ QUOTE = '<character>' ]
  [ ESCAPE = '<character>' ]
  [ NAN_DISPLAY = '<string>' ]
  [ NULL_DISPLAY = '<string>' ]
  [ ERROR_ON_COLUMN_COUNT_MISMATCH = TRUE | FALSE ]
  [ EMPTY_FIELD_AS = null | string | field_default ]
  [ BINARY_FORMAT = HEX | BASE64 ]

  /* TSV-specific options */
  [ RECORD_DELIMITER = '<character>' ]
  [ FIELD_DELIMITER = '<character>' ]

  /* NDJSON-specific options */
  [ NULL_FIELD_AS = NULL | FIELD_DEFAULT ]
  [ MISSING_FIELD_AS = ERROR | NULL | FIELD_DEFAULT ]
  [ ALLOW_DUPLICATE_KEYS = TRUE | FALSE ]

  /* PARQUET-specific options */
  [ MISSING_FIELD_AS = ERROR | FIELD_DEFAULT ]

  /* ORC-specific options */
  [ MISSING_FIELD_AS = ERROR | FIELD_DEFAULT ]

  /* AVRO-specific options */
  [ MISSING_FIELD_AS = ERROR | FIELD_DEFAULT ]

copyOptions ::=
  [ SIZE_LIMIT = <num> ]
  [ PURGE = <bool> ]
  [ FORCE = <bool> ]
  [ DISABLE_VARIANT_CHECK = <bool> ]
  [ ON_ERROR = { continue | abort | abort_N } ]
  [ MAX_FILES = <num> ]
  [ RETURN_FAILED_ONLY = <bool> ]
  [ COLUMN_MATCH_MODE = { case-sensitive | case-insensitive } ]
```
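Putting an externalLocation together with copyOptions, a sketch of a load from an S3-compatible bucket (bucket name and credentials are placeholders) could look like:

```sql
COPY INTO mytable
    FROM 's3://mybucket/data/'
    CONNECTION = (
        ACCESS_KEY_ID = '<your-access-key-ID>'
        SECRET_ACCESS_KEY = '<your-secret-access-key>'
    )
    PATTERN = '.*[.]parquet'
    FILE_FORMAT = (TYPE = PARQUET)
    ON_ERROR = continue;
```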

:::note
For remote files, you can use glob patterns to specify multiple files. For example:

- `ontime_200{6,7,8}.csv` represents `ontime_2006.csv`, `ontime_2007.csv`, and `ontime_2008.csv`
- `ontime_200[6-8].csv` represents the same files
:::

## Key Parameters

- **FILES**: Specifies one or more file names (separated by commas) to be loaded.
- **PATTERN**: A [PCRE2](https://www.pcre.org/current/doc/html/)-based regular expression pattern string that specifies file names to match. See [Example 4: Filtering Files with Pattern](#example-4-filtering-files-with-pattern).
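For example (stage, table, and file names are illustrative), FILES enumerates exact files while PATTERN matches by regular expression:

```sql
-- Load two specific files
COPY INTO mytable FROM @my_stage
    FILES = ('books.csv', 'authors.csv')
    FILE_FORMAT = (TYPE = CSV);

-- Load every CSV file under the sales/ prefix
COPY INTO mytable FROM @my_stage
    PATTERN = 'sales/.*[.]csv'
    FILE_FORMAT = (TYPE = CSV);
```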

## Format Type Options

The `FILE_FORMAT` parameter supports different file types, each with specific formatting options. Below are the available options for each supported file format:

### Common Options for All Formats

| Option | Description | Values | Default |
|--------|-------------|--------|---------|
| COMPRESSION | Compression algorithm for data files | AUTO, GZIP, BZ2, BROTLI, ZSTD, DEFLATE, RAW_DEFLATE, XZ, NONE | AUTO |

### TYPE = CSV

| Option | Description | Default |
|--------|-------------|---------|
| RECORD_DELIMITER | Character(s) separating records | newline |
| FIELD_DELIMITER | Character(s) separating fields | comma (`,`) |
| SKIP_HEADER | Number of header lines to skip | 0 |
| QUOTE | Character used to quote fields | double quote (`"`) |
| ESCAPE | Escape character for enclosed fields | NONE |
| NAN_DISPLAY | String representing NaN values | NaN |
| NULL_DISPLAY | String representing NULL values | `\N` |
| ERROR_ON_COLUMN_COUNT_MISMATCH | Raise an error if the column count doesn't match | TRUE |
| EMPTY_FIELD_AS | How to handle empty fields | null |
| BINARY_FORMAT | Encoding format for binary data | HEX |
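A sketch combining several of the CSV options above (table and stage names are placeholders):

```sql
COPY INTO mytable FROM @my_stage/data/
    FILE_FORMAT = (
        TYPE = CSV
        FIELD_DELIMITER = '|'
        SKIP_HEADER = 1
        NULL_DISPLAY = 'NULL'
    );
```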

### TYPE = TSV

| Option | Description | Default |
|--------|-------------|---------|
| RECORD_DELIMITER | Character(s) separating records | newline |
| FIELD_DELIMITER | Character(s) separating fields | tab (`\t`) |

### TYPE = NDJSON

| Option | Description | Default |
|--------|-------------|---------|
| NULL_FIELD_AS | How to handle null fields | NULL |
| MISSING_FIELD_AS | How to handle missing fields | ERROR |
| ALLOW_DUPLICATE_KEYS | Allow duplicate object keys | FALSE |
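For instance, an NDJSON load that tolerates missing keys by falling back to column defaults (table and stage names are hypothetical) might be written as:

```sql
COPY INTO events FROM @my_stage/logs/
    FILE_FORMAT = (
        TYPE = NDJSON
        MISSING_FIELD_AS = FIELD_DEFAULT
        ALLOW_DUPLICATE_KEYS = TRUE
    );
```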

### TYPE = PARQUET

| Option | Description | Default |
|--------|-------------|---------|
| MISSING_FIELD_AS | How to handle missing fields | ERROR |

### TYPE = ORC

| Option | Description | Default |
|--------|-------------|---------|
| MISSING_FIELD_AS | How to handle missing fields | ERROR |

### TYPE = AVRO

| Option | Description | Default |
|--------|-------------|---------|
| MISSING_FIELD_AS | How to handle missing fields | ERROR |

## Copy Options

| Parameter | Description | Default |
|-----------|-------------|---------|
| SIZE_LIMIT | Maximum rows of data to load | `0` (no limit) |
| PURGE | Purges files after a successful load | `false` |
| FORCE | Allows reloading of duplicate files | `false` (skips duplicates) |
| DISABLE_VARIANT_CHECK | Replaces invalid JSON with null | `false` (fails on invalid JSON) |
| ON_ERROR | How to handle errors: `continue`, `abort`, or `abort_N` (`abort_N` is not available for Parquet files) | `abort` |
| MAX_FILES | Maximum number of files to load (up to 15,000) | - |
| RETURN_FAILED_ONLY | Only returns failed files in the output | `false` |
| COLUMN_MATCH_MODE | For Parquet: column name matching mode (`case-sensitive` or `case-insensitive`) | `case-insensitive` |

:::tip
When importing large volumes of data (like logs), set both `PURGE` and `FORCE` to `true` for efficient data import without Meta server interaction. Note this may lead to duplicate data imports.
:::
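As an illustration of the tip above (object names are hypothetical), forcing a reload and purging loaded files can be combined in one statement:

```sql
COPY INTO mytable FROM @my_stage
    FILE_FORMAT = (TYPE = PARQUET)
    FORCE = true
    PURGE = true;
```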
@@ -213,10 +266,6 @@ COPY INTO provides a summary of the data loading results with these columns:

If `RETURN_FAILED_ONLY` is set to `true`, the output will only contain the files that failed to load.

## Examples

### Example 1: Loading from Stages
