diff --git a/docs/_snippets/_users-and-roles-common.md b/docs/_snippets/_users-and-roles-common.md index 29726229be9..964f9a8b40d 100644 --- a/docs/_snippets/_users-and-roles-common.md +++ b/docs/_snippets/_users-and-roles-common.md @@ -42,64 +42,73 @@ Create these tables and users to be used in the examples. #### Creating a sample database, table, and rows {#creating-a-sample-database-table-and-rows} -1. Create a test database + - ```sql - CREATE DATABASE db1; - ``` +##### Create a test database {#create-a-test-database} -2. Create a table +```sql +CREATE DATABASE db1; +``` - ```sql - CREATE TABLE db1.table1 ( - id UInt64, - column1 String, - column2 String - ) - ENGINE MergeTree - ORDER BY id; - ``` +##### Create a table {#create-a-table} -3. Populate the table with sample rows +```sql +CREATE TABLE db1.table1 ( + id UInt64, + column1 String, + column2 String +) +ENGINE MergeTree +ORDER BY id; +``` - ```sql - INSERT INTO db1.table1 - (id, column1, column2) - VALUES - (1, 'A', 'abc'), - (2, 'A', 'def'), - (3, 'B', 'abc'), - (4, 'B', 'def'); - ``` +##### Populate the table with sample rows {#populate} -4. Verify the table: +```sql +INSERT INTO db1.table1 + (id, column1, column2) +VALUES + (1, 'A', 'abc'), + (2, 'A', 'def'), + (3, 'B', 'abc'), + (4, 'B', 'def'); +``` - ```sql - SELECT * - FROM db1.table1 - ``` +##### Verify the table {#verify} - ```response - Query id: 475015cc-6f51-4b20-bda2-3c9c41404e49 +```sql title="Query" +SELECT * +FROM db1.table1 +``` - ┌─id─┬─column1─┬─column2─┐ - │ 1 │ A │ abc │ - │ 2 │ A │ def │ - │ 3 │ B │ abc │ - │ 4 │ B │ def │ - └────┴─────────┴─────────┘ - ``` +```response title="Response" +Query id: 475015cc-6f51-4b20-bda2-3c9c41404e49 -5. Create a regular user that will be used to demonstrate restrict access to certain columns: +┌─id─┬─column1─┬─column2─┐ +│ 1 │ A │ abc │ +│ 2 │ A │ def │ +│ 3 │ B │ abc │ +│ 4 │ B │ def │ +└────┴─────────┴─────────┘ +``` - ```sql - CREATE USER column_user IDENTIFIED BY 'password'; - ``` +##### Create `column_user` {#create-a-user-with-restricted-access-to-columns} -6. Create a regular user that will be used to demonstrate restricting access to rows with certain values: - ```sql - CREATE USER row_user IDENTIFIED BY 'password'; - ``` +Create a regular user that will be used to demonstrate restrict access to certain columns: + +```sql +CREATE USER column_user IDENTIFIED BY 'password'; +``` + +##### Create `row_user` {#create-a-user-with-restricted-access-to-rows-with-certain-values} + +Create a regular user that will be used to demonstrate restricting access to rows with certain values: + +```sql +CREATE USER row_user IDENTIFIED BY 'password'; +``` + + #### Creating roles {#creating-roles} diff --git a/docs/best-practices/_snippets/_table_of_contents.md b/docs/best-practices/_snippets/_table_of_contents.md new file mode 100644 index 00000000000..9e0d34ef2d1 --- /dev/null +++ b/docs/best-practices/_snippets/_table_of_contents.md @@ -0,0 +1,12 @@ +| Page | Description | +|--------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------| +| [Choosing a Primary Key](/best-practices/choosing-a-primary-key) | How to select primary keys that maximize query performance and minimize storage overhead. | +| [Select Data Types](/best-practices/select-data-types) | Choose optimal data types to reduce memory usage, improve compression, and accelerate queries. 
| +| [Use Materialized Views](/best-practices/use-materialized-views) | Leverage materialized views to pre-aggregate data and dramatically speed up analytical queries. | +| [Minimize and Optimize JOINs](/best-practices/minimize-optimize-joins) | Best practices for using ClickHouse's `JOIN` capabilities efficiently. | +| [Choosing a Partitioning Key](/best-practices/choosing-a-partitioning-key) | Select partitioning strategies that enable efficient data pruning and faster query execution. | +| [Selecting an Insert Strategy](/best-practices/selecting-an-insert-strategy) | Optimize data ingestion throughput and reduce resource consumption with proper insert patterns. | +| [Data Skipping Indices](/best-practices/use-data-skipping-indices-where-appropriate) | Apply secondary indices strategically to skip irrelevant data blocks and accelerate filtered queries. | +| [Avoid Mutations](/best-practices/avoid-mutations) | Design schemas and workflows that eliminate costly `UPDATE`/`DELETE` operations for better performance. | +| [Avoid OPTIMIZE FINAL](/best-practices/avoid-optimize-final) | Prevent performance bottlenecks by understanding when `OPTIMIZE FINAL` hurts more than it helps. | +| [Use JSON where appropriate](/best-practices/use-json-where-appropriate) | Balance flexibility and performance when working with semi-structured JSON data in ClickHouse. | \ No newline at end of file diff --git a/docs/best-practices/index.md b/docs/best-practices/index.md index 5a3ae78ab5f..b4721106510 100644 --- a/docs/best-practices/index.md +++ b/docs/best-practices/index.md @@ -6,19 +6,10 @@ hide_title: true description: 'Landing page for Best Practices section in ClickHouse' --- +import TableOfContents from '@site/docs/best-practices/_snippets/_table_of_contents.md'; + # Best Practices in ClickHouse {#best-practices-in-clickhouse} This section provides the best practices you will want to follow to get the most out of ClickHouse. -| Page | Description | -|----------------------------------------------------------------------|--------------------------------------------------------------------------| -| [Choosing a Primary Key](/best-practices/choosing-a-primary-key) | Guidance on selecting an effective Primary Key in ClickHouse. | -| [Select Data Types](/best-practices/select-data-types) | Recommendations for choosing appropriate data types. | -| [Use Materialized Views](/best-practices/use-materialized-views) | When and how to benefit from materialized views. | -| [Minimize and Optimize JOINs](/best-practices/minimize-optimize-joins)| Best practices for minimizing and optimizing JOIN operations. | -| [Choosing a Partitioning Key](/best-practices/choosing-a-partitioning-key) | How to choose and apply partitioning keys effectively. | -| [Selecting an Insert Strategy](/best-practices/selecting-an-insert-strategy) | Strategies for efficient data insertion in ClickHouse. | -| [Data Skipping Indices](/best-practices/use-data-skipping-indices-where-appropriate) | When to apply data skipping indices for performance gains. | -| [Avoid Mutations](/best-practices/avoid-mutations) | Reasons to avoid mutations and how to design without them. | -| [Avoid OPTIMIZE FINAL](/best-practices/avoid-optimize-final) | Why `OPTIMIZE FINAL` can be costly and how to work around it. | -| [Use JSON where appropriate](/best-practices/use-json-where-appropriate) | Considerations for using JSON columns in ClickHouse. 
| + \ No newline at end of file diff --git a/docs/cloud-index.md b/docs/cloud-index.md deleted file mode 100644 index 911b6d139ff..00000000000 --- a/docs/cloud-index.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -slug: /cloud/overview -keywords: ['AWS', 'Cloud', 'serverless'] -title: 'Overview' -hide_title: true -description: 'Overview page for Cloud' ---- - -import Content from '@site/docs/about-us/cloud.md'; - - diff --git a/docs/cloud/_snippets/_security_table_of_contents.md b/docs/cloud/_snippets/_security_table_of_contents.md new file mode 100644 index 00000000000..45aa2a68290 --- /dev/null +++ b/docs/cloud/_snippets/_security_table_of_contents.md @@ -0,0 +1,8 @@ +| Page | Description | +|---------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------| +| [Shared Responsibility Model](shared-responsibility-model.md) | Understand how security responsibilities are divided between ClickHouse Cloud and your organization for different service types. | +| [Cloud Access Management](cloud-access-management/index.md) | Manage user access with authentication, single sign-on (SSO), role-based permissions, and team invitations. | +| [Connectivity](connectivity-overview.md) | Configure secure network access including IP allow-lists, private networking, S3 data access, and Cloud IP address management. | +| [Enhanced Encryption](cmek.md) | Learn about default AES 256 encryption and how to enable Transparent Data Encryption (TDE) for additional data protection at rest. | +| [Audit Logging](audit-logging.md) | Set up and use audit logging to track and monitor activities in your ClickHouse Cloud environment. | +| [Privacy and Compliance](privacy-compliance-overview.md) | Review security certifications, compliance standards, and learn how to manage your personal information and data rights. | \ No newline at end of file diff --git a/docs/cloud/manage/api/api-overview.md b/docs/cloud/api/api-overview.md similarity index 98% rename from docs/cloud/manage/api/api-overview.md rename to docs/cloud/api/api-overview.md index ab0484d0c5c..5e81632b5e8 100644 --- a/docs/cloud/manage/api/api-overview.md +++ b/docs/cloud/api/api-overview.md @@ -56,7 +56,8 @@ If your organization has been migrated to one of the [new pricing plans](https:/ You will now also be able to specify the `num_replicas` field as a property of the service resource. ::: -## Terraform and OpenAPI New Pricing: Replica Settings Explained +## Terraform and OpenAPI New Pricing: Replica Settings Explained {#terraform-and-openapi-new-pricing---replica-settings-explained} + The number of replicas each service will be created with defaults to 3 for the Scale and Enterprise tiers, while it defaults to 1 for the Basic tier. For the Scale and the Enterprise tiers it is possible to adjust it by passing a `numReplicas` field in the service creation request. The value of the `numReplicas` field must be between 2 and 20 for the first service in a warehouse. Services that are created in an existing warehouse can have a number of replicas as low as 1. 
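+For illustration, a service-creation request body that sets the replica count could look like the sketch below. Only the `numReplicas` field comes from this page; the remaining field names and values are placeholders and should be checked against the current OpenAPI specification.
+
+```json
+{
+  "name": "example-service",
+  "provider": "aws",
+  "region": "us-east-1",
+  "tier": "scale",
+  "numReplicas": 3
+}
+```
+
+In Terraform, the equivalent property on the service resource is `num_replicas`, as noted above.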
diff --git a/docs/cloud/manage/api/index.md b/docs/cloud/api/index.md similarity index 100% rename from docs/cloud/manage/api/index.md rename to docs/cloud/api/index.md diff --git a/docs/cloud/manage/openapi.md b/docs/cloud/api/openapi.md similarity index 100% rename from docs/cloud/manage/openapi.md rename to docs/cloud/api/openapi.md diff --git a/docs/cloud/manage/postman.md b/docs/cloud/api/postman.md similarity index 100% rename from docs/cloud/manage/postman.md rename to docs/cloud/api/postman.md diff --git a/docs/cloud/bestpractices/index.md b/docs/cloud/bestpractices/index.md deleted file mode 100644 index 550f2901bc4..00000000000 --- a/docs/cloud/bestpractices/index.md +++ /dev/null @@ -1,31 +0,0 @@ ---- -slug: /cloud/bestpractices -keywords: ['Cloud', 'Best Practices', 'Bulk Inserts', 'Asynchronous Inserts', 'Avoid mutations', 'Avoid nullable columns', 'Avoid Optimize Final', 'Low Cardinality Partitioning Key', 'Multi Tenancy', 'Usage Limits'] -title: 'Overview' -hide_title: true -description: 'Landing page for Best Practices section in ClickHouse Cloud' ---- - -# Best Practices in ClickHouse Cloud {#best-practices-in-clickhouse-cloud} - -This section provides best practices you will want to follow to get the most out of ClickHouse Cloud. - -| Page | Description | -|----------------------------------------------------------|----------------------------------------------------------------------------| -| [Usage Limits](/cloud/bestpractices/usage-limits)| Explore the limits of ClickHouse. | -| [Multi tenancy](/cloud/bestpractices/multi-tenancy)| Learn about different strategies to implement multi-tenancy. | - -These are in addition to the standard best practices which apply to all deployments of ClickHouse. - -| Page | Description | -|----------------------------------------------------------------------|--------------------------------------------------------------------------| -| [Choosing a Primary Key](/best-practices/choosing-a-primary-key) | Guidance on selecting an effective Primary Key in ClickHouse. | -| [Select Data Types](/best-practices/select-data-types) | Recommendations for choosing appropriate data types. | -| [Use Materialized Views](/best-practices/use-materialized-views) | When and how to benefit from materialized views. | -| [Minimize and Optimize JOINs](/best-practices/minimize-optimize-joins)| Best practices for minimizing and optimizing JOIN operations. | -| [Choosing a Partitioning Key](/best-practices/choosing-a-partitioning-key) | How to choose and apply partitioning keys effectively. | -| [Selecting an Insert Strategy](/best-practices/selecting-an-insert-strategy) | Strategies for efficient data insertion in ClickHouse. | -| [Data Skipping Indices](/best-practices/use-data-skipping-indices-where-appropriate) | When to apply data skipping indices for performance gains. | -| [Avoid Mutations](/best-practices/avoid-mutations) | Reasons to avoid mutations and how to design without them. | -| [Avoid `OPTIMIZE FINAL`](/best-practices/avoid-optimize-final) | Why `OPTIMIZE FINAL` can be costly and how to work around it. | -| [Use JSON where appropriate](/best-practices/use-json-where-appropriate) | Considerations for using JSON columns in ClickHouse. 
| diff --git a/docs/cloud/bestpractices/usagelimits.md b/docs/cloud/bestpractices/usagelimits.md deleted file mode 100644 index 37ab67b542c..00000000000 --- a/docs/cloud/bestpractices/usagelimits.md +++ /dev/null @@ -1,31 +0,0 @@ ---- -slug: /cloud/bestpractices/usage-limits -sidebar_label: 'Usage Limits' -title: 'Usage limits' -description: 'Describes the recommended usage limits in ClickHouse Cloud' ---- - -While ClickHouse is known for its speed and reliability, optimal performance is achieved within certain operating parameters. For example, having too many tables, databases or parts could negatively impact performance. To avoid this, Clickhouse Cloud has guardrails set up for several types of items. You can find details of these guardrails below. - -:::tip -If you've run up against one of these guardrails, it's possible that you are implementing your use case in an unoptimized way. Contact our support team and we will gladly help you refine your use case to avoid exceeding the guardrails or look together at how we can increase them in a controlled manner. -::: - -| Dimension | Limit | -|-----------|-------| -|**Databases**| 1000| -|**Tables**| 5000| -|**Columns**| ∼1000 (wide format is preferred to compact)| -|**Partitions**| 50k| -|**Parts**| 100k across the entire instance| -|**Part size**| 150gb| -|**Services per organization**| 20 (soft)| -|**Services per warehouse**| 5 (soft)| -|**Low cardinality**| 10k or less| -|**Primary keys in a table**| 4-5 that sufficiently filter down the data| -|**Query concurrency**| 1000| -|**Batch ingest**| anything > 1M will be split by the system in 1M row blocks| - -:::note -For Single Replica Services, the maximum number of databases is restricted to 100, and the maximum number of tables is restricted to 500. In addition, storage for Basic Tier Services is limited to 1 TB. 
-::: diff --git a/docs/cloud/manage/scaling.md b/docs/cloud/features/01_automatic_scaling.md similarity index 100% rename from docs/cloud/manage/scaling.md rename to docs/cloud/features/01_automatic_scaling.md diff --git a/docs/cloud/get-started/sql-console.md b/docs/cloud/features/01_cloud_console_features/01_sql-console.md similarity index 100% rename from docs/cloud/get-started/sql-console.md rename to docs/cloud/features/01_cloud_console_features/01_sql-console.md diff --git a/docs/cloud/get-started/query-insights.md b/docs/cloud/features/01_cloud_console_features/02_query-insights.md similarity index 100% rename from docs/cloud/get-started/query-insights.md rename to docs/cloud/features/01_cloud_console_features/02_query-insights.md diff --git a/docs/cloud/get-started/query-endpoints.md b/docs/cloud/features/01_cloud_console_features/03_query-endpoints.md similarity index 100% rename from docs/cloud/get-started/query-endpoints.md rename to docs/cloud/features/01_cloud_console_features/03_query-endpoints.md diff --git a/docs/cloud/manage/dashboards.md b/docs/cloud/features/01_cloud_console_features/04_dashboards.md similarity index 100% rename from docs/cloud/manage/dashboards.md rename to docs/cloud/features/01_cloud_console_features/04_dashboards.md diff --git a/docs/cloud/features/01_cloud_console_features/_category_.json b/docs/cloud/features/01_cloud_console_features/_category_.json new file mode 100644 index 00000000000..85ba09bce82 --- /dev/null +++ b/docs/cloud/features/01_cloud_console_features/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Cloud console", + "collapsible": true, + "collapsed": true, +} \ No newline at end of file diff --git a/docs/cloud/features/02_infrastructure_and_deploy/_category_.json b/docs/cloud/features/02_infrastructure_and_deploy/_category_.json new file mode 100644 index 00000000000..3e6367dd545 --- /dev/null +++ b/docs/cloud/features/02_infrastructure_and_deploy/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Infrastructure and deploy", + "collapsible": true, + "collapsed": true, +} \ No newline at end of file diff --git a/docs/cloud/reference/byoc.md b/docs/cloud/features/02_infrastructure_and_deploy/byoc.md similarity index 100% rename from docs/cloud/reference/byoc.md rename to docs/cloud/features/02_infrastructure_and_deploy/byoc.md diff --git a/docs/cloud/reference/shared-catalog.md b/docs/cloud/features/02_infrastructure_and_deploy/shared-catalog.md similarity index 100% rename from docs/cloud/reference/shared-catalog.md rename to docs/cloud/features/02_infrastructure_and_deploy/shared-catalog.md diff --git a/docs/cloud/reference/shared-merge-tree.md b/docs/cloud/features/02_infrastructure_and_deploy/shared-merge-tree.md similarity index 100% rename from docs/cloud/reference/shared-merge-tree.md rename to docs/cloud/features/02_infrastructure_and_deploy/shared-merge-tree.md diff --git a/docs/cloud/reference/warehouses.md b/docs/cloud/features/02_infrastructure_and_deploy/warehouses.md similarity index 100% rename from docs/cloud/reference/warehouses.md rename to docs/cloud/features/02_infrastructure_and_deploy/warehouses.md diff --git a/docs/cloud/features/03_monitoring/_category_.json b/docs/cloud/features/03_monitoring/_category_.json new file mode 100644 index 00000000000..ef0bd973e2c --- /dev/null +++ b/docs/cloud/features/03_monitoring/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Monitoring", + "collapsible": true, + "collapsed": true, +} \ No newline at end of file diff --git a/docs/cloud/manage/monitoring/advanced_dashboard.md 
b/docs/cloud/features/03_monitoring/advanced_dashboard.md similarity index 99% rename from docs/cloud/manage/monitoring/advanced_dashboard.md rename to docs/cloud/features/03_monitoring/advanced_dashboard.md index ab320eb6ebe..578a412846e 100644 --- a/docs/cloud/manage/monitoring/advanced_dashboard.md +++ b/docs/cloud/features/03_monitoring/advanced_dashboard.md @@ -110,7 +110,7 @@ interface can help detect issues. | Network receive bytes/sec | Tracks the current speed of outbound network traffic | | Concurrent network connections | Tracks the number of current concurrent network connections | -## Identifying issues with the Advanced dashboard {#identifying-issues-with-the-advanced-dashboard} +## Identifying issues using the advanced dashboard {#identifying-issues-with-the-advanced-dashboard} Having this real-time view of the health of your ClickHouse service greatly helps mitigate issues before they impact your business or help solve them. Below are a diff --git a/docs/cloud/manage/monitoring/prometheus.md b/docs/cloud/features/03_monitoring/prometheus.md similarity index 100% rename from docs/cloud/manage/monitoring/prometheus.md rename to docs/cloud/features/03_monitoring/prometheus.md diff --git a/docs/cloud/security/shared-responsibility-model.md b/docs/cloud/features/04_security/01_shared-responsibility-model.md similarity index 100% rename from docs/cloud/security/shared-responsibility-model.md rename to docs/cloud/features/04_security/01_shared-responsibility-model.md diff --git a/docs/cloud/features/04_security/_category_.json b/docs/cloud/features/04_security/_category_.json new file mode 100644 index 00000000000..aed26fa7f7a --- /dev/null +++ b/docs/cloud/features/04_security/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Security", + "collapsible": true, + "collapsed": true, +} \ No newline at end of file diff --git a/docs/cloud/security/cloud-access-management/cloud-access-management.md b/docs/cloud/features/04_security/cloud-access-management/cloud-access-management.md similarity index 76% rename from docs/cloud/security/cloud-access-management/cloud-access-management.md rename to docs/cloud/features/04_security/cloud-access-management/cloud-access-management.md index b0794fccf84..cfab1faad61 100644 --- a/docs/cloud/security/cloud-access-management/cloud-access-management.md +++ b/docs/cloud/features/04_security/cloud-access-management/cloud-access-management.md @@ -32,23 +32,31 @@ Users must be assigned an organization level role and may optionally be assigned | SQL console | Custom | Configure using SQL [`GRANT`](/sql-reference/statements/grant) statement; assign the role to a SQL console user by naming the role after the user | To create a custom role for a SQL console user and grant it a general role, run the following commands. The email address must match the user's email address in the console. + + + +#### Create `database_developer` and grant permissions {#create-database_developer-and-grant-permissions} + +Create the `database_developer` role and grant `SHOW`, `CREATE`, `ALTER`, and `DELETE` permissions. -1. Create the database_developer role and grant `SHOW`, `CREATE`, `ALTER`, and `DELETE` permissions. - - ```sql - CREATE ROLE OR REPLACE database_developer; - GRANT SHOW ON * TO database_developer; - GRANT CREATE ON * TO database_developer; - GRANT ALTER ON * TO database_developer; - GRANT DELETE ON * TO database_developer; - ``` - -2. Create a role for the SQL console user my.user@domain.com and assign it the database_developer role. 
+```sql +CREATE ROLE OR REPLACE database_developer; +GRANT SHOW ON * TO database_developer; +GRANT CREATE ON * TO database_developer; +GRANT ALTER ON * TO database_developer; +GRANT DELETE ON * TO database_developer; +``` + +#### Create SQL console user role {#create-sql-console-user-role} + +Create a role for the SQL console user my.user@domain.com and assign it the database_developer role. - ```sql - CREATE ROLE OR REPLACE `sql-console-role:my.user@domain.com`; - GRANT database_developer TO `sql-console-role:my.user@domain.com`; - ``` +```sql +CREATE ROLE OR REPLACE `sql-console-role:my.user@domain.com`; +GRANT database_developer TO `sql-console-role:my.user@domain.com`; +``` + + ### SQL console passwordless authentication {#sql-console-passwordless-authentication} SQL console users are created for each session and authenticated using X.509 certificates that are automatically rotated. The user is removed when the session is terminated. When generating access lists for audits, please navigate to the Settings tab for the service in the console and note the SQL console access in addition to the database users that exist in the database. If custom roles are configured, the user's access is listed in the role ending with the user's username. @@ -88,38 +96,46 @@ Users can use a SHA256 hash generator or code function such as `hashlib` in Pyth ### Database access listings with SQL console users {#database-access-listings-with-sql-console-users} The following process can be used to generate a complete access listing across the SQL console and databases in your organization. -1. Run the following queries to get a list of all grants in the database. - - ```sql - SELECT grants.user_name, - grants.role_name, - users.name AS role_member, - grants.access_type, - grants.database, - grants.table - FROM system.grants LEFT OUTER JOIN system.role_grants ON grants.role_name = role_grants.granted_role_name - LEFT OUTER JOIN system.users ON role_grants.user_name = users.name - - UNION ALL - - SELECT grants.user_name, - grants.role_name, - role_grants.role_name AS role_member, - grants.access_type, - grants.database, - grants.table - FROM system.role_grants LEFT OUTER JOIN system.grants ON role_grants.granted_role_name = grants.role_name - WHERE role_grants.user_name is null; - ``` - -2. Associate this list to Console users with access to SQL console. + + +#### Get a list of all database grants {#get-a-list-of-all-database-grants} + +Run the following queries to get a list of all grants in the database. + +```sql +SELECT grants.user_name, +grants.role_name, +users.name AS role_member, +grants.access_type, +grants.database, +grants.table +FROM system.grants LEFT OUTER JOIN system.role_grants ON grants.role_name = role_grants.granted_role_name +LEFT OUTER JOIN system.users ON role_grants.user_name = users.name + +UNION ALL + +SELECT grants.user_name, +grants.role_name, +role_grants.role_name AS role_member, +grants.access_type, +grants.database, +grants.table +FROM system.role_grants LEFT OUTER JOIN system.grants ON role_grants.granted_role_name = grants.role_name +WHERE role_grants.user_name is null; +``` + +#### Associate grant list to Console users with access to SQL console {#associate-grant-list-to-console-users-with-access-to-sql-console} + +Associate this list with Console users that have access to SQL console. - a. Go to the Console. +a. Go to the Console. + +b. Select the relevant service. - b. Select the relevant service. +c. Select Settings on the left. - c. Select Settings on the left. +d. 
Scroll to the SQL console access section. - d. Scroll to the SQL console access section. +e. Click the link for the number of users with access to the database `There are # users with access to this service.` to see the user listing. - e. Click the link for the number of users with access to the database `There are # users with access to this service.` to see the user listing. + \ No newline at end of file diff --git a/docs/cloud/security/cloud-access-management/cloud-authentication.md b/docs/cloud/features/04_security/cloud-access-management/cloud-authentication.md similarity index 100% rename from docs/cloud/security/cloud-access-management/cloud-authentication.md rename to docs/cloud/features/04_security/cloud-access-management/cloud-authentication.md diff --git a/docs/cloud/security/cloud-access-management/index.md b/docs/cloud/features/04_security/cloud-access-management/index.md similarity index 100% rename from docs/cloud/security/cloud-access-management/index.md rename to docs/cloud/features/04_security/cloud-access-management/index.md diff --git a/docs/cloud/security/inviting-new-users.md b/docs/cloud/features/04_security/cloud-access-management/inviting-new-users.md similarity index 100% rename from docs/cloud/security/inviting-new-users.md rename to docs/cloud/features/04_security/cloud-access-management/inviting-new-users.md diff --git a/docs/cloud/security/cmek.md b/docs/cloud/features/04_security/cmek.md similarity index 100% rename from docs/cloud/security/cmek.md rename to docs/cloud/features/04_security/cmek.md diff --git a/docs/cloud/security/cloud-endpoints-api.md b/docs/cloud/features/04_security/connectivity/cloud-endpoints-api.md similarity index 100% rename from docs/cloud/security/cloud-endpoints-api.md rename to docs/cloud/features/04_security/connectivity/cloud-endpoints-api.md diff --git a/docs/cloud/security/connectivity-overview.md b/docs/cloud/features/04_security/connectivity/connectivity-overview.md similarity index 100% rename from docs/cloud/security/connectivity-overview.md rename to docs/cloud/features/04_security/connectivity/connectivity-overview.md diff --git a/docs/cloud/security/aws-privatelink.md b/docs/cloud/features/04_security/connectivity/private_networking/aws-privatelink.md similarity index 100% rename from docs/cloud/security/aws-privatelink.md rename to docs/cloud/features/04_security/connectivity/private_networking/aws-privatelink.md diff --git a/docs/cloud/security/azure-privatelink.md b/docs/cloud/features/04_security/connectivity/private_networking/azure-privatelink.md similarity index 100% rename from docs/cloud/security/azure-privatelink.md rename to docs/cloud/features/04_security/connectivity/private_networking/azure-privatelink.md diff --git a/docs/cloud/security/gcp-private-service-connect.md b/docs/cloud/features/04_security/connectivity/private_networking/gcp-private-service-connect.md similarity index 100% rename from docs/cloud/security/gcp-private-service-connect.md rename to docs/cloud/features/04_security/connectivity/private_networking/gcp-private-service-connect.md diff --git a/docs/cloud/security/private-link-overview.md b/docs/cloud/features/04_security/connectivity/private_networking/private-link-overview.md similarity index 100% rename from docs/cloud/security/private-link-overview.md rename to docs/cloud/features/04_security/connectivity/private_networking/private-link-overview.md diff --git a/docs/cloud/security/setting-ip-filters.md b/docs/cloud/features/04_security/connectivity/setting-ip-filters.md similarity 
index 100% rename from docs/cloud/security/setting-ip-filters.md rename to docs/cloud/features/04_security/connectivity/setting-ip-filters.md diff --git a/docs/cloud/manage/notifications.md b/docs/cloud/features/05_notifications.md similarity index 100% rename from docs/cloud/manage/notifications.md rename to docs/cloud/features/05_notifications.md diff --git a/docs/cloud/support.md b/docs/cloud/features/06_support.md similarity index 88% rename from docs/cloud/support.md rename to docs/cloud/features/06_support.md index 836382cd3c5..e6b73fc87a0 100644 --- a/docs/cloud/support.md +++ b/docs/cloud/features/06_support.md @@ -1,6 +1,6 @@ --- sidebar_label: 'Cloud Support' -title: 'Cloud Support' +title: 'Support' slug: /cloud/support description: 'Learn about Cloud Support' hide_title: true diff --git a/docs/cloud/features/_category_.json b/docs/cloud/features/_category_.json new file mode 100644 index 00000000000..383c8150644 --- /dev/null +++ b/docs/cloud/features/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Features", + "collapsible": true, + "collapsed": true, +} \ No newline at end of file diff --git a/docs/cloud/manage/backups/configurable-backups.md b/docs/cloud/features/backups/configurable-backups.md similarity index 100% rename from docs/cloud/manage/backups/configurable-backups.md rename to docs/cloud/features/backups/configurable-backups.md diff --git a/docs/cloud/manage/backups/export-backups-to-own-cloud-account.md b/docs/cloud/features/backups/export-backups-to-own-cloud-account.md similarity index 100% rename from docs/cloud/manage/backups/export-backups-to-own-cloud-account.md rename to docs/cloud/features/backups/export-backups-to-own-cloud-account.md diff --git a/docs/cloud/manage/backups/index.md b/docs/cloud/features/backups/index.md similarity index 100% rename from docs/cloud/manage/backups/index.md rename to docs/cloud/features/backups/index.md diff --git a/docs/cloud/manage/backups/overview.md b/docs/cloud/features/backups/overview.md similarity index 100% rename from docs/cloud/manage/backups/overview.md rename to docs/cloud/features/backups/overview.md diff --git a/docs/cloud/manage/hyperdx.md b/docs/cloud/features/hyperdx.md similarity index 98% rename from docs/cloud/manage/hyperdx.md rename to docs/cloud/features/hyperdx.md index 7e56e90d279..71e5cee6102 100644 --- a/docs/cloud/manage/hyperdx.md +++ b/docs/cloud/features/hyperdx.md @@ -15,7 +15,7 @@ HyperDX is the user interface for [**ClickStack**](/use-cases/observability/clic HyperDX is a purpose-built frontend for exploring and visualizing observability data, supporting both Lucene-style and SQL queries, interactive dashboards, alerting, trace exploration, and more—all optimized for ClickHouse as the backend. -HyperDX in ClickHouse Cloud allows users to enjoy a more turnkey ClickStack experience - no infrastructure to manage, no separate authentication to configure. +HyperDX in ClickHouse Cloud allows users to enjoy a more turnkey ClickStack experience - no infrastructure to manage, no separate authentication to configure. HyperDX can be launched with a single click and connected to your data - fully integrated into the ClickHouse Cloud authentication system for seamless, secure access to your observability insights. 
## Deployment {#main-concepts} diff --git a/docs/cloud/features/index.md b/docs/cloud/features/index.md new file mode 100644 index 00000000000..ce8d1500485 --- /dev/null +++ b/docs/cloud/features/index.md @@ -0,0 +1,7 @@ +--- +sidebar_label: 'Features' +slug: /cloud/features +title: 'Features' +description: 'Table of contents page linking to Cloud features' +--- + diff --git a/docs/cloud/manage/integrations.md b/docs/cloud/features/integrations.md similarity index 100% rename from docs/cloud/manage/integrations.md rename to docs/cloud/features/integrations.md diff --git a/docs/cloud/manage/replica-aware-routing.md b/docs/cloud/features/replica-aware-routing.md similarity index 100% rename from docs/cloud/manage/replica-aware-routing.md rename to docs/cloud/features/replica-aware-routing.md diff --git a/docs/cloud/manage/upgrades.md b/docs/cloud/features/upgrades.md similarity index 100% rename from docs/cloud/manage/upgrades.md rename to docs/cloud/features/upgrades.md diff --git a/docs/cloud/get-started/index.md b/docs/cloud/get-started/index.md deleted file mode 100644 index 3c30f63f149..00000000000 --- a/docs/cloud/get-started/index.md +++ /dev/null @@ -1,17 +0,0 @@ ---- -slug: /cloud/get-started -title: 'Get Started' -description: 'Get Started Table Of Contents' -keywords: ['Cloud Quick Start', 'SQL Console', 'Query Insights', 'Query API Endpoints', 'Dashboards', 'Cloud Support'] ---- - -Welcome to ClickHouse Cloud! Explore the pages below to learn more about what ClickHouse Cloud has to offer. - -| Page | Description | -|--------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------| -| [Overview](/cloud/overview) | Overview of the benefits of using ClickHouse Cloud and what version of ClickHouse is used for it. | -| [SQL Console](/cloud/get-started/sql-console) | Learn about the interactive SQL console available in Cloud | -| [Query Insights](/cloud/get-started/query-insights) | Learn about how Cloud's Query Insights feature makes ClickHouse's built-in query log easier to use through various visualizations and tables. | -| [Query Endpoints](/cloud/get-started/query-endpoints) | Learn about the Query API Endpoints feature which allows you to create an API endpoint directly from any saved SQL query in the ClickHouse Cloud console. | -| [Dashboards](/cloud/manage/dashboards) | Learn about how SQL Console's dashboards feature allows you to collect and share visualizations from saved queries. | -| [Cloud Support](/cloud/support) | Learn more about Support Services for ClickHouse Cloud users and customers. 
| diff --git a/docs/cloud/bestpractices/_category_.yml b/docs/cloud/guides/_category_.yml similarity index 83% rename from docs/cloud/bestpractices/_category_.yml rename to docs/cloud/guides/_category_.yml index 1648e8a79cb..747e5fb1796 100644 --- a/docs/cloud/bestpractices/_category_.yml +++ b/docs/cloud/guides/_category_.yml @@ -1,4 +1,4 @@ -label: 'Best Practices' +label: 'Guides' collapsible: true collapsed: true link: diff --git a/docs/cloud/guides/best_practices/_category_.json b/docs/cloud/guides/best_practices/_category_.json new file mode 100644 index 00000000000..21f95c55bca --- /dev/null +++ b/docs/cloud/guides/best_practices/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Best practices", + "collapsible": true, + "collapsed": true, +} \ No newline at end of file diff --git a/docs/cloud/guides/best_practices/index.md b/docs/cloud/guides/best_practices/index.md new file mode 100644 index 00000000000..4719ea2750a --- /dev/null +++ b/docs/cloud/guides/best_practices/index.md @@ -0,0 +1,22 @@ +--- +slug: /cloud/bestpractices +keywords: ['Cloud', 'Best Practices', 'Bulk Inserts', 'Asynchronous Inserts', 'Avoid Mutations', 'Avoid Nullable Columns', 'Avoid Optimize Final', 'Low Cardinality Partitioning Key', 'Multi Tenancy', 'Usage Limits'] +title: 'Overview' +hide_title: true +description: 'Landing page for Best Practices section in ClickHouse Cloud' +--- + +import TableOfContents from '@site/docs/best-practices/_snippets/_table_of_contents.md'; + +# Best Practices in ClickHouse Cloud {#best-practices-in-clickhouse-cloud} + +This section provides best practices you will want to follow to get the most out of ClickHouse Cloud. + +| Page | Description | +|----------------------------------------------------------|----------------------------------------------------------------------------| +| [Usage Limits](/cloud/bestpractices/usage-limits)| Explore the limits of ClickHouse. | +| [Multi tenancy](/cloud/bestpractices/multi-tenancy)| Learn about different strategies to implement multi-tenancy. | + +These are in addition to the standard best practices which apply to all deployments of ClickHouse. + + \ No newline at end of file diff --git a/docs/cloud/bestpractices/multitenancy.md b/docs/cloud/guides/best_practices/multitenancy.md similarity index 99% rename from docs/cloud/bestpractices/multitenancy.md rename to docs/cloud/guides/best_practices/multitenancy.md index 5289a09b067..5f7df65427a 100644 --- a/docs/cloud/bestpractices/multitenancy.md +++ b/docs/cloud/guides/best_practices/multitenancy.md @@ -1,6 +1,6 @@ --- slug: /cloud/bestpractices/multi-tenancy -sidebar_label: 'Implement multi tenancy' +sidebar_label: 'Multi tenancy' title: 'Multi tenancy' description: 'Best practices to implement multi tenancy' --- diff --git a/docs/cloud/guides/best_practices/usagelimits.md b/docs/cloud/guides/best_practices/usagelimits.md new file mode 100644 index 00000000000..af49f5956be --- /dev/null +++ b/docs/cloud/guides/best_practices/usagelimits.md @@ -0,0 +1,40 @@ +--- +slug: /cloud/bestpractices/usage-limits +sidebar_label: 'Service limits' +title: 'Usage limits' +description: 'Describes the recommended usage limits in ClickHouse Cloud' +--- + +While ClickHouse is known for its speed and reliability, optimal performance is +achieved within certain operating parameters. For example, having too many tables, +databases or parts could negatively impact performance. To avoid this, Clickhouse +Cloud has guardrails set up for several types of items. You can find details of +these guardrails below. 
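+As a rough way to see where a service currently stands relative to these guardrails, the relevant objects can be counted from the system tables, for example:
+
+```sql
+-- Rough sketch: current object counts to compare against the limits listed below
+SELECT
+    (SELECT count() FROM system.databases)          AS databases,
+    (SELECT count() FROM system.tables)             AS tables,
+    (SELECT count() FROM system.parts WHERE active) AS active_parts;
+```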
+ +:::tip +If you've run up against one of these guardrails, it's possible that you are +implementing your use case in an unoptimized way. Contact our support team and +we will gladly help you refine your use case to avoid exceeding the guardrails +or look together at how we can increase them in a controlled manner. +::: + +| Dimension | Limit | +|-------------------------------|------------------------------------------------------------| +| **Databases** | 1000 | +| **Tables** | 5000 | +| **Columns** | ∼1000 (wide format is preferred to compact) | +| **Partitions** | 50k | +| **Parts** | 100k across the entire instance | +| **Part size** | 150gb | +| **Services per organization** | 20 (soft) | +| **Services per warehouse** | 5 (soft) | +| **Low cardinality** | 10k or less | +| **Primary keys in a table** | 4-5 that sufficiently filter down the data | +| **Query concurrency** | 1000 | +| **Batch ingest** | anything > 1M will be split by the system in 1M row blocks | + +:::note +For Single Replica Services, the maximum number of databases is restricted to +100, and the maximum number of tables is restricted to 500. In addition, storage +for Basic Tier Services is limited to 1 TB. +::: diff --git a/docs/cloud/reference/cloud-compatibility.md b/docs/cloud/guides/cloud-compatibility.md similarity index 99% rename from docs/cloud/reference/cloud-compatibility.md rename to docs/cloud/guides/cloud-compatibility.md index 86dafbfefd5..59c238c9c08 100644 --- a/docs/cloud/reference/cloud-compatibility.md +++ b/docs/cloud/guides/cloud-compatibility.md @@ -1,6 +1,6 @@ --- slug: /whats-new/cloud-compatibility -sidebar_label: 'Cloud Compatibility' +sidebar_label: 'Cloud compatibility' title: 'Cloud Compatibility' description: 'This guide provides an overview of what to expect functionally and operationally in ClickHouse Cloud.' 
--- diff --git a/docs/cloud/guides/index.md b/docs/cloud/guides/index.md new file mode 100644 index 00000000000..2355ca4370c --- /dev/null +++ b/docs/cloud/guides/index.md @@ -0,0 +1,6 @@ +--- +slug: /cloud/guides +title: 'Guides' +hide_title: true +description: 'Table of contents page for the ClickHouse Cloud guides section' +--- \ No newline at end of file diff --git a/docs/cloud/guides/security/_category_.json b/docs/cloud/guides/security/_category_.json new file mode 100644 index 00000000000..aed26fa7f7a --- /dev/null +++ b/docs/cloud/guides/security/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Security", + "collapsible": true, + "collapsed": true, +} \ No newline at end of file diff --git a/docs/cloud/guides/security/cloud_access_management/_category_.json b/docs/cloud/guides/security/cloud_access_management/_category_.json new file mode 100644 index 00000000000..abfdcebed27 --- /dev/null +++ b/docs/cloud/guides/security/cloud_access_management/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Cloud Access Management", + "collapsible": true, + "collapsed": true, +} \ No newline at end of file diff --git a/docs/cloud/security/common-access-management-queries.md b/docs/cloud/guides/security/cloud_access_management/common-access-management-queries.md similarity index 100% rename from docs/cloud/security/common-access-management-queries.md rename to docs/cloud/guides/security/cloud_access_management/common-access-management-queries.md diff --git a/docs/cloud/security/saml-sso-setup.md b/docs/cloud/guides/security/cloud_access_management/saml-sso-setup.md similarity index 100% rename from docs/cloud/security/saml-sso-setup.md rename to docs/cloud/guides/security/cloud_access_management/saml-sso-setup.md diff --git a/docs/cloud/guides/security/connectivity/_category_.json b/docs/cloud/guides/security/connectivity/_category_.json new file mode 100644 index 00000000000..6e137e0592d --- /dev/null +++ b/docs/cloud/guides/security/connectivity/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Connectivity", + "collapsible": true, + "collapsed": true, +} \ No newline at end of file diff --git a/docs/cloud/security/accessing-s3-data-securely.md b/docs/cloud/guides/security/connectivity/accessing-s3-data-securely.md similarity index 100% rename from docs/cloud/security/accessing-s3-data-securely.md rename to docs/cloud/guides/security/connectivity/accessing-s3-data-securely.md diff --git a/docs/cloud/manage/_category_.yml b/docs/cloud/manage/_category_.yml deleted file mode 100644 index 59089856c86..00000000000 --- a/docs/cloud/manage/_category_.yml +++ /dev/null @@ -1,6 +0,0 @@ -label: 'Manage Cloud' -collapsible: true -collapsed: true -link: - type: generated-index - title: Manage ClickHouse Cloud diff --git a/docs/cloud/manage/index.md b/docs/cloud/manage/index.md deleted file mode 100644 index 46c407d0c6b..00000000000 --- a/docs/cloud/manage/index.md +++ /dev/null @@ -1,30 +0,0 @@ ---- -slug: /cloud/manage -keywords: ['AWS', 'Cloud', 'serverless', 'management'] -title: 'Overview' -hide_title: true -description: 'Overview page for Managing Cloud' ---- - -# Managing Cloud - -In this section of the docs you will find all the information you may need about managing ClickHouse cloud. 
This section contains the following pages: - -| Page | Description | -|-----------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------| -| [ClickHouse Cloud Tiers](/cloud/manage/cloud-tiers) | Describes the different cloud tiers, their features, and considerations for choosing the right one. | -| [Integrations](/manage/integrations) | Covers ClickHouse Cloud's built-in integrations, custom integrations, and integrations that are not supported. | -| [Backups](/cloud/manage/backups) | Describes how backups work in ClickHouse Cloud, what options you have to configure backups for your service, and how to restore from a backup. | -| [Monitoring](/integrations/prometheus) | How to integrate Prometheus as a way to monitor ClickHouse cloud. | -| [Billing](/cloud/manage/billing/overview) | Explains the pricing model for ClickHouse Cloud, including the factors that affect the cost of your service. | -| [Configuring Settings](/manage/settings) | Describes how to configure settings for ClickHouse Cloud. | -| [Replica-aware Routing](/manage/replica-aware-routing) | Explains what Replica-aware Routing in ClickHouse Cloud is, its limitations, and how to configure it. | -| [Automatic Scaling](/manage/scaling) | Explains how ClickHouse Cloud services can be scaled manually or automatically based on your resource needs. | -| [Service Uptime and SLA](/cloud/manage/service-uptime) | Information about service uptime and Service Level Agreements offered for production instances. | -| [Notifications](/cloud/notifications) | Shows how ClickHouse Cloud notifications are received and how they can be customized. | -| [Upgrades](/manage/updates) | Information on how upgrades are rolled out in ClickHouse Cloud. | -| [Delete Account](/cloud/manage/close_account) | Information on how to close or delete your account when necessary. | -| [Programmatic API Access with Postman](/cloud/manage/postman) | A guide to help you test the ClickHouse API using Postman. | -| [Troubleshooting](/faq/troubleshooting) | A collection of commonly encountered issues and how to troubleshoot them. | -| [Data Transfer](./network-data-transfer.mdx) | Learn more about how ClickHouse Cloud meters data transferred ingress and egress. | -| [Jan 2025 Changes FAQ](./jan2025_faq/index.md) | Learn more about changes to Cloud introduced in Jan 2025. | diff --git a/docs/cloud/manage/network-data-transfer.mdx b/docs/cloud/manage/network-data-transfer.mdx deleted file mode 100644 index 92725e6015c..00000000000 --- a/docs/cloud/manage/network-data-transfer.mdx +++ /dev/null @@ -1,36 +0,0 @@ ---- -sidebar_label: 'Data Transfer' -slug: /cloud/manage/network-data-transfer -title: 'Data Transfer' -description: 'Learn more about how ClickHouse Cloud meters data transferred ingress and egress' ---- - -import NetworkPricing from '@site/docs/cloud/manage/_snippets/_network_transfer_rates.md'; - -ClickHouse Cloud meters data transferred ingress and egress. -This includes any data in and out of ClickHouse Cloud as well as any intra-region and cross-region data transfer. -This usage is tracked at the service level. Based on this usage, customers incur data transfer charges that are then added to their monthly bill. - -ClickHouse Cloud charges for: -- Data egress from ClickHouse Cloud to the public Internet, including to other regions of other cloud providers. 
-- Data egress to another region in the same cloud provider. - -There are no charges for intra-region data transfer or Private Link/Private Service Connect use and data transfer. -However, we reserve the right to implement additional data transfer pricing dimensions if we see usage patterns that impact our ability to charge users appropriately. - -Data transfer charges vary by Cloud Service Provider (CSP) and region, and prices will not be tiered as usage increases. Public internet egress pricing is based only on the origin region. -Inter-region (or cross-region) pricing depends on both the origin and destination regions. Data transfer pricing does **not** vary between organizational tiers. - -**Best Practices to minimize Data Transfer Costs** - -There are some patterns to keep in mind when ingressing and egressing data to minimize data transfer costs. -1. When ingressing or egressing data from Clickhouse Cloud, use compression where possible, to minimize the amount of data transferred and the associated cost. -2. Be aware that when doing an INSERT over the native protocol with non-inlined values (e.g. INSERT INTO [TABLE] FROM INFILE [FILE] FORMAT NATIVE), ClickHouse clients pull metadata from servers to pack the data. If the metadata is larger than the INSERT payload, you might counterintuitively see more egress than there is ingress from the server perspective. If this is unacceptable, consider inlining data with VALUES syntax or using the HTTP protocol. - -The tables below shows how data transfer charges for egress vary across public internet or cross-region by cloud provider and region. - -:::note -ClickHouse Cloud meters inter-region usage in terms of tiers, Tier 1 through Tier 4, depending on the origin and destination regions. The table below shows the tier for each combination of inter-region data transfer. In the Billing usage screen on ClickHouse Cloud you will see data transfer usage broken out by tiers. -::: - - diff --git a/docs/cloud/onboard/01_discover/01_what_is.md b/docs/cloud/onboard/01_discover/01_what_is.md new file mode 100644 index 00000000000..4814300e577 --- /dev/null +++ b/docs/cloud/onboard/01_discover/01_what_is.md @@ -0,0 +1,46 @@ +--- +slug: /cloud/overview +title: 'Introduction' +keywords: ['clickhouse cloud', 'what is clickhouse cloud', 'clickhouse cloud overview', 'clickhouse cloud features'] +hide_title: true +--- + +## What is ClickHouse Cloud? {#what-is-clickhouse-cloud} + +ClickHouse Cloud is a fully managed cloud service created by the original creators +of ClickHouse, the fastest and most popular open-source columnar online analytical +processing database. + +With Cloud, infrastructure, maintenance, scaling, and operations are taken care of +for you, so that you can focus on what matters most to you, which is building value +for your organization and your customers faster. + +## Benefits of ClickHouse Cloud {#benefits-of-clickhouse-cloud} + +ClickHouse Cloud offers several major benefits over the open-source version: + +- **Fast time to value**: Start building instantly without having to size and scale your cluster. +- **Seamless scaling**: Automatic scaling adjusts to variable workloads so you don't have to over-provision for peak usage. +- **Serverless operations**: Sit back while we take care of sizing, scaling, security, reliability, and upgrades. +- **Transparent pricing**: Pay only for what you use, with resource reservations and scaling controls. +- **Total cost of ownership**: Best price / performance ratio and low administrative overhead. 
+- **Broad ecosystem**: Bring your favorite data connectors, visualization tools, SQL and language clients with you. + +## OSS vs ClickHouse Cloud comparison {#oss-vs-clickhouse-cloud} + +| Feature | Benefits | OSS ClickHouse | ClickHouse Cloud | +|--------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------|-------------------| +| **Deployment modes** | ClickHouse provides the flexibility to self-manage with open-source or deploy in the cloud. Use ClickHouse local for local files without a server or chDB to embed ClickHouse directly into your application. | ✅ | ✅ | +| **Storage** | As an open-source and cloud-hosted product, ClickHouse can be deployed in both shared-disk and shared-nothing architectures. | ✅ | ✅ | +| **Monitoring and alerting** | Monitoring and alerting about the status of your services is critical to ensuring optimal performance and a proactive approach to detect and triage potential issues. | ✅ | ✅ | +| **ClickPipes** | ClickPipes is ClickHouse's managed ingestion pipeline that allows you to seamlessly connect your external data sources like databases, APIs, and streaming services into ClickHouse Cloud, eliminating the need for managing pipelines, custom jobs, or ETL processes. It supports workloads of all sizes. | ❌ | ✅ | +| **Pre-built integrations** | ClickHouse provides pre-built integrations that connect ClickHouse to popular tools and services such as data lakes, SQL and language clients, visualization libraries, and more. | ❌ | ✅ | +| **SQL console** | The SQL console offers a fast, intuitive way to connect, explore, and query ClickHouse databases, featuring a slick caption, query interface, data import tools, visualizations, collaboration features, and GenAI-powered SQL assistance. | ❌ | ✅ | +| **Compliance** | ClickHouse Cloud compliance includes CCPA, EU-US DPF, GDPR, HIPAA, ISO 27001, ISO 27001 SoA, PCI DSS, SOC2. ClickHouse Cloud's security, availability, processing integrity, and confidentiality processes are all independently audited. Details: trust.clickhouse.com. | ❌ | ✅ | +| **Enterprise-grade security** | Support for advanced security features such as SSO, multi-factor authentication, role-based access control (RBAC), private and secure connections with support for Private Link and Private Service Connect, IP filtering, customer-managed encryption keys (CMEK), and more. | ❌ | ✅ | +| **Scaling and optimization** | Seamlessly scales up or down based on workload, supporting both horizontal and vertical scaling. With automated backups, replication, and high availability, ClickHouse, it provides users with optimal resource allocation. | ❌ | ✅ | +| **Support services** | Our best-in-class support services and open-source community resources provide coverage for whichever deployment model you choose. | ❌ | ✅ | +| **Database upgrades** | Regular database upgrades are essential to establish a strong security posture and access the latest features and performance improvements. | ❌ | ✅ | +| **Backups** | Backups and restore functionality ensures data durability and supports graceful recovery in the event of outages or other disruptions. 
| ❌ | ✅ | +| **Compute-compute separation** | Users can scale compute resources independently of storage, so teams and workloads can share the same storage and maintain dedicated compute resources. This ensures that the performance of one workload doesn't interfere with another, enhancing flexibility, performance, and cost-efficiency. | ❌ | ✅ | +| **Managed services** | With a cloud-managed service, teams can focus on business outcomes and accelerate time-to-market without having to worry about the operational overhead of sizing, setup, and maintenance of ClickHouse. | ❌ | ✅ | diff --git a/docs/cloud/onboard/01_discover/02_use_cases/00_overview.md b/docs/cloud/onboard/01_discover/02_use_cases/00_overview.md new file mode 100644 index 00000000000..623b8fc605f --- /dev/null +++ b/docs/cloud/onboard/01_discover/02_use_cases/00_overview.md @@ -0,0 +1,20 @@ +--- +slug: /cloud/get-started/cloud/use-cases/overview +title: 'Building on ClickHouse Cloud' +keywords: ['use cases', 'Cloud'] +sidebar_label: 'Overview' +--- + +ClickHouse Cloud is suitable for use as both a **primary data store** and as an **analytics +layer**. + +ClickHouse's columnar architecture, vectorized processing, and cloud-native design +make it uniquely suited for analytical workloads that require both speed and scale. +Broadly, the most common use cases for ClickHouse Cloud are: + +| Use case | Description | +|----------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| [Real-Time analytics](/cloud/get-started/cloud/use-cases/real-time-analytics) | ClickHouse Cloud excels at real-time analytics by delivering sub-second query responses on billions of rows through its columnar storage architecture and vectorized execution engine. The platform handles high-throughput data ingestion of millions of events per second while enabling direct queries on raw data without requiring pre-aggregation. Materialized Views provide real-time aggregations and pre-computed results, while approximate functions for quantiles and counts deliver instant insights perfect for interactive dashboards and real-time decision making.| +| [Data Lake and Warehouse](/cloud/get-started/cloud/use-cases/data_lake_and_warehouse) | As a modern data warehouse solution, ClickHouse Cloud combines native cloud storage integration with S3, GCS, and Azure Blob for cost-effective storage with schema-on-read flexibility that supports semi-structured data like JSON and nested types. The platform achieves massive compression ratios of 10:1 or better, significantly reducing storage costs, while its compute-storage separation architecture allows independent scaling and cost optimization. Users benefit from a standard SQL interface enhanced with advanced analytics functions, making it easy to query and analyze data at any scale.| +| [Observability](/cloud/get-started/cloud/use-cases/observability) | ClickHouse Cloud is purpose-built for observability workloads, featuring specialized engines and functions optimized for time-series data that can ingest and query terabytes of logs, metrics, and traces with ease. 
Through ClickStack, ClickHouse's comprehensive observability solution, organizations can break down the traditional three silos of logs, metrics, and traces by unifying all observability data in a single platform, enabling correlated analysis and eliminating the complexity of managing separate systems. This unified approach makes it ideal for application performance monitoring, infrastructure monitoring, and security event analysis at enterprise scale, with ClickStack providing the tools and integrations needed for complete observability workflows without data silos.| +| [Machine Learning and GenAI](/cloud/get-started/cloud/use-cases/machine_learning_and_gen_ai) | ClickHouse Cloud powers modern AI applications through four key capabilities: native vector similarity search for RAG applications and embedding storage, comprehensive feature store functionality for real-time ML feature engineering and serving, specialized LLM observability for tracking model performance and usage patterns, and integrated MCP (Model Context Protocol) server support that enables AI agents and LLMs to directly query and analyze data. This unified platform eliminates the complexity of managing separate systems for vector databases, feature stores, and observability tools, providing a single solution for the entire AI/ML data pipeline with ClickHouse's signature performance and scalability.| diff --git a/docs/cloud/onboard/01_discover/02_use_cases/01_real-time-analytics.md b/docs/cloud/onboard/01_discover/02_use_cases/01_real-time-analytics.md new file mode 100644 index 00000000000..fe1e8eada64 --- /dev/null +++ b/docs/cloud/onboard/01_discover/02_use_cases/01_real-time-analytics.md @@ -0,0 +1,159 @@ +--- +slug: /cloud/get-started/cloud/use-cases/real-time-analytics +title: 'Real-time analytics' +keywords: ['use cases', 'real-time analytics'] +sidebar_label: 'Real-time analytics' +--- + +import Image from '@theme/IdealImage'; +import rta_0 from '@site/static/images/cloud/onboard/discover/use_cases/0_rta.png'; +import rta_1 from '@site/static/images/cloud/onboard/discover/use_cases/1_rta.png'; +import rta_2 from '@site/static/images/cloud/onboard/discover/use_cases/2_rta.png'; +import rta_3 from '@site/static/images/cloud/onboard/discover/use_cases/3_rta.png'; + + + +## What is real-time analytics? {#what-is-real-time-analytics} + +Real-time analytics refers to data processing that delivers insights to end users +and customers as soon as the data is generated. It differs from traditional or +batch analytics, where data is collected in batches and processed, often a long +time after it was generated. + +Real-time analytics systems are built on top of event streams, which consist of +a series of events ordered in time. An event is something that’s already happened. +It could be the addition of an item to the shopping cart on an e-commerce website, +the emission of a reading from an Internet of Things (IoT) sensor, or a shot on +goal in a football (soccer) match. + +An event (from an imaginary IoT sensor) is shown below, as an example: + +```json +{ + "deviceId": "sensor-001", + "timestamp": "2023-10-05T14:30:00Z", + "eventType": "temperatureAlert", + "data": { + "temperature": 28.5, + "unit": "Celsius", + "thresholdExceeded": true + } +} +``` + +Organizations can discover insights about their customers by aggregating and +analyzing events like this. This has traditionally been done using batch analytics, +and in the next section, we’ll compare batch and real-time analytics. 
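+To make the idea of aggregating events concrete, here is a minimal, hedged sketch of
+how events like the sensor reading above could be stored and aggregated in ClickHouse.
+The table and column names are hypothetical and chosen only for illustration:
+
+```sql
+-- Hypothetical table for the IoT events shown above
+CREATE TABLE iot_events (
+    device_id String,
+    timestamp DateTime,
+    event_type LowCardinality(String),
+    temperature Float64,
+    threshold_exceeded Bool
+)
+ENGINE = MergeTree
+ORDER BY (device_id, timestamp);
+
+-- Hourly average temperature and alert count per device
+SELECT
+    device_id,
+    toStartOfHour(timestamp) AS hour,
+    avg(temperature) AS avg_temperature,
+    countIf(threshold_exceeded) AS alerts
+FROM iot_events
+GROUP BY device_id, hour
+ORDER BY device_id, hour;
+```
+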
+
+## Real-time analytics vs batch analytics {#real-time-analytics-vs-batch-analytics}
+
+The diagram below shows what a typical batch analytics system would look like
+from the perspective of an individual event:
+
+batch analytics diagram
+
+You can see that there’s quite a big gap from when the event happens until we
+process and gain some insight from it. Traditionally, this was the only means of
+data analysis, and we’d need to create artificial time boundaries to process
+the data in batches. For example, we might process all the data collected at the
+end of a day. This worked for many use cases, but for others, it’s sub-optimal
+because we’re working with stale data, and it doesn’t allow us to react to the
+data quickly enough.
+
+By contrast, in real-time analytics systems, we react to an event as soon as it
+happens, as shown in the following diagram:
+
+Real-time analytics diagram
+
+We can now derive insights from events almost as soon as they’re generated. But
+why is this useful?
+
+## Benefits of real-time analytics {#benefits-of-real-time-analytics}
+
+In today's fast-paced world, organizations rely on real-time analytics to stay
+agile and responsive to ever-changing conditions. A real-time analytics system
+can benefit a business in many ways.
+
+### Better decision-making {#better-decision-making}
+
+Decision-making can be improved by having access to actionable insights via
+real-time analytics. When business operators can see events as they’re happening,
+it makes it much easier to make timely interventions.
+
+For example, if we make changes to an application and want to know whether it’s
+having a detrimental effect on the user experience, we want to know this as
+quickly as possible so that we can revert the changes if necessary. With a less
+real-time approach, we might have to wait until the next day to do this
+analysis, by which time we’ll have a lot of unhappy users.
+
+### New products and revenue streams {#new-products-and-revenue-streams}
+
+Real-time analytics can help businesses generate new revenue streams. Organizations
+can develop new data-centered products and services that give users access to
+analytical querying capabilities. These products are often compelling enough for
+users to pay for access.
+
+In addition, existing applications can be made stickier, increasing user
+engagement and retention. This will result in more application use, creating more
+revenue for the organization.
+
+### Improved customer experience {#improved-customer-experience}
+
+With real-time analytics, businesses can gain instant insights into customer
+behavior, preferences, and needs. This lets businesses offer timely assistance,
+personalize interactions, and create more engaging experiences that keep
+customers returning.
+
+## Real-time analytics use cases {#real-time-analytics-use-cases}
+
+The actual value of real-time analytics becomes evident when we consider its
+practical applications. Let’s examine some of them.
+
+### Fraud detection {#fraud-detection}
+
+Fraud detection is about detecting fraudulent patterns, ranging from fake accounts
+to payment fraud. We want to detect this fraud as quickly as possible, flagging
+suspicious activities, blocking transactions, and disabling accounts when necessary.
+
+This use case stretches across industries: healthcare, digital banking, financial
+services, retail, and more.
+
+[Instacart](https://www.instacart.com/) is North America's leading online grocery
+company, with millions of active customers and shoppers. It uses ClickHouse as
+part of Yoda, its fraud detection platform. In addition to the general types of
+fraud described above, it also tries to detect collusion between customers and
+shoppers.
+
+Real-time analytics for fraud detection
+
+They identified the following characteristics of ClickHouse that enable real-time
+fraud detection:
+
+> ClickHouse supports LSM-tree based MergeTree family engines.
+> These are optimized for writing which is suitable for ingesting large amounts
+> of data in real-time.
+
+> ClickHouse is designed and optimized explicitly for analytical queries. This
+> fits perfectly with the needs of applications where data is continuously
+> analyzed for patterns that might indicate fraud.
+
+### Time-sensitive decision making {#time-sensitive-decision-making}
+
+Time-sensitive decision-making refers to situations where users or organizations
+need to make informed choices quickly based on the most current information
+available. Real-time analytics empowers users to make informed choices in
+dynamic environments, whether they're traders reacting to market fluctuations,
+consumers making purchasing decisions, or professionals adapting to real-time
+operational changes.
+
+Coinhall provides its users with real-time insights into price movements over
+time via a candlestick chart, which shows the open, high, low, and close prices
+for each trading period. They needed to be able to run these types of queries
+quickly and with a large number of concurrent users.
+
+Real-time analytics for time-sensitive decision making
+
+> In terms of performance, ClickHouse was the clear winner, executing candlestick queries in 20 milliseconds, compared
+> to 400 milliseconds or more for the other databases. It ran latest-price queries in 8 milliseconds, outpacing the
+> next-best performance (SingleStore) which came in at 45 milliseconds. Finally, it handled ASOF JOIN queries in
+> 50 milliseconds, while Snowflake took 20 minutes and Rockset timed out.
diff --git a/docs/cloud/onboard/01_discover/02_use_cases/02_observability.md b/docs/cloud/onboard/01_discover/02_use_cases/02_observability.md
new file mode 100644
index 00000000000..7ec9034824e
--- /dev/null
+++ b/docs/cloud/onboard/01_discover/02_use_cases/02_observability.md
@@ -0,0 +1,229 @@
+---
+slug: /cloud/get-started/cloud/use-cases/observability
+title: 'Observability'
+keywords: ['use cases', 'observability']
+sidebar_label: 'Observability'
+---
+
+Modern software systems are complex. Microservices, cloud infrastructure, and
+distributed systems have made it increasingly difficult to understand what's
+happening inside our applications. When something goes wrong, teams need to know
+where and why quickly.
+
+This is where observability comes in. It's evolved from simple system monitoring
+into a comprehensive approach to understanding system behavior. However,
+implementing effective observability isn't straightforward - it requires
+understanding technical concepts and organizational challenges.
+
+## What is Observability? {#what-is-observability}
+
+Observability is the practice of understanding a system's internal state by examining its outputs.
+In software systems, this means understanding what's happening inside your
+applications and infrastructure through the data they generate.
+
+This field has evolved significantly and can be understood through two distinct
+generations of observability approaches.
+ +The first generation, often called Observability 1.0, was built around the +traditional "three pillars" approach of metrics, logs, and traces. This approach +required multiple tools and data stores for different types of telemetry. It +often forced engineers to pre-define what they wanted to measure, making it +costly and complex to maintain multiple systems. + +Modern observability, or Observability 2.0, takes a fundamentally different +approach. It's based on collecting wide, structured events for each unit of work +(e.g., an HTTP request and response) in our system. This approach captures +high-cardinality data, such as user IDs, request IDs, Git commit hashes, +instance IDs, Kubernetes pod names, specific route parameters, and vendor +transaction IDs. A rule of thumb is adding a piece of metadata if it could help +us understand how the system behaves. + +This rich data collection enables dynamic slicing and dicing of data without +pre-defining metrics. Teams can derive metrics, traces, and other visualizations +from this base data, allowing them to answer complex questions about system +behavior that weren't anticipated when the instrumentation was first added. + +However, implementing modern observability capabilities presents its challenges. +Organizations need reliable ways to collect, process, and export this rich +telemetry data across diverse systems and technologies. While modern approaches +have evolved beyond traditional boundaries, understanding the fundamental +building blocks of observability remains crucial. + +## The three pillars of observability {#three-pillars-of-observability} + +To better understand how observability has evolved and works in practice, let's +examine the three pillars of observability - logs, metrics, and traces. + +While modern observability has moved beyond treating these as separate concerns, +they remain fundamental concepts for understanding different aspects of system +behavior. + +1. **Logs** - Text-based records of discrete events that occur within a system. +These provide detailed context about specific occurrences, errors, and state changes. +2. **Metrics** - Numerical measurements collected over time. These include counters, +gauges, and histograms that help track system performance, resource usage, and business KPIs. +3. **Traces** - Records that track the journey of requests as they flow through distributed systems. +These help understand the relationships between services and identify performance bottlenecks. + +These pillars enable teams to monitor, troubleshoot, and optimize their systems. +However, the real power comes from understanding how to effectively collect, +analyze, and correlate data across all three pillars to gain meaningful insights +into system behavior. + +## The benefits of observability {#the-benefits-of-observability} + +While the technical aspects of observability - logs, metrics, and traces - are +well understood, the business benefits are equally important to consider. + +In their book ["Observability Engineering"](https://clickhouse.com/engineering-resources/observability#:~:text=Observability%20Engineering) +(O'Reilly, 2022), Charity Majors, Liz Fong-Jones, and George Miranda draw from +industry research and anecdotal feedback to identify four key business benefits +that organizations can expect from implementing proper observability practices. 
+Let's examine these benefits:
+
+### Higher incremental revenue {#higher-incremental-revenue}
+
+The authors note that observability tools that help teams improve uptime and
+performance can lead to increased incremental revenue through improved code quality.
+This manifests in several ways:
+
+1. Improved customer experience: Fast problem resolution and prevention of service
+degradation leads to higher customer satisfaction and retention
+2. Increased system reliability: Better uptime means more successful transactions
+and fewer lost business opportunities
+3. Enhanced performance: The ability to identify and optimize performance bottlenecks
+helps maintain responsive services that keep customers engaged
+4. Competitive advantage: Organizations that can maintain high service quality
+through comprehensive monitoring and quick issue resolution often gain an edge
+over competitors
+
+### Cost savings from faster incident response {#cost-savings-from-faster-incident-response}
+
+One of the most immediate benefits of observability is reduced labor costs
+through faster detection and resolution of issues. This comes from:
+
+* Reduced Mean Time to Detect (MTTD) and Mean Time to Resolve (MTTR)
+* Improved query response times, enabling faster investigation
+* Quicker identification of performance bottlenecks
+* Reduced time spent on-call
+* Fewer resources wasted on unnecessary rollbacks
+
+We see this in practice - trip.com built their observability system with ClickHouse
+and achieved query speeds 4-30x faster than their previous solution, with 90% of
+queries completing in under 300ms, enabling rapid issue investigation.
+
+### Cost savings from incidents avoided {#cost-savings-from-incidents-avoided}
+
+Observability doesn't just help resolve issues faster - it helps prevent them entirely.
+The authors emphasize how teams can prevent critical issues by:
+
+* Identifying potential problems before they become critical
+* Analyzing patterns to prevent recurring issues
+* Understanding system behavior under different conditions
+* Proactively addressing performance bottlenecks
+* Making data-driven decisions about system improvements
+
+ClickHouse's [own observability platform, LogHouse](https://clickhouse.com/blog/building-a-logging-platform-with-clickhouse-and-saving-millions-over-datadog),
+demonstrates this. It enables our core engineers to search historical patterns across all clusters, helping prevent
+recurring issues.
+
+### Cost savings from decreased employee churn {#cost-savings-from-decreased-employee-churn}
+
+One of the most overlooked benefits is the impact on team satisfaction and retention.
+The authors highlight how observability leads to:
+
+* Improved job satisfaction through better tooling
+* Decreased developer burnout from fewer unresolved issues
+* Reduced alert fatigue through better signal-to-noise ratio
+* Lower on-call stress due to better incident management
+* Increased team confidence in system reliability
+
+We see this in practice - when [Fastly migrated to ClickHouse](https://clickhouse.com/videos/scaling-graphite-with-clickhouse),
+their engineers were amazed by the improvement in query performance, noting:
+
+> "I couldn't believe it. I actually had to go back a couple of times just to
+> make sure that I was querying it properly... this is coming back too fast.
+> This doesn't make sense."
+ +As the authors emphasize, while the specific measures of these benefits may vary +depending on the tools and implementation, these fundamental improvements can be +expected across organizations that adopt robust observability practices. The key +is choosing and implementing the right tools effectively to maximize these benefits. + +Achieving these benefits requires overcoming several significant hurdles. Even +organizations that understand the value of observability often find that +implementation presents unexpected complexities and challenges that must be +carefully navigated. + +## Challenges in implementing observability {#challenges-in-implementing-observability} + +Implementing observability within an organization is a transformative step toward +gaining deeper insights into system performance and reliability. However, this +journey is not without its challenges. As organizations strive to harness the +full potential of observability, they encounter various obstacles that can impede +progress. Let’s go through some of them. + +### Data volume and scalability {#data-volume-and-scalability} + +One of the primary hurdles in implementing observability is managing the sheer +volume and scalability of telemetry data generated by modern systems. As +organizations grow, so does the data they need to monitor, necessitating +solutions that efficiently handle large-scale data ingestion and +real-time analytics. + +### Integration with existing systems {#integration-with-existing-systems} + +Integration with existing systems poses another significant challenge. Many +organizations operate in heterogeneous environments with diverse technologies, +making it essential for observability tools to seamlessly integrate with current +infrastructure. Open standards are crucial in facilitating this integration, +ensuring interoperability and reducing the complexity of deploying observability +solutions across varied tech stacks. + +### Skill gaps {#skill-gaps} + +Skill gaps can also impede the successful implementation of observability. The +transition to advanced observability solutions often requires specialized +knowledge of data analytics and specific tools. Teams may need to invest in +training or hiring to bridge these gaps and fully leverage the capabilities of +their observability platforms. + +### Cost management {#cost-management} + +Cost management is critical, as observability solutions can become expensive, +particularly at scale. Organizations must balance the costs of these tools with +the value they provide, seeking cost-effective solutions that offer significant +savings compared to traditional approaches. + +### Data retention and storage {#data-retention-and-storage} + +Data retention and storage management present additional challenges. Deciding +how long to retain observability data without compromising performance or +insights requires careful planning and efficient storage solutions that reduce +storage requirements while maintaining data accessibility. + +### Standardization and vendor lock-in {#standardization-and-vendor-lock-in} + +Ensuring standardization and avoiding vendor lock-in are vital for maintaining +flexibility and adaptability in observability solutions. By adhering to open +standards, organizations can prevent being tied to specific vendors and ensure +their observability stack can evolve with their needs. 
+ +### Security and compliance {#security-and-compliance} + +Security and compliance considerations remain crucial, especially when handling +sensitive data within observability systems. Organizations must ensure that their +observability solutions adhere to relevant regulations and effectively protect +sensitive information. + +These challenges underscore the importance of strategic planning and informed +decision-making in implementing observability solutions that effectively meet +organizational needs. + +To address these challenges, organizations need a well-structured approach to +implementing observability. The standard observability pipeline has evolved to +provide a framework for effectively collecting, processing, and analyzing +telemetry data. One of the earliest and most influential examples of this +evolution comes from Twitter's experience in 2013. diff --git a/docs/cloud/onboard/01_discover/02_use_cases/03_data_lake_and_warehouse.md b/docs/cloud/onboard/01_discover/02_use_cases/03_data_lake_and_warehouse.md new file mode 100644 index 00000000000..acb44128b30 --- /dev/null +++ b/docs/cloud/onboard/01_discover/02_use_cases/03_data_lake_and_warehouse.md @@ -0,0 +1,118 @@ +--- +slug: /cloud/get-started/cloud/use-cases/data_lake_and_warehouse +title: 'Data Lakehouse' +keywords: ['use cases', 'data lake and warehouse'] +sidebar_label: 'Data Lakehouse' +--- + +import Image from '@theme/IdealImage'; +import datalakehouse_01 from '@site/static/images/cloud/onboard/discover/use_cases/datalakehouse_01.png'; + + + +The data lakehouse is a convergent architecture that applies database principles +to data lake infrastructure while maintaining the flexibility and scale of cloud storage systems. + +The lakehouse is not just taking a database apart but building database-like +capabilities onto a fundamentally different foundation (cloud object storage) +that focuses on supporting traditional analytics and modern AI/ML workloads in +a unified platform. + +## What are the components of the data lakehouse? {#components-of-the-data-lakehouse} + +The modern data lakehouse architecture represents a convergence of data warehouse +and data lake technologies, combining the best aspects of both approaches. This +architecture comprises several distinct but interconnected layers providing a +flexible, robust data storage, management, and analysis platform. + +Understanding these components is essential for organizations looking to +implement or optimize their data lakehouse strategy. The layered approach allows +for component substitution and independent evolution of each layer, providing +architectural flexibility and future-proofing. + +Let's explore the core building blocks of a typical data lakehouse architecture +and how they interact to create a cohesive data management platform. + +Components of the data lakehouse + +| Component | Description | +|-------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| **Data sources** | Lakehouse data sources include operational databases, streaming platforms, IoT devices, application logs, and external providers. 
| +| **Query engine** | Processes analytical queries against the data stored in the object storage, leveraging the metadata and optimizations provided by the table format layer. Supports SQL and potentially other query languages to analyze large volumes of data efficiently. | +| **Metadata catalog** | The [data catalog](https://clickhouse.com/engineering-resources/data-catalog) acts as a central repository for metadata, storing and managing table definitions and schemas, partitioning information, and access control policies. Enables data discovery, lineage tracking, and governance across the lakehouse. | +| **Table format layer** | The [table format layer](https://clickhouse.com/engineering-resources/open-table-formats) manages the logical organization of data files into tables, providing database-like features such as ACID transactions, schema enforcement and evolution, time travel capabilities, and performance optimizations like data skipping and clustering. | +| **Object storage** | This layer provides scalable, durable, cost-effective storage for all data files and metadata. It handles the physical persistence of data in an open format, enabling direct access from multiple tools and systems. | +| **Client applications** | Various tools and applications that connect to the lakehouse to query data, visualize insights, or build data products. These can include BI tools, data science notebooks, custom applications, and ETL/ELT tools. | + +## What are the benefits of the data lakehouse? {#benefits-of-the-data-lakehouse} + +The data lakehouse architecture offers several significant advantages when compared +directly to both traditional data warehouses and data lakes: + +### Compared to traditional data warehouses {#compared-to-traditional-data-warehouses} + +| # | Benefit | Description | +|---|--------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| 1 | **Cost efficiency** | Lakehouses leverage inexpensive object storage rather than proprietary storage formats, significantly reducing storage costs compared to data warehouses that charge premium prices for their integrated storage. | +| 2 | **Component flexibility and interchangeability** | The lakehouse architecture allows organizations to substitute different components. Traditional systems require wholesale replacement when requirements change or technology advances, while lakehouses enable incremental evolution by swapping out individual components like query engines or table formats. This flexibility reduces vendor lock-in and allows organizations to adapt to changing needs without disruptive migrations. | +| 3 | **Open format support** | Lakehouses store data in open file formats like Parquet, allowing direct access from various tools without vendor lock-in, unlike proprietary data warehouse formats that restrict access to their ecosystem. | +| 4 | **AI/ML integration** | Lakehouses provide direct access to data for machine learning frameworks and Python/R libraries, whereas data warehouses typically require extracting data before using it for advanced analytics. 
| +| 5 | **Independent scaling** | Lakehouses separate storage from compute, allowing each to scale independently based on actual needs, unlike many data warehouses, where they scale together. | + +### Compared to data lakes {#compared-to-data-lakes} + +| # | Benefit | Description | +|---|-----------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| 1 | **Query performance** | Lakehouses implement indexing, statistics, and data layout optimizations that enable SQL queries to run at speeds comparable to data warehouses, overcoming the poor performance of raw data lakes. | +| 2 | **Data consistency** | Through ACID transaction support, lakehouses ensure consistency during concurrent operations, solving a major limitation of traditional data lakes, where file conflicts can corrupt data. | +| 3 | **Schema management** | Lakehouses enforce schema validation and track schema evolution, preventing the "data swamp" problem common in data lakes where data becomes unusable due to schema inconsistencies. | +| 4 | **Governance capabilities** | Lakehouses provide fine-grained access control and auditing features at row/column levels, addressing the limited security controls in basic data lakes. | +| 5 | **BI Tool support** | Lakehouses offer SQL interfaces and optimizations that make them compatible with standard BI tools, unlike raw data lakes that require additional processing layers before visualization. | + +## Where does ClickHouse fit in the data lakehouse architecture? {#where-does-clickhouse-fit-in-the-data-lakehouse-architecture} + +ClickHouse is a powerful analytical query engine within the modern data lakehouse +ecosystem. It offers organizations a high-performance option for analyzing data +at scale. ClickHouse is a compelling choice due to its exceptional query speed and +efficiency. + +Within the lakehouse architecture, ClickHouse functions as a specialized +processing layer that can flexibly interact with the underlying data. It can +directly query Parquet files stored in cloud object storage systems like S3, +Azure Blob Storage, or Google Cloud Storage, leveraging its optimized columnar +processing capabilities to deliver rapid results even on massive datasets. +This direct query capability allows organizations to analyze their lake data +without complex data movement or transformation processes. + +ClickHouse integrates with open table formats such as Apache Iceberg, Delta Lake, +or Apache Hudi for more sophisticated data management needs. This integration +enables ClickHouse to take advantage of these formats' advanced features, while +still delivering the exceptional query performance it's known for. Organizations +can integrate these table formats directly or connect through metadata catalogs +like AWS Glue, Unity, or other catalog services. + +By incorporating ClickHouse as a query engine in their lakehouse architecture, +organizations can run lightning-fast analytical queries against their data lake +while maintaining the flexibility and openness that define the lakehouse approach. +This combination delivers the performance characteristics of a specialized +analytical database without sacrificing the core benefits of the lakehouse model, +including component interchangeability, open formats, and unified data management. 
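+To illustrate the query-engine role described above, here is a hedged sketch of
+querying lake data in place with ClickHouse's `s3` and `iceberg` table functions.
+The bucket, paths, credentials, and column names are placeholders chosen for
+illustration only:
+
+```sql
+-- Query Parquet files directly in object storage
+SELECT
+    toStartOfDay(event_time) AS day,
+    count() AS events
+FROM s3('https://example-bucket.s3.amazonaws.com/lake/events/*.parquet')
+GROUP BY day
+ORDER BY day;
+
+-- Query an Iceberg table in place
+SELECT count()
+FROM iceberg('https://example-bucket.s3.amazonaws.com/warehouse/events/', '<access_key>', '<secret_key>');
+```
+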
+ +## Hybrid architecture: The best of both worlds {#hybrid-architecture-the-best-of-both-worlds} + +While ClickHouse excels at querying lakehouse components, its highly optimized +storage engine offers an additional advantage. For use cases demanding ultra-low +latency queries - such as real-time dashboards, operational analytics, or +interactive user experiences - organizations can selectively store +performance-critical data directly in ClickHouse's native format. This hybrid +approach delivers the best of both worlds: the unmatched query speed of +ClickHouse's specialized storage for time-sensitive analytics and the flexibility +to query the broader data lakehouse when needed. + +This dual capability allows organizations to implement tiered data strategies +where hot, frequently accessed data resides in ClickHouse's optimized storage +for sub-second query responses, while maintaining seamless access to the complete +data history in the lakehouse. Teams can make architectural decisions based on +performance requirements rather than technical limitations, using ClickHouse as +a lightning-fast analytical database for critical workloads and a flexible query +engine for the broader data ecosystem. diff --git a/docs/cloud/onboard/01_discover/02_use_cases/04_machine_learning_and_genAI/01_overview.md b/docs/cloud/onboard/01_discover/02_use_cases/04_machine_learning_and_genAI/01_overview.md new file mode 100644 index 00000000000..77e32109dd2 --- /dev/null +++ b/docs/cloud/onboard/01_discover/02_use_cases/04_machine_learning_and_genAI/01_overview.md @@ -0,0 +1,100 @@ +--- +slug: /cloud/get-started/cloud/use-cases/AI_ML +title: 'Machine learning and generative AI' +keywords: ['use cases', 'Machine Learning', 'Generative AI'] +sidebar_label: 'Overview' +--- + + + +## The rapidly evolving data landscape for Machine Learning and Generative AI {#the-rapidly-evolving-data-landscape-for-machine-learning-and-generative-ai} + +Rapid advancements in Machine Learning and Generative AI are completely reshaping +how business and society operate, driving an ever-increasing demand for data on +an unparalleled scale. +At the time of writing, language training dataset size is growing on average 3.7x +per year, while it is projected that the largest training run will use all +public human-generated text by 2028. At the same time, users of these applications +increasingly expect real-time performance and the success of AI and ML-driven +insights, like personalized recommendations, accurate forecasting, or chatbots, +hinge on the ability to handle massive datasets in real-time. Against the backdrop +of these changes, traditional data architectures often face significant challenges +when it comes to meeting the scale and real-time requirements that modern AI/ML +workloads demand. + +## Challenges of traditional data stacks for AI/ML workloads {#challenges-of-traditional-data-stacks} + +Traditional database systems are often not designed for the massive analytical +workloads and complex queries inherent in modern ML and GenAI applications. +They frequently become bottlenecks as data volume grows and query complexity +increases, hindering the rapid processing required for AI. In addition to this, +machine learning architectures can become fragmented and challenging to handle +due to a proliferation of specialized tools and components which often leads to +higher learning curves, increased points of failure, and escalating expenses. 
+Real-time processing for ML faces significant challenges, including dealing with
+the sheer volume and velocity of incoming data, minimizing latency and response
+times, and continuously addressing issues like model drift and ensuring data
+quality. These traditional systems, designed for structured data at much smaller scales,
+often take days or weeks when faced with terabytes or petabytes of data. Not only do
+they become a performance bottleneck, but also a cost bottleneck, often relying
+on expensive, closely coupled storage that does not scale cost-effectively.
+
+## ClickHouse as a foundation for real-time AI/ML {#clickhouse-for-real-time-ai-ml}
+
+ClickHouse was designed and built from the ground up to tackle data at scale in
+real-time. As such, it is ideally positioned for handling the requirements of
+today's AI and ML applications. Several core features enable it to ingest,
+process and query datasets on the petabyte scale with real-time performance:
+
+| Feature | Description |
+|----------------------------------------|----------------------------------------------------------------|
+| **Columnar Storage** | ClickHouse utilizes a columnar storage model. This means that values from the same column are stored together on disk, which enables significantly more efficient compression and boosts query speed by allowing the system to read only the columns required for a query, drastically reducing disk I/O. This is particularly advantageous for analytical queries common in ML/GenAI that often involve aggregations or filtering on a subset of columns. |
+| **High Performance** | ClickHouse is known for its lightning-fast query processing, capable of querying billions of rows in milliseconds. It achieves this through a fully parallelized query pipeline and a vectorized query execution engine, which processes multiple rows simultaneously at the CPU level, maximizing efficiency. |
+| **Scalability** | Designed for horizontal scalability, ClickHouse allows users to add more servers (nodes) to a cluster to handle increasing data volumes and query loads, distributing data and queries across them. Performance scales linearly with the addition of each new server, enabling it to easily handle petabytes of data. |
+| **Real-time data ingestion** | It is built for continuous data ingestion, supporting high rates of inserts and merges (billions of rows per second, gigabytes per second) without disrupting ongoing queries or analytics. This capability is crucial for environments where data arrives in a constant stream, such as from IoT devices or application logs, ensuring that ML models are fueled with the most up-to-date information. |
+| **Specialized data types & functions** | In addition to standard SQL data types, syntax and functions, ClickHouse offers a host of additional specialized data types and functions suited for ML use cases.
Some of these include Array functions which natively support vector operations, distance calculations, array manipulations; Native JSON support for efficient processing of semi-structured data common to ML feature stores; Approximate algorithms like HyperLogLog, quantiles, and sampling functions for large-scale statistical analysis or numeric indexed vectors for vector aggregation and pointwise operations. | +| **Extensive integration ecosystem** | ClickHouse's extensive integration ecosystem makes it exceptionally valuable for AI/ML applications by seamlessly connecting with every critical component of the ML toolchain—from Python/pandas and Jupyter for data science workflows, to Spark and Kafka for large-scale data processing, to Airflow for pipeline orchestration, and Grafana for model monitoring—eliminating the typical friction and data movement bottlenecks that plague multi-tool ML environments. | + +## How ClickHouse helps simplify the AI/ML Data Stack {#simplify-the-ai-ml-data-stack} + +ClickHouse streamlines the traditionally fragmented AI/ML data infrastructure +by serving as a unified platform that handles multiple data management +functions within a single high-performance system. Rather than maintaining +separate specialized data stores for different ML tasks, ClickHouse provides +a consolidated foundation for analytics, machine learning workloads, and +data preparation and exploration. + +ClickHouse natively integrates with object storage like S3, GCP and Azure. It +integrates with data lakes, enabling direct querying of data in popular formats +like Iceberg, Delta Lake, and Hudi, positioning it as a comprehensive access and +computation layer for ML operations. This unified approach tackles challenges +faced in MLOps by reducing the complexity that typically stems from managing +multiple systems. + +Data fragmentation across separate stores creates many operational pain +points such as escalating costs, increased failure risks, and the need for +duplicate transformation logic between training and inference pipelines. +ClickHouse addresses these issues by consolidating all of this functionality +into a single system, particularly for feature engineering where consistency +between offline training and online serving is critical. + +Through its integration with data catalogs including Unity, AWS Glue, Polaris, +and Hive Metastore, ClickHouse minimizes data movement and duplication. This +architectural approach ensures that feature definitions remain consistent +across models and experiments, reducing the risk of discrepancies that can +undermine model performance. For MLOps teams, this +translates to less time managing infrastructure complexity and more focus on +core activities like model development and deployment, ultimately accelerating +the ML lifecycle while improving the economic viability of AI initiatives at +scale. + +## ClickHouse across the AI/ML Lifecycle {#clickhouse-across-the-ai-ml-lifecycle} + +ClickHouse's capabilities span the entire AI/ML lifecycle, providing a robust and +efficient platform from the very first stages of data preparation all the way to +model deployment and monitoring. 
+ +| Area | Description | +|----------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------| +| [Data preparation and feature engineering](/get-started/cloud/use-cases/AI_ML/feature_engineering) | Learn how ClickHouse is used in the data preparation and feature engineering stages of the AI/ML pipeline | +| [Agent-facing analytics](/cloud/get-started/cloud/use-cases/AI_ML/agent_facing_analytics) | Learn how ClickHouse enables agentic facing analytics | diff --git a/docs/cloud/onboard/01_discover/02_use_cases/04_machine_learning_and_genAI/02_data_prep_feature_engineering.md b/docs/cloud/onboard/01_discover/02_use_cases/04_machine_learning_and_genAI/02_data_prep_feature_engineering.md new file mode 100644 index 00000000000..6c1678ec2bb --- /dev/null +++ b/docs/cloud/onboard/01_discover/02_use_cases/04_machine_learning_and_genAI/02_data_prep_feature_engineering.md @@ -0,0 +1,240 @@ +--- +slug: /cloud/get-started/cloud/use-cases/AI_ML/feature_engineering +title: 'Data preparation and feature engineering' +keywords: ['use cases', 'Machine Learning', 'Generative AI'] +sidebar_label: 'Data preparation and feature engineering' +--- + +import Image from '@theme/IdealImage'; +import ml_ai_01 from '@site/static/images/cloud/onboard/discover/use_cases/ml_ai_01.png'; +import ml_ai_02 from '@site/static/images/cloud/onboard/discover/use_cases/ml_ai_02.png'; +import ml_ai_03 from '@site/static/images/cloud/onboard/discover/use_cases/ml_ai_03.png'; +import ml_ai_04 from '@site/static/images/cloud/onboard/discover/use_cases/ml_ai_04.png'; + +## Data preparation and feature engineering {#data-preparation-and-feature-engineering} + +Data preparation bridges raw data and effective machine learning or AI +models, typically consuming the majority of time in AI/ML projects and +directly determining model success. It sits between initial data collection +and model development in the lifecycle, transforming messy, inconsistent +real-world data into clean, structured formats that algorithms can +effectively learn from. `clickhouse-local`, `chDB` (an in-process version +of ClickHouse for Python), open-source ClickHouse server or ClickHouse Cloud +allow developers and data scientists to work with ever-growing amounts of +data interactively and efficiently for ad-hoc querying, data cleaning, and +feature engineering. + +### What is a feature store? {#what-is-a-feature-store} + +In its simplest form, a feature store is a centralized repository for storing +and managing feature data and acting as the source of truth. By providing +APIs that allow the storage, versioning, and retrieval of features, feature +stores aim to provide a consistent view of features for training and +inference from development to production environments. Whether a custom-built +in-house solution or off-the-shelf product, actual product-level features +provided by a feature store will vary, with some providing a complete data +platform capable of aggregating data into features and even providing a +compute engine for the training of models. + +Irrespective of how many capabilities are inherent to the feature store, all +provide abstractions to the underlying data with which data scientists and +engineers will be familiar. 
As well as delivering data as versioned +entities, features, and classes, most expose concepts of feature groups, +training sets, batching, streaming, and point-in-time queries (such as the +ability to identify the values for a feature at either a specific point, +e.g. the latest value). + +Feature store + +### Why might you use one? {#why-use-one} + +In theory, a feature store ties disparate systems and capabilities together to +form a complete ML data layer, capable of both acting as the source of truth for +training data and also being used to provide context when predictions are being +made. + +While the exact capabilities they provide vary, the objectives remain the same: + +- **improve collaboration and reusability** between data scientists and data +engineers by centralizing features and their transformation logic +- **reduce model iteration time** during both experimentation and deployment by +allowing feature re-use at both training and inference time +- **governance and compliance** through rules and versioning which can restrict +model access to sensitive data (and features) +- **improve model performance and reliability** by abstracting the complexity of +data engineering from data scientists and ensuring they work with only quality +consistent features delivered through an API. + +While these represent a very high-level overview of some of the problems a +feature store solves, the predominant benefit here is the ability to share +features across teams and utilize the same data for training and inference. + +Feature stores also address a number of other challenges present in MLOps, +such as how to backfill feature data, handle incremental updates to the +source data (to update features), or monitor new data for drift. More +recently, they have also integrated vector databases to act as the +orchestration layer for RAG pipelines or to help find similar features +using embeddings - a useful capability during some model training. + +### Components of a feature store {#components-of-a-feature-store} + +Before we explore how ClickHouse might fit into a feature store, understanding +the common components is helpful for context. Typically, a feature store will +consist of up to 4 main components: + +Components of a feature store + +- **Data source** - While this can be as simple as a CSV file, it is often a +database or data lake with files in a format like Iceberg and accessible +through a query engine. + +- **Transformation engine (optional)** - Raw data needs to be transformed into +features. In a simple case, a feature can be correlated with a column's +values. More likely, it is the result of a transformation process involving +joins, aggregations, and expressions changing the structure and/or type of +column values. Some feature stores (see Types of Feature Store) might +provide built-in capabilities to achieve this; others may offload the work +to local Python functions or, for larger datasets, the database (maybe even +using dbt under the hood) via materializations, or a processing engine such +as Spark. With ClickHouse, this is achievable through Materialized Views. +Features that are continuously subject to update often require some form of +streaming pipeline, typically implemented with tooling such as Flink or +Spark Streaming. Normally, some form of directed acyclic graph (DAG) is +required, if these transformations are chained, and dependencies need to be +tracked. 
+ +- **Offline (Training) Store** - The offline store holds the features +resulting from the previous transformation pipeline. These features are +typically grouped as entities and associated with a label (the target +prediction). Usually, models need to consume these features selectively, +either iteratively or through aggregations, potentially multiple times and +in random order. Models often require more than one feature, requiring +features to be grouped together in a "feature group" - usually by an entity +ID and time dimension. This requires the offline store to be able to deliver +the correct version of a feature and label for a specific point in time. +This "point-in-time correctness" is often fundamental to models, which need +to be trained incrementally. + +- **Online (Interference) Store** - Once a model has been trained, it can be +deployed and used for making predictions. This inference process requires +information that is only available at the moment of prediction, e.g. the +user's ID for a transaction. However, it can also require features for the +prediction, which may be precomputed, e.g. features representing historical +purchases. These are often too expensive to compute at inference time, even +for ClickHouse. These features need to be served in latency-sensitive +situations, based on the most recent version of the data, especially in +scenarios, where predictions need to be made in real-time, such as fraud +detection. Features may be materialized from the offline store to the online +store for serving. + +### Feature stores and ClickHouse {#feature-stores-and-clickhouse} + +As a real-time data warehouse, ClickHouse can fulfill the role of a number +of the components - potentially significantly simplifying the feature store +architecture. + +Feature stores and ClickHouse + +Specifically, ClickHouse can act as a: + +- **Data source** - With the ability to query or ingest data in over 70 +different file formats, including data lake formats such as Iceberg and +Delta Lake, ClickHouse makes an ideal long-term store holding or querying +data. By separating storage and compute using object storage, ClickHouse +Cloud additionally allows data to be held indefinitely - with compute scaled +down or made completely idle to minimize costs. Flexible codecs, coupled +with column-oriented storage and ordering of data on disk, maximize +compression rates, thus minimizing the required storage. Users can easily +combine ClickHouse with data lakes, with built-in functions to query data in +place on object storage. + +- **Transformation engine** - SQL provides a natural means of declaring data + transformations. When extended with ClickHouse's analytical and statistical + functions, these transformations become succinct and optimized. As well as + applying to either ClickHouse tables, in cases where ClickHouse is used as a + data store, table functions allow SQL queries to be written against data + stored in formats such as Parquet, on-disk or object storage, or even other + data stores such as Postgres and MySQL. A completely parallelization query + execution engine, combined with a column-oriented storage format, allows + ClickHouse to perform aggregations over PBs of data in seconds - unlike + transformations on in memory data frames, users are not memory-bound. + Furthermore, materialized views allow data to be transformed at insert time, + thus overloading compute to data load time from query time. 
These views can + exploit the same range of analytical and statistical functions ideal for + data analysis and summarization. Should any of ClickHouse's existing + analytical functions be insufficient or custom libraries need to be + integrated, users can also utilize User Defined Functions (UDFs). + + While users can transform data directly in ClickHouse or prior to insertion + using SQL queries, ClickHouse can also be used in programming environments + such as Python via chDB. This allows embedded ClickHouse to be exposed as a + Python module and used to transform and manipulate large data frames within + notebooks. This allows transformation work to be performed client-side by + data engineers, with results potentially materialized as feature tables in + a centralized ClickHouse instance. + +- **Offline store** - With the above capabilities to read data from multiple + sources and apply transformations via SQL, the results of these queries can + also be persisted in ClickHouse via `INSERT INTO SELECT` statements. With + transformations often grouped by an entity ID and returning a number of + columns as results, ClickHouse's schema inference can automatically detect + the required types from these results and produce an appropriate table + schema to store them. Functions for generating random numbers and + statistical sampling allow data to be efficiently iterated and scaled at + millions or rows per second for feeding to model training pipelines. + + Often, features are represented in tables with a timestamp indicating the + value for an entity and feature at a specific point in time. As described + earlier, training pipelines often need the state of features at specific + points in time and in groups. ClickHouse's sparse indices allow fast + filtering of data to satisfy point-in-time queries and feature selection + filters. While other technologies such as Spark, Redshift, and BigQuery + rely on slow stateful windowed approaches to identify the state of features + at a specific point in time, ClickHouse supports the `ASOF` (as-of-this-time) + `LEFT JOIN` query and `argMax` function. As well as simplifying syntax, this + approach is highly performant on large datasets through the use of a sort + and merge algorithm. This allows feature groups to be built quickly, + reducing data preparation time prior to training. + + ClickHouse as an offline store + +- **Online store** - As a real-time analytics database, ClickHouse can serve highly + concurrent query workloads at low latency. While this requires data to be typically + denormalized, this aligns with the storage of feature groups used at both training + and inference time. Importantly, ClickHouse is able to deliver this query + performance while being subject to high write workloads thanks to its log-structured + merge tree. These properties are required in an online store to keep features + up-to-date. Since the features are already available within the offline store, + they can easily be materialized to new tables within either the same ClickHouse + cluster or a different instance via existing capabilities, e.g., [`remoteSecure`](/sql-reference/table-functions/remote#parameters). + + :::note + For use cases requiring very high request concurrency i.e., thousands per second, + and very low latency, we recommend users still consider a dedicated data store, + e.g., Redis, designed for these workloads. + ::: + +- **Vector database** - ClickHouse has built-in support for vector embeddings + through floating point arrays. 
These can be searched and compared through + [distance functions](https://clickhouse.com/docs/en/sql-reference/functions/distance-functions#cosinedistance), + allowing ClickHouse to be used as a vector database. This linear comparison can + be easily scaled and parallelized for larger datasets. Additionally, ClickHouse + has maturing support for [Approximate Nearest Neighbour (ANN)](https://clickhouse.com/docs/en/engines/table-engines/mergetree-family/annindexes) + indices, as well as [hyperplane indexes using pure-SQL](https://clickhouse.com/blog/approximate-nearest-neighbour-ann-with-sql-powered-local-sensitive-hashing-lsh-random-projections), + as required for larger vector datasets. + +By satisfying each of the above roles, ClickHouse can dramatically simplify +the feature store architecture. Aside from the simplification of operations, +this architecture allows features to be built and deployed faster. A single +instance of ClickHouse can be scaled vertically to handle PBs of data, with +additional instances simply added for high availability. This minimizes the +movement of data between data stores, minimizing the typical network +bottlenecks. ClickHouse Cloud expands on this further by storing only a +single copy of the data in object storage and allowing nodes to be scaled +vertically or horizontally dynamically in response to load as required. + +The above architecture still requires several key components not satisfied +by ClickHouse: a streaming engine such as Kafka + Flink and a framework to +provide compute for model training. A means of hosting models is also +required. For simplicity, we assume the use of a cloud-hosted solution to +these, such as Confluent and Amazon SageMaker. diff --git a/docs/cloud/onboard/01_discover/02_use_cases/04_machine_learning_and_genAI/03_agent_facing_analytics.md b/docs/cloud/onboard/01_discover/02_use_cases/04_machine_learning_and_genAI/03_agent_facing_analytics.md new file mode 100644 index 00000000000..0cb2e68a439 --- /dev/null +++ b/docs/cloud/onboard/01_discover/02_use_cases/04_machine_learning_and_genAI/03_agent_facing_analytics.md @@ -0,0 +1,168 @@ +--- +slug: /cloud/get-started/cloud/use-cases/AI_ML/agent_facing_analytics +title: 'Agent facing analytics' +keywords: ['use cases', 'Machine Learning', 'Generative AI', 'agent facing analytics', 'agents'] +sidebar_label: 'Agent facing analytics' +--- + +import Image from '@theme/IdealImage'; +import ml_ai_05 from '@site/static/images/cloud/onboard/discover/use_cases/ml_ai_05.png'; +import ml_ai_06 from '@site/static/images/cloud/onboard/discover/use_cases/ml_ai_06.png'; +import ml_ai_07 from '@site/static/images/cloud/onboard/discover/use_cases/ml_ai_07.png'; +import ml_ai_08 from '@site/static/images/cloud/onboard/discover/use_cases/ml_ai_08.png'; +import ml_ai_09 from '@site/static/images/cloud/onboard/discover/use_cases/ml_ai_09.png'; + +## Agent-facing analytics concepts {#agent-facing-analytics} + +### What are "agents"? {#agents} + +One can think of AI agents as digital assistants that have evolved beyond +simple task execution (or function calling): they can understand context, +make decisions, and take meaningful actions toward specific goals. They +operate in a "sense-think-act" loop (see ReAct agents), processing various +inputs (text, media, data), analyzing situations, and then doing something +useful with that information. Most importantly, depending on the application +domain, they can theoretically operate at various levels of autonomy, +requiring or not human supervision. 
+ +The game changer here has been the advent of Large Language Models (LLMs). +While we had the notion of AI agents for quite a while, LLMs like the GPT +series have given them a massive upgrade in their ability to "understand" +and communicate. It's as if they've suddenly become more fluent in "human" +aka. able to grasp requests and respond with relevant contextual information +drawn from the model's training. + +### AI agents superpowers: “Tools” {#tools} + +These agents really shine through their access to “tools”. Tools enhance AI agents +by giving them abilities to perform tasks. Rather than just being conversational +interfaces, they can now get things done whether it’s crunching numbers, searching +for information, or managing customer communications. Think of it as the difference +between having someone who can describe how to solve a problem and someone who +can actually solve it. + +For example, ChatGPT is now shipped by default with a search tool. This +integration with search providers allows the model to pull current information +from the web during conversations. This means it can fact-check responses, access +recent events and data, and provide up-to-date information rather than relying +solely on its training data. + +Agents equipped with tools + +Tools can also be used to simplify the implementation of Retrieval-Augmented +Generation (RAG) pipelines. Instead of relying only on what an AI model +learned during training, RAG lets the model pull in relevant information +before formulating a response. Here's an example: Using an AI assistant to +help with customer support (e.g. Salesforce AgentForce, ServiceNow AI +Agents). Without RAG, it would only use its general training to answer +questions. But with RAG, when a customer asks about the latest product +feature, the system retrieves the most recent documentation, release notes, +and historical support tickets before crafting its response. This means that +answers are now grounded in the latest information available to the AI +model. + +### Reasoning models {#reasoning-models} + +Another development in the AI space, and perhaps one of the most +interesting, is the emergence of reasoning models. Systems like OpenAI o1, +Anthropic Claude, or DeepSeek-R1 take a more methodical approach by +introducing a "thinking" step before responding to a prompt. Instead of +generating the answer straightaway, reasoning models use prompting +techniques like Chain-of-Thought (CoT) to analyze problems from multiple +angles, break them down into steps, and use the tools available to them to +gather contextual information when needed. + +This represents a shift toward more capable systems that can handle more +complex tasks through a combination of reasoning and practical tools. One of +the latest examples in this area is the introduction of OpenAI's deep +research, an agent that can autonomously conduct complex multi-step research +tasks online. It processes and synthesizes information from various sources, +including text, images, and PDFs, to generate comprehensive reports within five +to thirty minutes, a task that would traditionally take a human several hours. + +Reasoning models + +## Real-time analytics for AI agents {#real-time-analytics-for-ai-agents} + +Let's take the case of an agentic AI assistant with access to a +real-time analytics database containing the company's CRM data. When a user asks +about the latest (up-to-the-minute) sales trends, the AI assistant queries the +connected data source. 
It iteratively analyzes the data to identify meaningful +patterns and trends, such as month-over-month growth, seasonal variations, or +emerging product categories. Finally, it generates a natural language response +explaining key findings, often with supporting visualizations. When the main +interface is chat-based, as in this case, performance matters since these +iterative explorations trigger a series of queries that can scan large amounts of +data to extract relevant insights. + +Some properties make real-time databases especially suitable for such +workloads. For example, real-time analytics databases are designed to work +with near real-time data, allowing them to process and deliver insights +almost immediately as new data arrives. This is crucial for AI agents, as +they can require up-to-date information to make (or help make) timely and +relevant decisions. + +The core analytical capabilities are also important. Real-time analytics +databases shine in performing complex aggregations and pattern detection +across large datasets. Unlike operational databases, which focus primarily on +raw data storage or retrieval, these systems are optimized for analyzing +vast amounts of information. This makes them particularly well-suited for AI +agents that need to uncover trends, detect anomalies, and derive actionable +insights. + +Real-time analytics databases are also expected to deliver fast +performance for interactive querying, essential for chat-based interaction +and high-frequency explorative workloads. They ensure consistent performance +even with large data volumes and high query concurrency, enabling responsive +dialogues and a smoother user experience. + +Finally, real-time analytics databases often serve as the ultimate "data +sinks", effectively consolidating valuable domain-specific data in a single +location. By co-locating essential data from different sources and formats in +one place, these databases ensure that AI agents have access to a +unified view of the domain information, decoupled from operational systems. + +Classic real-time analytics + +Agent real-time analytics + +These properties already empower real-time databases to play a vital role +in serving AI data retrieval use cases at scale (e.g. OpenAI's acquisition +of Rockset). They can also enable AI agents to provide fast data-driven +responses while offloading the heavy computational work. + +This positions the real-time analytics database as a preferred "context +provider" for AI agents when it comes to insights. + +## AI agents as an emerging user persona {#ai-agents-as-an-emerging-user-persona} + +A useful way to think about AI agents leveraging real-time analytics databases +is to perceive them as a new category of users, or, in product manager speak, +a user persona. + +Agents as an emerging user persona + +From the database perspective, we can expect a potentially unlimited number of +AI agents, concurrently running large numbers of queries on behalf of users +or autonomously, to perform investigations, iteratively refine research and insights, +and execute tasks. + +Over the years, real-time databases have had time to adapt to interactive human +users, directly connected to the system or via a middleware +application layer. Classic persona examples include database administrators, +business analysts, data scientists, and software developers building applications +on top of the database.
The industry has progressively learned their usage +patterns and requirements and, organically, provided the interfaces, the operators, +the UIs, the formats, the clients, and the performance to satisfy their various +use cases. + +The question now becomes: are we ready to accommodate AI agents' workloads? +What specific features do we need to re-think or create from scratch for these +usage patterns? + +ClickHouse is rapidly answering some of these questions through a host +of features aimed at delivering a feature-complete AI experience. + +## ClickHouse.ai {#clickhouse-ai} + +For more information about features coming soon to ClickHouse Cloud, see [ClickHouse.ai](https://clickhouse.com/clickhouse-ai/). diff --git a/docs/cloud/onboard/01_discover/02_use_cases/04_machine_learning_and_genAI/_category_.json b/docs/cloud/onboard/01_discover/02_use_cases/04_machine_learning_and_genAI/_category_.json new file mode 100644 index 00000000000..7b4415fff32 --- /dev/null +++ b/docs/cloud/onboard/01_discover/02_use_cases/04_machine_learning_and_genAI/_category_.json @@ -0,0 +1,6 @@ +{ + "position": 2.5, + "label": "Machine Learning and GenAI", + "collapsible": true, + "collapsed": true +} \ No newline at end of file diff --git a/docs/cloud/onboard/01_discover/02_use_cases/_category_.json b/docs/cloud/onboard/01_discover/02_use_cases/_category_.json new file mode 100644 index 00000000000..c066b9c4fdc --- /dev/null +++ b/docs/cloud/onboard/01_discover/02_use_cases/_category_.json @@ -0,0 +1,6 @@ +{ + "position": 2.5, + "label": "Use cases", + "collapsible": true, + "collapsed": true +} \ No newline at end of file diff --git a/docs/cloud/manage/cloud-tiers.md b/docs/cloud/onboard/01_discover/04_cloud-tiers.md similarity index 100% rename from docs/cloud/manage/cloud-tiers.md rename to docs/cloud/onboard/01_discover/04_cloud-tiers.md diff --git a/docs/integrations/migration/overview.md b/docs/cloud/onboard/02_migrate/01_migration_guides/01_overview.md similarity index 100% rename from docs/integrations/migration/overview.md rename to docs/cloud/onboard/02_migrate/01_migration_guides/01_overview.md diff --git a/docs/migrations/postgres/overview.md b/docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/01_overview.md similarity index 97% rename from docs/migrations/postgres/overview.md rename to docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/01_overview.md index ca1d195b914..b8be25dcc58 100644 --- a/docs/migrations/postgres/overview.md +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/01_overview.md @@ -1,10 +1,13 @@ --- slug: /migrations/postgresql/overview -title: 'Migrating from PostgreSQL to ClickHouse' +title: 'Comparing PostgreSQL and ClickHouse' description: 'A guide to migrating from PostgreSQL to ClickHouse' keywords: ['postgres', 'postgresql', 'migrate', 'migration'] +sidebar_label: 'Overview' --- + +# Comparing ClickHouse and PostgreSQL + ## Why use ClickHouse over Postgres? {#why-use-clickhouse-over-postgres} TLDR: Because ClickHouse is designed for fast analytics, specifically `GROUP BY` queries, as an OLAP database whereas Postgres is an OLTP database designed for transactional workloads.
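To make that contrast concrete, the workload ClickHouse targets is the wide aggregation scan. The sketch below uses a hypothetical `payments` table and columns that are not part of this guide; it only illustrates the shape of a typical OLAP query.

```sql
-- A typical analytical query: scan many rows, read few columns, aggregate.
-- ClickHouse reads only the three referenced columns and aggregates them with
-- vectorized execution, whereas a row-oriented OLTP store such as Postgres
-- must fetch entire rows to answer the same question.
SELECT
    merchant_id,
    count() AS payments,
    sum(amount) AS total_amount
FROM payments
WHERE created_at >= now() - INTERVAL 30 DAY
GROUP BY merchant_id
ORDER BY total_amount DESC
LIMIT 10;
```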
diff --git a/docs/migrations/postgres/appendix.md b/docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/appendix.md similarity index 100% rename from docs/migrations/postgres/appendix.md rename to docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/appendix.md diff --git a/docs/migrations/postgres/index.md b/docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/index.md similarity index 100% rename from docs/migrations/postgres/index.md rename to docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/index.md diff --git a/docs/migrations/postgres/dataset.md b/docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/migration_guide/01_migration_guide_part1.md similarity index 99% rename from docs/migrations/postgres/dataset.md rename to docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/migration_guide/01_migration_guide_part1.md index 2574252e1da..fc97c8a76dc 100644 --- a/docs/migrations/postgres/dataset.md +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/migration_guide/01_migration_guide_part1.md @@ -4,6 +4,7 @@ title: 'Migrating data' description: 'Dataset example to migrate from PostgreSQL to ClickHouse' keywords: ['Postgres'] show_related_blogs: true +sidebar_label: 'Part 1' --- import postgres_stackoverflow_schema from '@site/static/images/migrations/postgres-stackoverflow-schema.png'; diff --git a/docs/migrations/postgres/rewriting-queries.md b/docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/migration_guide/02_migration_guide_part2.md similarity index 99% rename from docs/migrations/postgres/rewriting-queries.md rename to docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/migration_guide/02_migration_guide_part2.md index 451d1b37d9a..a77b38ed5e5 100644 --- a/docs/migrations/postgres/rewriting-queries.md +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/migration_guide/02_migration_guide_part2.md @@ -3,6 +3,7 @@ slug: /migrations/postgresql/rewriting-queries title: 'Rewriting PostgreSQL Queries' keywords: ['postgres', 'postgresql', 'rewriting queries'] description: 'Part 2 of a guide on migrating from PostgreSQL to ClickHouse' +sidebar_label: 'Part 2' --- > This is **Part 2** of a guide on migrating from PostgreSQL to ClickHouse. Using a practical example, it demonstrates how to efficiently carry out the migration with a real-time replication (CDC) approach. Many of the concepts covered are also applicable to manual bulk data transfers from PostgreSQL to ClickHouse. 
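Much of the rewriting work is mechanical, since most standard SQL runs on ClickHouse largely unchanged. A hedged illustration of the kind of adjustment involved; the `posts` table and `creation_date` column here are assumptions for this sketch rather than the guide's actual dataset.

```sql
-- PostgreSQL:
--   SELECT date_trunc('month', creation_date) AS month, count(*) AS posts
--   FROM posts GROUP BY month ORDER BY month;
--
-- ClickHouse should accept the statement above largely as-is (date_trunc and
-- count(*) are supported); a more idiomatic spelling uses toStartOfMonth:
SELECT
    toStartOfMonth(creation_date) AS month,
    count() AS posts
FROM posts
GROUP BY month
ORDER BY month;
```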
diff --git a/docs/migrations/postgres/data-modeling-techniques.md b/docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/migration_guide/03_migration_guide_part3.md similarity index 99% rename from docs/migrations/postgres/data-modeling-techniques.md rename to docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/migration_guide/03_migration_guide_part3.md index f864bd8fb3e..db4468289d8 100644 --- a/docs/migrations/postgres/data-modeling-techniques.md +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/migration_guide/03_migration_guide_part3.md @@ -1,9 +1,10 @@ --- slug: /migrations/postgresql/data-modeling-techniques title: 'Data modeling techniques' -description: 'Data modeling for migrating from PostgreSQL to ClickHouse' +description: 'Part 3 of a guide on migrating from PostgreSQL to ClickHouse' keywords: ['postgres', 'postgresql'] show_related_blogs: true +sidebar_label: 'Part 3' --- import postgres_b_tree from '@site/static/images/migrations/postgres-b-tree.png'; diff --git a/docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/migration_guide/_category_.json b/docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/migration_guide/_category_.json new file mode 100644 index 00000000000..ad514aeb890 --- /dev/null +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/migration_guide/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Migration guide", + "collapsible": true, + "collapsed": true, +} \ No newline at end of file diff --git a/docs/migrations/bigquery/equivalent-concepts.md b/docs/cloud/onboard/02_migrate/01_migration_guides/03_bigquery/01_overview.md similarity index 98% rename from docs/migrations/bigquery/equivalent-concepts.md rename to docs/cloud/onboard/02_migrate/01_migration_guides/03_bigquery/01_overview.md index ee330a0610c..729112ee81e 100644 --- a/docs/migrations/bigquery/equivalent-concepts.md +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/03_bigquery/01_overview.md @@ -4,12 +4,13 @@ slug: /migrations/bigquery/biquery-vs-clickhouse-cloud description: 'How BigQuery differs from ClickHouse Cloud' keywords: ['BigQuery'] show_related_blogs: true +sidebar_label: 'Overview' --- import bigquery_1 from '@site/static/images/migrations/bigquery-1.png'; import Image from '@theme/IdealImage'; -# BigQuery vs ClickHouse Cloud: equivalent and different concepts +# Comparing ClickHouse Cloud and BigQuery ## Resource organization {#resource-organization} @@ -21,7 +22,7 @@ The way resources are organized in ClickHouse Cloud is similar to [BigQuery's re Similar to BigQuery, organizations are the root nodes in the ClickHouse cloud resource hierarchy. The first user you set up in your ClickHouse Cloud account is automatically assigned to an organization owned by the user. The user may invite additional users to the organization. -### BigQuery projects vs ClickHouse Cloud services {#bigquery-projects-vs-clickhouse-cloud-services} +### BigQuery Projects vs ClickHouse Cloud Services {#bigquery-projects-vs-clickhouse-cloud-services} Within organizations, you can create services loosely equivalent to BigQuery projects because stored data in ClickHouse Cloud is associated with a service. There are [several service types available](/cloud/manage/cloud-tiers) in ClickHouse Cloud. Each ClickHouse Cloud service is deployed in a specific region and includes: @@ -29,15 +30,15 @@ Within organizations, you can create services loosely equivalent to BigQuery pro 2. An object storage folder where the service stores all the data. 3. 
An endpoint (or multiple endpoints created via ClickHouse Cloud UI console) - a service URL that you use to connect to the service (for example, `https://dv2fzne24g.us-east-1.aws.clickhouse.cloud:8443`) -### BigQuery datasets vs ClickHouse Cloud databases {#bigquery-datasets-vs-clickhouse-cloud-databases} +### BigQuery Datasets vs ClickHouse Cloud Databases {#bigquery-datasets-vs-clickhouse-cloud-databases} ClickHouse logically groups tables into databases. Like BigQuery datasets, ClickHouse databases are logical containers that organize and control access to table data. -### BigQuery folders {#bigquery-folders} +### BigQuery Folders {#bigquery-folders} ClickHouse Cloud currently has no concept equivalent to BigQuery folders. -### BigQuery slot reservations and quotas {#bigquery-slot-reservations-and-quotas} +### BigQuery Slot reservations and Quotas {#bigquery-slot-reservations-and-quotas} Like BigQuery slot reservations, you can [configure vertical and horizontal autoscaling](/manage/scaling#configuring-vertical-auto-scaling) in ClickHouse Cloud. For vertical autoscaling, you can set the minimum and maximum size for the memory and CPU cores of the compute nodes for a service. The service will then scale as needed within those bounds. These settings are also available during the initial service creation flow. Each compute node in the service has the same size. You can change the number of compute nodes within a service with [horizontal scaling](/manage/scaling#manual-horizontal-scaling). @@ -78,7 +79,7 @@ When presented with multiple options for ClickHouse types, consider the actual r ## Query acceleration techniques {#query-acceleration-techniques} -### Primary and foreign keys and primary index {#primary-and-foreign-keys-and-primary-index} +### Primary and Foreign keys and Primary index {#primary-and-foreign-keys-and-primary-index} In BigQuery, a table can have [primary key and foreign key constraints](https://cloud.google.com/bigquery/docs/information-schema-table-constraints). Typically, primary and foreign keys are used in relational databases to ensure data integrity. A primary key value is normally unique for each row and is not `NULL`. Each foreign key value in a row must be present in the primary key column of the primary key table or be `NULL`. In BigQuery, these constraints are not enforced, but the query optimizer may use this information to optimize queries better. 
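For contrast, a "primary key" in ClickHouse is not a constraint at all: it defines the sort order and the sparse primary index used to skip data at query time. A minimal sketch with illustrative table and column names (not drawn from the BigQuery example above):

```sql
-- ORDER BY defines the sorting key and, by default, the sparse primary index.
-- It does not enforce uniqueness or referential integrity (duplicate rows are
-- accepted), but it lets queries that filter on a key prefix skip most of the
-- table's granules.
CREATE TABLE orders
(
    customer_id UInt64,
    order_date  Date,
    order_id    UInt64,
    amount      Decimal(18, 2)
)
ENGINE = MergeTree
ORDER BY (customer_id, order_date);

-- Served by the primary index: only granules that can contain customer_id = 42
-- within the date range are read.
SELECT count(), sum(amount)
FROM orders
WHERE customer_id = 42 AND order_date >= '2024-01-01';
```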
diff --git a/docs/migrations/bigquery/migrating-to-clickhouse-cloud.md b/docs/cloud/onboard/02_migrate/01_migration_guides/03_bigquery/02_migrating-to-clickhouse-cloud.md similarity index 99% rename from docs/migrations/bigquery/migrating-to-clickhouse-cloud.md rename to docs/cloud/onboard/02_migrate/01_migration_guides/03_bigquery/02_migrating-to-clickhouse-cloud.md index 44f8c8c7d20..0118a912fec 100644 --- a/docs/migrations/bigquery/migrating-to-clickhouse-cloud.md +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/03_bigquery/02_migrating-to-clickhouse-cloud.md @@ -4,6 +4,7 @@ slug: /migrations/bigquery/migrating-to-clickhouse-cloud description: 'How to migrate your data from BigQuery to ClickHouse Cloud' keywords: ['BigQuery'] show_related_blogs: true +sidebar_label: 'Migration guide' --- import bigquery_2 from '@site/static/images/migrations/bigquery-2.png'; diff --git a/docs/migrations/bigquery/loading-data.md b/docs/cloud/onboard/02_migrate/01_migration_guides/03_bigquery/03_loading-data.md similarity index 96% rename from docs/migrations/bigquery/loading-data.md rename to docs/cloud/onboard/02_migrate/01_migration_guides/03_bigquery/03_loading-data.md index 8e2558fe073..0bfdff8b2eb 100644 --- a/docs/migrations/bigquery/loading-data.md +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/03_bigquery/03_loading-data.md @@ -24,7 +24,9 @@ Exporting data from BigQuery to ClickHouse is dependent on the size of your data | [contracts](https://github.com/ClickHouse/examples/blob/main/ethereum/schemas/contracts.md) | 57,225,837 | 350 | 45.35GB | 16 sec | 1 hr 51 min | 39.4 secs | | Total | 8.26 billion | 23,577 | 3.982TB | 8 min 3 sec | \> 6 days 5 hrs | 53 mins 45 secs | -## 1. Export table data to GCS {#1-export-table-data-to-gcs} + + +## Export table data to GCS {#1-export-table-data-to-gcs} In this step, we utilize the [BigQuery SQL workspace](https://cloud.google.com/bigquery/docs/bigquery-web-ui) to execute our SQL commands. Below, we export a BigQuery table named `mytable` to a GCS bucket using the [`EXPORT DATA`](https://cloud.google.com/bigquery/docs/reference/standard-sql/other-statements) statement. @@ -60,7 +62,7 @@ This approach has a number of advantages: - Exports produce multiple files automatically, limiting each to a maximum of 1GB of table data. This is beneficial to ClickHouse since it allows imports to be parallelized. - Parquet, as a column-oriented format, represents a better interchange format since it is inherently compressed and faster for BigQuery to export and ClickHouse to query -## 2. Importing data into ClickHouse from GCS {#2-importing-data-into-clickhouse-from-gcs} +## Importing data into ClickHouse from GCS {#2-importing-data-into-clickhouse-from-gcs} Once the export is complete, we can import this data into a ClickHouse table. You can use the [ClickHouse SQL console](/integrations/sql-clients/sql-console) or [`clickhouse-client`](/interfaces/cli) to execute the commands below. @@ -111,7 +113,7 @@ In the above query, we use the [`ifNull` function](/sql-reference/functions/func Alternatively, you can `SET input_format_null_as_default=1` and any missing or NULL values will be replaced by default values for their respective columns, if those defaults are specified. ::: -## 3. 
Testing successful data export {#3-testing-successful-data-export} +## Testing successful data export {#3-testing-successful-data-export} To test whether your data was properly inserted, simply run a `SELECT` query on your new table: @@ -121,6 +123,8 @@ SELECT * FROM mytable LIMIT 10; To export more BigQuery tables, simply redo the steps above for each additional table. + + ## Further reading and support {#further-reading-and-support} In addition to this guide, we also recommend reading our blog post that shows [how to use ClickHouse to speed up BigQuery and how to handle incremental imports](https://clickhouse.com/blog/clickhouse-bigquery-migrating-data-for-realtime-queries). diff --git a/docs/cloud/onboard/02_migrate/01_migration_guides/03_bigquery/_04_sql_translation_reference.md b/docs/cloud/onboard/02_migrate/01_migration_guides/03_bigquery/_04_sql_translation_reference.md new file mode 100644 index 00000000000..e69de29bb2d diff --git a/docs/migrations/bigquery/index.md b/docs/cloud/onboard/02_migrate/01_migration_guides/03_bigquery/index.md similarity index 100% rename from docs/migrations/bigquery/index.md rename to docs/cloud/onboard/02_migrate/01_migration_guides/03_bigquery/index.md diff --git a/docs/cloud/onboard/02_migrate/01_migration_guides/04_snowflake/01_overview.md b/docs/cloud/onboard/02_migrate/01_migration_guides/04_snowflake/01_overview.md new file mode 100644 index 00000000000..980cfed6061 --- /dev/null +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/04_snowflake/01_overview.md @@ -0,0 +1,184 @@ +--- +sidebar_label: 'Overview' +slug: /migrations/snowflake-overview +description: 'Migrating from Snowflake to ClickHouse' +keywords: ['Snowflake'] +title: 'Migrate from Snowflake to ClickHouse' +show_related_blogs: true +--- + +import snowflake_architecture from '@site/static/images/cloud/onboard/discover/use_cases/snowflake_architecture.png'; +import cloud_architecture from '@site/static/images/cloud/onboard/discover/use_cases/cloud_architecture.png'; +import Image from '@theme/IdealImage'; + +# Snowflake to ClickHouse migration + +> This document provides an introduction to migrating data from Snowflake to ClickHouse. + +Snowflake is a cloud data warehouse primarily focused on migrating legacy on-premise +data warehousing workloads to the cloud. It is well-optimized for executing +long-running reports at scale. As datasets migrate to the cloud, data owners start +thinking about how else they can extract value from this data, including using +these datasets to power real-time applications for internal and external use cases. +When this happens, they often realize they need a database optimized for +powering real-time analytics, like ClickHouse. + +## Comparison {#comparison} + +In this section, we'll compare the key features of ClickHouse and Snowflake. + +### Similarities {#similarities} + +Snowflake is a cloud-based data warehousing platform that provides a scalable +and efficient solution for storing, processing, and analyzing large amounts of +data. +Like ClickHouse, Snowflake is not built on existing technologies but relies +on its own SQL query engine and custom architecture. + +Snowflake’s architecture is described as a hybrid between a shared-storage (shared-disk) +architecture and a shared-nothing architecture. A shared-storage architecture is +one where data is both accessible from all compute nodes using object +stores such as S3. 
A shared-nothing architecture is one where each compute node +stores a portion of the entire data set locally to respond to queries. This, in +theory, delivers the best of both models: the simplicity of a shared-disk +architecture and the scalability of a shared-nothing architecture. + +This design fundamentally relies on object storage as the primary storage medium, +which scales almost infinitely under concurrent access while providing high +resilience and scalable throughput guarantees. + +The image below from [docs.snowflake.com](https://docs.snowflake.com/en/user-guide/intro-key-concepts) +shows this architecture: + +Snowflake architecture + +Conversely, as an open-source and cloud-hosted product, ClickHouse can be deployed +in both shared-disk and shared-nothing architectures. The latter is typical for +self-managed deployments. While allowing for CPU and memory to be easily scaled, +shared-nothing configurations introduce classic data management challenges and +overhead of data replication, especially during membership changes. + +For this reason, ClickHouse Cloud utilizes a shared-storage architecture that is +conceptually similar to Snowflake. Data is stored once in an object store +(single copy), such as S3 or GCS, providing virtually infinite storage with +strong redundancy guarantees. Each node has access to this single copy of the +data as well as its own local SSDs for cache purposes. Nodes can, in turn, be +scaled to provide additional CPU and memory resources as required. Like Snowflake, +S3’s scalability properties address the classic limitation of shared-disk +architectures (disk I/O and network bottlenecks) by ensuring the I/O throughput +available to current nodes in a cluster is not impacted as additional nodes are +added. + +ClickHouse Cloud architecture + +### Differences {#differences} + +Aside from the underlying storage formats and query engines, these architectures +differ in a few subtle ways: + +* Compute resources in Snowflake are provided through a concept of [warehouses](https://docs.snowflake.com/en/user-guide/warehouses). + These consist of a number of nodes, each of a set size. While Snowflake + doesn't publish the specific architecture of their warehouses, it is + [generally understood](https://select.dev/posts/snowflake-warehouse-sizing) + that each node consists of 8 vCPUs, 16GiB, and 200GB of local storage (for cache). + The number of nodes depends on a t-shirt size, e.g. an x-small has one node, + a small 2, medium 4, large 8, etc. These warehouses are independent of the data + and can be used to query any database residing on object storage. When idle + and not subjected to query load, warehouses are paused - resuming when a query + is received. While storage costs are always reflected in billing, warehouses + are only charged when active. + +* ClickHouse Cloud utilizes a similar principle of nodes with local cache + storage. Rather than t-shirt sizes, users deploy a service with a total + amount of compute and available RAM. This, in turn, transparently + auto-scales (within defined limits) based on the query load - either + vertically by increasing (or decreasing) the resources for each node or + horizontally by raising/lowering the total number of nodes. ClickHouse + Cloud nodes currently have a 1 CPU-to-memory ratio, unlike Snowflake's 1. + While a looser coupling is possible, services are currently coupled to the + data, unlike Snowflake warehouses. Nodes will also pause if idle and + resume if subjected to queries. 
Users can also manually resize services if + needed. + +* ClickHouse Cloud's query cache is currently node specific, unlike + Snowflake's, which is delivered at a service layer independent of the + warehouse. Based on benchmarks, ClickHouse Cloud's node cache outperforms + Snowflake's. + +* Snowflake and ClickHouse Cloud take different approaches to scaling to + increase query concurrency. Snowflake addresses this through a feature + known as [multi-cluster warehouses](https://docs.snowflake.com/en/user-guide/warehouses-multicluster#benefits-of-multi-cluster-warehouses). + This feature allows users to add clusters to a warehouse. While this offers no + improvement to query latency, it does provide additional parallelization and + allows higher query concurrency. ClickHouse achieves this by adding more memory + and CPU to a service through vertical or horizontal scaling. We do not explore the + capabilities of these services to scale to higher concurrency in this blog, + focusing instead on latency, but acknowledge that this work should be done + for a complete comparison. However, we would expect ClickHouse to perform + well in any concurrency test, with Snowflake explicitly limiting the number + of concurrent queries allowed for a [warehouse to 8 by default](https://docs.snowflake.com/en/sql-reference/parameters#max-concurrency-level). + In comparison, ClickHouse Cloud allows up to 1000 queries to be executed per + node. + +* Snowflake's ability to switch compute size on a dataset, coupled with fast + resume times for warehouses, makes it an excellent experience for ad hoc + querying. For data warehouse and data lake use cases, this provides an + advantage over other systems. + +### Real-time analytics {#real-time-analytics} + +Based on public [benchmark](https://benchmark.clickhouse.com/#system=+%E2%98%81w|%EF%B8%8Fr|C%20c|nfe&type=-&machine=-ca2|gl|6ax|6ale|3al&cluster_size=-&opensource=-&tuned=+n&metric=hot&queries=-) data, +ClickHouse outperforms Snowflake for real-time analytics applications in the following areas: + +* **Query latency**: Snowflake queries have a higher query latency even + when clustering is applied to tables to optimize performance. In our + testing, Snowflake requires over twice the compute to achieve equivalent + ClickHouse performance on queries where a filter is applied that is part + of the Snowflake clustering key or ClickHouse primary key. While + Snowflake's [persistent query cache](https://docs.snowflake.com/en/user-guide/querying-persisted-results) + offsets some of these latency challenges, this is ineffective in cases + where the filter criteria are more diverse. This query cache effectiveness + can be further impacted by changes to the underlying data, with cache + entries invalidated when the table changes. While this is not the case in + the benchmark for our application, a real deployment would require the new, + more recent data to be inserted. Note that ClickHouse's query cache is + node specific and not [transactionally consistent](https://clickhouse.com/blog/introduction-to-the-clickhouse-query-cache-and-design), + making it [better suited ](https://clickhouse.com/blog/introduction-to-the-clickhouse-query-cache-and-design) + to real-time analytics. 
Users also have granular control over its use + with the ability to control its use on a [per-query basis](/operations/settings/settings#use-query-cache), + its [precise size](/operations/settings/settings#query-cache-max-size-in-bytes), + whether a [query is cached](/operations/settings/settings#enable-writes-to-query-cache) + (limits on duration or required number of executions), and whether it is + only [passively used](https://clickhouse.com/blog/introduction-to-the-clickhouse-query-cache-and-design#using-logs-and-settings). + +* **Lower cost**: Snowflake warehouses can be configured to suspend after + a period of query inactivity. Once suspended, charges are not incurred. + Practically, this inactivity check can [only be lowered to 60s](https://docs.snowflake.com/en/sql-reference/sql/alter-warehouse). + Warehouses will automatically resume, within several seconds, once a query + is received. With Snowflake only charging for resources when a warehouse + is under use, this behavior caters to workloads that often sit idle, like + ad-hoc querying. + + However, many real-time analytics workloads require ongoing real-time data + ingestion and frequent querying that doesn't benefit from idling (like + customer-facing dashboards). This means warehouses must often be fully + active and incurring charges. This negates the cost-benefit of idling as + well as any performance advantage that may be associated with Snowflake's + ability to resume a responsive state faster than alternatives. This active + state requirement, when combined with ClickHouse Cloud's lower per-second + cost for an active state, results in ClickHouse Cloud offering a + significantly lower total cost for these kinds of workloads. + +* **Predictable pricing of features:** Features such as materialized views + and clustering (equivalent to ClickHouse's ORDER BY) are required to reach + the highest levels of performance in real-time analytics use cases. These + features incur additional charges in Snowflake, requiring not only a + higher tier, which increases costs per credit by 1.5x, but also + unpredictable background costs. For instance, materialized views incur a + background maintenance cost, as does clustering, which is hard to predict + prior to use. In contrast, these features incur no additional cost in + ClickHouse Cloud, except additional CPU and memory usage at insert time, + typically negligible outside of high insert workload use cases. We have + observed in our benchmark that these differences, along with lower query + latencies and higher compression, result in significantly lower costs with + ClickHouse. 
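To illustrate the last point: the ClickHouse counterparts of clustering and materialized views are declared in the schema itself and maintained as part of normal inserts. The sketch below uses hypothetical table and column names and only shows the shape of the feature, not a recommended schema.

```sql
-- ORDER BY plays the role of Snowflake clustering: data is stored sorted and
-- the sparse primary index prunes granules for filters on (site_id, event_time).
CREATE TABLE events
(
    site_id    UInt32,
    event_time DateTime,
    user_id    UInt64,
    revenue    Float64
)
ENGINE = MergeTree
ORDER BY (site_id, event_time);

-- An incremental materialized view is maintained at insert time using the
-- insert's own CPU and memory; there is no separately billed background service.
CREATE MATERIALIZED VIEW events_per_day_mv
ENGINE = SummingMergeTree
ORDER BY (site_id, day)
AS
SELECT
    site_id,
    toDate(event_time) AS day,
    count() AS events,
    sum(revenue) AS revenue
FROM events
GROUP BY site_id, day;
```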
diff --git a/docs/migrations/snowflake.md b/docs/cloud/onboard/02_migrate/01_migration_guides/04_snowflake/02_migration_guide.md similarity index 77% rename from docs/migrations/snowflake.md rename to docs/cloud/onboard/02_migrate/01_migration_guides/04_snowflake/02_migration_guide.md index 38d3b8dfac1..468a8b6193b 100644 --- a/docs/migrations/snowflake.md +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/04_snowflake/02_migration_guide.md @@ -1,23 +1,27 @@ --- -sidebar_label: 'Snowflake' -sidebar_position: 20 +sidebar_label: 'Migration guide' slug: /migrations/snowflake description: 'Migrating from Snowflake to ClickHouse' keywords: ['Snowflake'] title: 'Migrating from Snowflake to ClickHouse' -show_related_blogs: true +show_related_blogs: false --- import migrate_snowflake_clickhouse from '@site/static/images/migrations/migrate_snowflake_clickhouse.png'; import Image from '@theme/IdealImage'; -# Migrating from Snowflake to ClickHouse +# Migrate from Snowflake to ClickHouse -This guide shows how to migrate data from Snowflake to ClickHouse. +> This guide shows you how to migrate data from Snowflake to ClickHouse. -Migrating data between Snowflake and ClickHouse requires the use of an object store, such as S3, as an intermediate storage for transfer. The migration process also relies on using the commands `COPY INTO` from Snowflake and `INSERT INTO SELECT` of ClickHouse. +Migrating data between Snowflake and ClickHouse requires the use of an object store, +such as S3, as an intermediate storage for transfer. The migration process also +relies on using the commands `COPY INTO` from Snowflake and `INSERT INTO SELECT` +of ClickHouse. -## 1. Exporting data from Snowflake {#1-exporting-data-from-snowflake} + + +## Export data from Snowflake {#1-exporting-data-from-snowflake} Migrating from Snowflake to ClickHouse @@ -54,7 +58,7 @@ COPY INTO @external_stage/mydataset from mydataset max_file_size=157286400 heade For a dataset around 5TB of data with a maximum file size of 150MB, and using a 2X-Large Snowflake warehouse located in the same AWS `us-east-1` region, copying data to the S3 bucket will take around 30 minutes. -## 2. Importing to ClickHouse {#2-importing-to-clickhouse} +## Import to ClickHouse {#2-importing-to-clickhouse} Once the data is staged in intermediary object storage, ClickHouse functions such as the [s3 table function](/sql-reference/table-functions/s3) can be used to insert the data into a table, as shown below. 
@@ -65,10 +69,10 @@ Assuming the following table target schema: ```sql CREATE TABLE default.mydataset ( - `timestamp` DateTime64(6), - `some_text` String, - `some_file` Tuple(filename String, version String), - `complex_data` Tuple(name String, description String), + `timestamp` DateTime64(6), + `some_text` String, + `some_file` Tuple(filename String, version String), + `complex_data` Tuple(name String, description String), ) ENGINE = MergeTree ORDER BY (timestamp) @@ -79,16 +83,16 @@ We can then use the `INSERT INTO SELECT` command to insert the data from S3 into ```sql INSERT INTO mydataset SELECT - timestamp, - some_text, - JSONExtract( - ifNull(some_file, '{}'), - 'Tuple(filename String, version String)' - ) AS some_file, - JSONExtract( - ifNull(complex_data, '{}'), - 'Tuple(filename String, description String)' - ) AS complex_data, + timestamp, + some_text, + JSONExtract( + ifNull(some_file, '{}'), + 'Tuple(filename String, version String)' + ) AS some_file, + JSONExtract( + ifNull(complex_data, '{}'), + 'Tuple(filename String, description String)' + ) AS complex_data, FROM s3('https://mybucket.s3.amazonaws.com/mydataset/mydataset*.parquet') SETTINGS input_format_null_as_default = 1, -- Ensure columns are inserted as default if values are null input_format_parquet_case_insensitive_column_matching = 1 -- Column matching between source data and target table should be case insensitive @@ -100,10 +104,12 @@ The `VARIANT` and `OBJECT` columns in the original Snowflake table schema will b Nested structures such as `some_file` are converted to JSON strings on copy by Snowflake. Importing this data requires us to transform these structures to Tuples at insert time in ClickHouse, using the [JSONExtract function](/sql-reference/functions/json-functions#jsonextract) as shown above. ::: -## 3. Testing successful data export {#3-testing-successful-data-export} +## Test successful data export {#3-testing-successful-data-export} To test whether your data was properly inserted, simply run a `SELECT` query on your new table: ```sql SELECT * FROM mydataset LIMIT 10; ``` + + \ No newline at end of file diff --git a/docs/cloud/onboard/02_migrate/01_migration_guides/04_snowflake/03_sql_translation_reference.md b/docs/cloud/onboard/02_migrate/01_migration_guides/04_snowflake/03_sql_translation_reference.md new file mode 100644 index 00000000000..a5f41b52605 --- /dev/null +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/04_snowflake/03_sql_translation_reference.md @@ -0,0 +1,114 @@ +--- +sidebar_label: 'SQL translation reference' +slug: /migrations/snowflake-translation-reference +description: 'SQL translation reference' +keywords: ['Snowflake'] +title: 'Migrating from Snowflake to ClickHouse' +show_related_blogs: true +--- + +# Snowflake SQL translation guide + +## Data types {#data-types} + +### Numerics {#numerics} + +Users moving data between ClickHouse and Snowflake will immediately notice that +ClickHouse offers more granular precision concerning declaring numerics. For example, +Snowflake offers the type Number for numerics. This requires the user to specify a +precision (total number of digits) and scale (digits to the right of the decimal place) +up to a total of 38. Integer declarations are synonymous with Number, and simply +define a fixed precision and scale where the range is the same. 
This convenience +is possible as modifying the precision (scale is 0 for integers) does not impact the +size of data on disk in Snowflake - the minimal required bytes are used for a +numeric range at write time at a micro partition level. The scale does, however, +impact storage space and is offset with compression. A `Float64` type offers a +wider range of values with a loss of precision. + +Contrast this with ClickHouse, which offers multiple signed and unsigned +precision for floats and integers. With these, ClickHouse users can be explicit about +the precision required for integers to optimize storage and memory overhead. A +Decimal type, equivalent to Snowflake’s Number type, also offers twice the +precision and scale at 76 digits. In addition to a similar `Float64` value, +ClickHouse also provides a `Float32` for when precision is less critical and +compression paramount. + +### Strings {#strings} + +ClickHouse and Snowflake take contrasting approaches to the storage of string +data. The `VARCHAR` in Snowflake holds Unicode characters in UTF-8, allowing the +user to specify a maximum length. This length has no impact on storage or +performance, with the minimum number of bytes always used to store a string, and +rather provides only constraints useful for downstream tooling. Other types, such +as `Text` and `NChar`, are simply aliases for this type. ClickHouse conversely +stores all [string data as raw bytes](/sql-reference/data-types/string) with a `String` +type (no length specification required), deferring encoding to the user, with +[query time functions](/sql-reference/functions/string-functions#lengthutf8) +available for different encodings. We refer the reader to ["Opaque data argument"](https://utf8everywhere.org/#cookie) +for the motivation as to why. The ClickHouse `String` is thus more comparable +to the Snowflake Binary type in its implementation. Both [Snowflake](https://docs.snowflake.com/en/sql-reference/collation) +and [ClickHouse](/sql-reference/statements/select/order-by#collation-support) +support “collation”, allowing users to override how strings are sorted and compared. + +### Semi-structured types {#semi-structured-data} + +Snowflake supports the `VARIANT`, `OBJECT` and `ARRAY` types for semi-structured +data. + +ClickHouse offers the equivalent [`Variant`](/sql-reference/data-types/variant), +[`Object`](/sql-reference/data-types/object-data-type) (deprecated) and [`Array`](/sql-reference/data-types/array) +types. Additionally, ClickHouse has the [`JSON`](/sql-reference/data-types/newjson) +type which replaces the now deprecated `Object('json')` type and is particularly +performant and storage efficient in [comparison to other native JSON types](https://jsonbench.com/). + +ClickHouse also supports named [`Tuple`s](/sql-reference/data-types/tuple) and arrays of Tuples +via the [`Nested`](/sql-reference/data-types/nested-data-structures/nested) type, +allowing users to explicitly map nested structures. This allows codecs and type +optimizations to be applied throughout the hierarchy, unlike Snowflake, which +requires the user to use the `OBJECT`, `VARIANT`, and `ARRAY` types for the outer +object and does not allow [explicit internal typing](https://docs.snowflake.com/en/sql-reference/data-types-semistructured#characteristics-of-an-object). +This internal typing also simplifies queries on nested numerics in ClickHouse, +which do not need to be cast and can be used in index definitions. 
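As a sketch of what this explicit internal typing looks like in practice (the table and field names below are invented for illustration):

```sql
-- Every leaf field gets a concrete type, so codecs can be applied per field
-- and nested numerics can be filtered and aggregated without casting.
CREATE TABLE sessions
(
    id      UInt64,
    device  Tuple(name String, os String),
    events  Nested(code UInt16, duration_ms UInt32)
)
ENGINE = MergeTree
ORDER BY id;

-- events.duration_ms is an Array(UInt32): no cast is needed to aggregate it.
SELECT avg(arrayAvg(events.duration_ms)) AS avg_event_duration_ms
FROM sessions
WHERE has(events.code, 500);
```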
+ +In ClickHouse, codecs and optimized types can also be applied to substructures. +This provides an added benefit that compression with nested structures remains +excellent, and comparable, to flattened data. In contrast, as a result of the +inability to apply specific types to substructures, Snowflake recommends [flattening +data to achieve optimal compression](https://docs.snowflake.com/en/user-guide/semistructured-considerations#storing-semi-structured-data-in-a-variant-column-vs-flattening-the-nested-structure). +Snowflake also [imposes size restrictions](https://docs.snowflake.com/en/user-guide/semistructured-considerations#data-size-limitations) +for these data types. + +### Type reference {#type-reference} + +| Snowflake | ClickHouse | Note | +|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| [`NUMBER`](https://docs.snowflake.com/en/sql-reference/data-types-numeric) | [`Decimal`](/sql-reference/data-types/decimal) | ClickHouse supports twice the precision and scale than Snowflake - 76 digits vs. 38. | +| [`FLOAT`, `FLOAT4`, `FLOAT8`](https://docs.snowflake.com/en/sql-reference/data-types-numeric#data-types-for-floating-point-numbers) | [`Float32`, `Float64`](/sql-reference/data-types/float) | All floats in Snowflake are 64 bit. | +| [`VARCHAR`](https://docs.snowflake.com/en/sql-reference/data-types-text#varchar) | [`String`](/sql-reference/data-types/string) | | +| [`BINARY`](https://docs.snowflake.com/en/sql-reference/data-types-text#binary) | [`String`](/sql-reference/data-types/string) | | +| [`BOOLEAN`](https://docs.snowflake.com/en/sql-reference/data-types-logical) | [`Bool`](/sql-reference/data-types/boolean) | | +| [`DATE`](https://docs.snowflake.com/en/sql-reference/data-types-datetime#date) | [`Date`](/sql-reference/data-types/date), [`Date32`](/sql-reference/data-types/date32) | `DATE` in Snowflake offers a wider date range than ClickHouse e.g. min for `Date32` is `1900-01-01` and `Date` `1970-01-01`. `Date` in ClickHouse provides more cost efficient (two byte) storage. | +| [`TIME(N)`](https://docs.snowflake.com/en/sql-reference/data-types-datetime#time) | No direct equivalent but can be represented by [`DateTime`](/sql-reference/data-types/datetime) and [`DateTime64(N)`](/sql-reference/data-types/datetime64). | `DateTime64` uses the same concepts of precision. 
| +| [`TIMESTAMP`](https://docs.snowflake.com/en/sql-reference/data-types-datetime#timestamp) - [`TIMESTAMP_LTZ`](https://docs.snowflake.com/en/sql-reference/data-types-datetime#timestamp-ltz-timestamp-ntz-timestamp-tz), [`TIMESTAMP_NTZ`](https://docs.snowflake.com/en/sql-reference/data-types-datetime#timestamp-ltz-timestamp-ntz-timestamp-tz), [`TIMESTAMP_TZ`](https://docs.snowflake.com/en/sql-reference/data-types-datetime#timestamp-ltz-timestamp-ntz-timestamp-tz) | [`DateTime`](/sql-reference/data-types/datetime) and [`DateTime64`](/sql-reference/data-types/datetime64) | `DateTime` and `DateTime64` can optionally have a TZ parameter defined for the column. If not present, the server's timezone is used. Additionally a `--use_client_time_zone` parameter is available for the client. | +| [`VARIANT`](https://docs.snowflake.com/en/sql-reference/data-types-semistructured#variant) | [`JSON`, `Tuple`, `Nested`](/integrations/data-formats/json) | `JSON` type is experimental in ClickHouse. This type infers the column types at insert time. `Tuple`, `Nested` and `Array` can also be used to build explicitly type structures as an alternative. | +| [`OBJECT`](https://docs.snowflake.com/en/sql-reference/data-types-semistructured#object) | [`Tuple`, `Map`, `JSON`](/integrations/data-formats/json) | Both `OBJECT` and `Map` are analogous to `JSON` type in ClickHouse where the keys are a `String`. ClickHouse requires the value to be consistent and strongly typed whereas Snowflake uses `VARIANT`. This means the values of different keys can be a different type. If this is required in ClickHouse, explicitly define the hierarchy using `Tuple` or rely on `JSON` type. | +| [`ARRAY`](https://docs.snowflake.com/en/sql-reference/data-types-semistructured#array) | [`Array`](/sql-reference/data-types/array), [`Nested`](/sql-reference/data-types/nested-data-structures/nested) | `ARRAY` in Snowflake uses `VARIANT` for the elements - a super type. Conversely these are strongly typed in ClickHouse. | +| [`GEOGRAPHY`](https://docs.snowflake.com/en/sql-reference/data-types-geospatial#geography-data-type) | [`Point`, `Ring`, `Polygon`, `MultiPolygon`](/sql-reference/data-types/geo) | Snowflake imposes a coordinate system (WGS 84) while ClickHouse applies at query time. | +| [`GEOMETRY`](https://docs.snowflake.com/en/sql-reference/data-types-geospatial#geometry-data-type) | [`Point`, `Ring`, `Polygon`, `MultiPolygon`](/sql-reference/data-types/geo) | | | + +| ClickHouse Type | Description | +|-------------------|-----------------------------------------------------------------------------------------------------| +| `IPv4` and `IPv6` | IP-specific types, potentially allowing more efficient storage than Snowflake. | +| `FixedString` | Allows a fixed length of bytes to be used, which is useful for hashes. | +| `LowCardinality` | Allows any type to be dictionary encoded. Useful for when the cardinality is expected to be < 100k. | +| `Enum` | Allows efficient encoding of named values in either 8 or 16-bit ranges. | +| `UUID` | For efficient storage of UUIDs. | +| `Array(Float32)` | Vectors can be represented as an Array of Float32 with supported distance functions. | + +Finally, ClickHouse offers the unique ability to store the intermediate +[state of aggregate functions](/sql-reference/data-types/aggregatefunction). This +state is implementation-specific, but allows the result of an aggregation to be +stored and later queried (with corresponding merge functions). 
Typically, this +feature is used via a materialized view and offers the +ability to improve performance of specific queries with minimal storage cost by +storing the incremental result of queries over inserted data. diff --git a/docs/cloud/onboard/02_migrate/01_migration_guides/04_snowflake/_category_.json b/docs/cloud/onboard/02_migrate/01_migration_guides/04_snowflake/_category_.json new file mode 100644 index 00000000000..50b05cb45a0 --- /dev/null +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/04_snowflake/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Snowflake", + "collapsible": true, + "collapsed": true +} \ No newline at end of file diff --git a/docs/cloud/onboard/02_migrate/01_migration_guides/05_elastic/01_overview.md b/docs/cloud/onboard/02_migrate/01_migration_guides/05_elastic/01_overview.md new file mode 100644 index 00000000000..5b6a7476b7c --- /dev/null +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/05_elastic/01_overview.md @@ -0,0 +1,12 @@ +--- +sidebar_label: 'Overview' +slug: /migrations/elastic-overview +description: 'Migrating from Elasticsearch to ClickHouse' +keywords: ['Elasticsearch'] +title: 'Migrate from Elasticsearch to ClickHouse' +show_related_blogs: true +--- + +# Elasticsearch to ClickHouse migration + +This document provides an introduction to migrating data from Elasticsearch to ClickHouse. diff --git a/docs/cloud/onboard/02_migrate/01_migration_guides/05_elastic/_category_.json b/docs/cloud/onboard/02_migrate/01_migration_guides/05_elastic/_category_.json new file mode 100644 index 00000000000..4f49621cf3d --- /dev/null +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/05_elastic/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Elasticsearch", + "collapsible": true, + "collapsed": true +} \ No newline at end of file diff --git a/docs/cloud/onboard/02_migrate/01_migration_guides/06_redshift/01_overview.md b/docs/cloud/onboard/02_migrate/01_migration_guides/06_redshift/01_overview.md new file mode 100644 index 00000000000..785eba5d98a --- /dev/null +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/06_redshift/01_overview.md @@ -0,0 +1,57 @@ +--- +sidebar_label: 'Overview' +slug: /migrations/redshift-overview +description: 'Migrating from Amazon Redshift to ClickHouse' +keywords: ['Redshift'] +title: 'Comparing ClickHouse Cloud and Amazon Redshift' +--- + +# Amazon Redshift to ClickHouse migration + +> This document provides an introduction to migrating data from Amazon +Redshift to ClickHouse. + +## Introduction {#introduction} + +Amazon Redshift is a cloud data warehouse that provides reporting and +analytics capabilities for structured and semi-structured data. It was +designed to handle analytical workloads on big data sets using +column-oriented database principles similar to ClickHouse. As part of the +AWS offering, it is often the default solution AWS users turn to for their +analytical data needs. + +While attractive to existing AWS users due to its tight integration with the +Amazon ecosystem, Redshift users who adopt it to power real-time analytics +applications find themselves in need of a more optimized solution for this +purpose. As a result, they increasingly turn to ClickHouse to benefit from +superior query performance and data compression, either as a replacement or +a "speed layer" deployed alongside existing Redshift workloads.
+ +## ClickHouse vs Redshift {#clickhouse-vs-redshift} + +For users heavily invested in the AWS ecosystem, Redshift represents a +natural choice when faced with data warehousing needs. Redshift differs from +ClickHouse in this important aspect: it optimizes its engine for data +warehousing workloads requiring complex reporting and analytical queries. +Across all deployment modes, the following two limitations make it difficult +to use Redshift for real-time analytical workloads: +* Redshift [compiles code for each query execution plan](https://docs.aws.amazon.com/redshift/latest/dg/c-query-performance.html), +which adds significant overhead to first-time query execution. This overhead can +be justified when query patterns are predictable and compiled execution plans +can be stored in a query cache. However, this introduces challenges for interactive +applications with variable queries. Even when Redshift is able to exploit this +code compilation cache, ClickHouse is faster on most queries. See ["ClickBench"](https://benchmark.clickhouse.com/#system=+%E2%98%81w|%EF%B8%8Fr|C%20c|Rf&type=-&machine=-ca2|gl|6ax|6ale|3al&cluster_size=-&opensource=-&tuned=+n&metric=hot&queries=-). +* Redshift [limits concurrency to 50 across all queues](https://docs.aws.amazon.com/redshift/latest/dg/c_workload_mngmt_classification.html), +which (while adequate for BI) makes it inappropriate for highly concurrent +analytical applications. + +Conversely, while ClickHouse can also be utilized for complex analytical queries, +it is optimized for real-time analytical workloads, either powering applications +or acting as a warehouse acceleration layer. As a result, Redshift users typically +replace or augment Redshift with ClickHouse for the following reasons: + +| Advantage | Description | +|------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| **Lower query latencies** | ClickHouse achieves lower query latencies, including for varied query patterns, under high concurrency and while subjected to streaming inserts. Even when your query misses a cache, which is inevitable in interactive user-facing analytics, ClickHouse can still process it fast. | +| **Higher concurrent query limits** | ClickHouse places much higher limits on concurrent queries, which is vital for real-time application experiences. In ClickHouse, self-managed as well as cloud, you can scale up your compute allocation to achieve the concurrency your application needs for each service. The level of permitted query concurrency is configurable in ClickHouse, with ClickHouse Cloud defaulting to a value of 1000. | +| **Superior data compression** | ClickHouse offers superior data compression, which allows users to reduce their total storage (and thus cost) or persist more data at the same cost and derive more real-time insights from their data. See "ClickHouse vs Redshift Storage Efficiency" below.
| diff --git a/docs/cloud/onboard/02_migrate/01_migration_guides/06_redshift/02_migration_guide.md b/docs/cloud/onboard/02_migrate/01_migration_guides/06_redshift/02_migration_guide.md new file mode 100644 index 00000000000..506c9957e58 --- /dev/null +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/06_redshift/02_migration_guide.md @@ -0,0 +1,13 @@ +--- +sidebar_label: 'Migration guide' +slug: /migrations/redshift/migration-guide +description: 'Migrating from Amazon Redshift to ClickHouse' +keywords: ['Redshift'] +title: 'Amazon Redshift to ClickHouse migration guide' +--- + +import MigrationGuide from '@site/docs/integrations/data-ingestion/redshift/_snippets/_migration_guide.md' + +# Amazon Redshift to ClickHouse migration guide + +<MigrationGuide /> \ No newline at end of file diff --git a/docs/cloud/onboard/02_migrate/01_migration_guides/06_redshift/03_sql_translation_reference.md b/docs/cloud/onboard/02_migrate/01_migration_guides/06_redshift/03_sql_translation_reference.md new file mode 100644 index 00000000000..67585e4ea72 --- /dev/null +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/06_redshift/03_sql_translation_reference.md @@ -0,0 +1,95 @@ +--- +sidebar_label: 'SQL translation reference' +slug: /migrations/redshift/sql-translation-reference +description: 'SQL translation reference for Amazon Redshift to ClickHouse' +keywords: ['Redshift'] +title: 'Amazon Redshift SQL translation guide' +--- + +# Amazon Redshift SQL translation guide + +## Data types {#data-types} + +Users moving data between ClickHouse and Redshift will immediately notice +that ClickHouse offers a more extensive range of types, which are also less +restrictive. While Redshift requires users to specify possible string +lengths, even if variable, ClickHouse removes this restriction and burden +from the user by storing strings without encoding as bytes. The ClickHouse +String type thus has no limits or length specification requirements. + +Furthermore, users can exploit Arrays, Tuples, and Enums, which are absent from +Redshift as first-class citizens (although Arrays/Structs can be imitated +with `SUPER`) and whose absence is a common frustration for users. ClickHouse additionally +allows the persistence, either at query time or even in a table, of +aggregation states. This enables data to be pre-aggregated, typically +using a materialized view, and can dramatically improve query performance +for common queries.
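+
+For example, a minimal sketch of this pattern, using an illustrative `events` table and a daily
+rollup (the table, columns, and aggregate functions below are assumptions chosen for the example,
+not a prescribed schema):
+
+```sql
+-- Source table receiving raw inserts (illustrative schema)
+CREATE TABLE events
+(
+    ts DateTime,
+    user_id UInt64,
+    revenue Float64
+)
+ENGINE = MergeTree
+ORDER BY ts;
+
+-- Target table persisting partial aggregation states per day
+CREATE TABLE events_daily
+(
+    day Date,
+    users AggregateFunction(uniq, UInt64),
+    revenue AggregateFunction(sum, Float64)
+)
+ENGINE = AggregatingMergeTree
+ORDER BY day;
+
+-- Materialized view that computes the states incrementally on each insert
+CREATE MATERIALIZED VIEW events_daily_mv TO events_daily AS
+SELECT
+    toDate(ts) AS day,
+    uniqState(user_id) AS users,
+    sumState(revenue) AS revenue
+FROM events
+GROUP BY day;
+
+-- At query time, merge the stored states
+SELECT
+    day,
+    uniqMerge(users) AS unique_users,
+    sumMerge(revenue) AS total_revenue
+FROM events_daily
+GROUP BY day
+ORDER BY day;
+```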
+ +Below we map the equivalent ClickHouse type for each Redshift type: + +| Redshift | ClickHouse | +|------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| [`SMALLINT`](https://docs.aws.amazon.com/redshift/latest/dg/r_Numeric_types201.html#r_Numeric_types201-integer-types) | [`Int8`](/sql-reference/data-types/int-uint) * | +| [`INTEGER`](https://docs.aws.amazon.com/redshift/latest/dg/r_Numeric_types201.html#r_Numeric_types201-integer-types) | [`Int32`](/sql-reference/data-types/int-uint) * | +| [`BIGINT`](https://docs.aws.amazon.com/redshift/latest/dg/r_Numeric_types201.html#r_Numeric_types201-integer-types) | [`Int64`](/sql-reference/data-types/int-uint) * | +| [`DECIMAL`](https://docs.aws.amazon.com/redshift/latest/dg/r_Numeric_types201.html#r_Numeric_types201-decimal-or-numeric-type) | [`UInt128`, `UInt256`, `Int128`, `Int256`](/sql-reference/data-types/int-uint), [`Decimal(P, S)`, `Decimal32(S)`, `Decimal64(S)`, `Decimal128(S)`, `Decimal256(S)`](/sql-reference/data-types/decimal) - (high precision and ranges possible) | +| [`REAL`](https://docs.aws.amazon.com/redshift/latest/dg/r_Numeric_types201.html#r_Numeric_types201-floating-point-types) | [`Float32`](/sql-reference/data-types/float) | +| [`DOUBLE PRECISION`](https://docs.aws.amazon.com/redshift/latest/dg/r_Numeric_types201.html#r_Numeric_types201-floating-point-types) | [`Float64`](/sql-reference/data-types/float) | +| [`BOOLEAN`](https://docs.aws.amazon.com/redshift/latest/dg/r_Boolean_type.html) | [`Bool`](/sql-reference/data-types/boolean) | +| [`CHAR`](https://docs.aws.amazon.com/redshift/latest/dg/r_Character_types.html#r_Character_types-char-or-character) | [`String`](/sql-reference/data-types/string), [`FixedString`](/sql-reference/data-types/fixedstring) | +| [`VARCHAR`](https://docs.aws.amazon.com/redshift/latest/dg/r_Character_types.html#r_Character_types-varchar-or-character-varying) ** | [`String`](/sql-reference/data-types/string) | +| [`DATE`](https://docs.aws.amazon.com/redshift/latest/dg/r_Datetime_types.html#r_Datetime_types-date) | [`Date32`](/sql-reference/data-types/date32) | +| [`TIMESTAMP`](https://docs.aws.amazon.com/redshift/latest/dg/r_Datetime_types.html#r_Datetime_types-timestamp) | [`DateTime`](/sql-reference/data-types/datetime), [`DateTime64`](/sql-reference/data-types/datetime64) | +| [`TIMESTAMPTZ`](https://docs.aws.amazon.com/redshift/latest/dg/r_Datetime_types.html#r_Datetime_types-timestamptz) | [`DateTime`](/sql-reference/data-types/datetime), [`DateTime64`](/sql-reference/data-types/datetime64) | +| [`GEOMETRY`](https://docs.aws.amazon.com/redshift/latest/dg/geospatial-overview.html) | [Geo Data Types](/sql-reference/data-types/geo) | +| [`GEOGRAPHY`](https://docs.aws.amazon.com/redshift/latest/dg/geospatial-overview.html) | [Geo Data Types](/sql-reference/data-types/geo) (less developed e.g. 
no coordinate systems - can be emulated [with functions](/sql-reference/functions/geo/)) | +| [`HLLSKETCH`](https://docs.aws.amazon.com/redshift/latest/dg/r_HLLSKTECH_type.html) | [`AggregateFunction(uniqHLL12, X)`](/sql-reference/data-types/aggregatefunction) | +| [`SUPER`](https://docs.aws.amazon.com/redshift/latest/dg/r_SUPER_type.html) | [`Tuple`](/sql-reference/data-types/tuple), [`Nested`](/sql-reference/data-types/nested-data-structures/nested), [`Array`](/sql-reference/data-types/array), [`JSON`](/sql-reference/data-types/newjson), [`Map`](/sql-reference/data-types/map) | +| [`TIME`](https://docs.aws.amazon.com/redshift/latest/dg/r_Datetime_types.html#r_Datetime_types-time) | [`DateTime`](/sql-reference/data-types/datetime), [`DateTime64`](/sql-reference/data-types/datetime64) | +| [`TIMETZ`](https://docs.aws.amazon.com/redshift/latest/dg/r_Datetime_types.html#r_Datetime_types-timetz) | [`DateTime`](/sql-reference/data-types/datetime), [`DateTime64`](/sql-reference/data-types/datetime64) | +| [`VARBYTE`](https://docs.aws.amazon.com/redshift/latest/dg/r_VARBYTE_type.html) ** | [`String`](/sql-reference/data-types/string) combined with [`Bit`](/sql-reference/functions/bit-functions) and [Encoding](/sql-reference/functions/encoding-functions/#hex) functions | + +* ClickHouse additionally supports unsigned integers with extended ranges i.e. `UInt8`, `UInt16`, `UInt32` and `UInt64`.
+**ClickHouse’s String type is unlimited by default but can be constrained to specific lengths using Constraints. + +## DDL syntax {#ddl-syntax} + +### Sorting keys {#sorting-keys} + +Both ClickHouse and Redshift have the concept of a “sorting key”, which defines +how data is sorted when being stored. Redshift defines the sorting key using the +`SORTKEY` clause: + +```sql +CREATE TABLE some_table(...) SORTKEY (column1, column2) +``` + +Comparatively, ClickHouse uses an `ORDER BY` clause to specify the sort order: + +```sql +CREATE TABLE some_table(...) ENGINE = MergeTree ORDER BY (column1, column2) +``` + +In most cases, you can use the same sorting key columns and order in ClickHouse +as Redshift, assuming you are using the default `COMPOUND` type. When data is +added to Redshift, you should run the `VACUUM` and `ANALYZE` commands to re-sort +newly added data and update the statistics for the query planner - otherwise, the +unsorted space grows. No such process is required for ClickHouse. + +Redshift supports a couple of convenience features for sorting keys. The first is +automatic sorting keys (using `SORTKEY AUTO`). While this may be appropriate for +getting started, explicit sorting keys ensure the best performance and storage +efficiency when the sorting key is optimal. The second is the `INTERLEAVED` sort key, +which gives equal weight to a subset of columns in the sort key to improve +performance when a query uses one or more secondary sort columns. ClickHouse +supports explicit [projections](/data-modeling/projections), which achieve the +same end-result with a slightly different setup (see the sketch at the end of this section). + +Users should be aware that the “primary key” concept represents different things +in ClickHouse and Redshift. In Redshift, the primary key resembles the traditional +RDBMS concept intended to enforce constraints. However, it is not strictly +enforced in Redshift and instead acts as a hint for the query planner and data +distribution among nodes. In ClickHouse, the primary key denotes the columns used +to construct the sparse primary index, which is used to ensure the data is ordered on +disk, maximizing compression while avoiding pollution of the primary index and +wasted memory.
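+
+As a rough illustration of the projection alternative mentioned above (the table and
+column names are hypothetical, chosen only to show the syntax):
+
+```sql
+-- Base table ordered by user_id for user-centric queries
+CREATE TABLE page_views
+(
+    event_time DateTime,
+    user_id UInt64,
+    url String
+)
+ENGINE = MergeTree
+ORDER BY (user_id, event_time);
+
+-- Projection that stores the same rows re-sorted by url,
+-- serving queries that filter on the secondary column
+ALTER TABLE page_views
+    ADD PROJECTION by_url
+    (
+        SELECT *
+        ORDER BY url
+    );
+
+-- Build the projection for data that already exists in the table
+ALTER TABLE page_views MATERIALIZE PROJECTION by_url;
+```
+
+Queries filtering on `url` can then be served from the projection automatically,
+without maintaining a separate table.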
diff --git a/docs/cloud/onboard/02_migrate/01_migration_guides/06_redshift/_category_.json b/docs/cloud/onboard/02_migrate/01_migration_guides/06_redshift/_category_.json new file mode 100644 index 00000000000..95419dcb41c --- /dev/null +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/06_redshift/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Redshift", + "collapsible": true, + "collapsed": true, +} \ No newline at end of file diff --git a/docs/integrations/migration/clickhouse-to-cloud.md b/docs/cloud/onboard/02_migrate/01_migration_guides/07_other_methods/01_clickhouse-to-cloud.md similarity index 99% rename from docs/integrations/migration/clickhouse-to-cloud.md rename to docs/cloud/onboard/02_migrate/01_migration_guides/07_other_methods/01_clickhouse-to-cloud.md index 551314651e2..08ffe526dd7 100644 --- a/docs/integrations/migration/clickhouse-to-cloud.md +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/07_other_methods/01_clickhouse-to-cloud.md @@ -1,6 +1,5 @@ --- -sidebar_position: 10 -sidebar_label: 'ClickHouse to ClickHouse Cloud' +sidebar_label: 'ClickHouse OSS' slug: /cloud/migration/clickhouse-to-cloud title: 'Migrating between self-managed ClickHouse and ClickHouse Cloud' description: 'Page describing how to migrate between self-managed ClickHouse and ClickHouse Cloud' diff --git a/docs/integrations/migration/clickhouse-local-etl.md b/docs/cloud/onboard/02_migrate/01_migration_guides/07_other_methods/02_clickhouse-local-etl.md similarity index 99% rename from docs/integrations/migration/clickhouse-local-etl.md rename to docs/cloud/onboard/02_migrate/01_migration_guides/07_other_methods/02_clickhouse-local-etl.md index 2faf0a935d7..5e3eabc70c9 100644 --- a/docs/integrations/migration/clickhouse-local-etl.md +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/07_other_methods/02_clickhouse-local-etl.md @@ -1,6 +1,5 @@ --- sidebar_label: 'Using clickhouse-local' -sidebar_position: 20 keywords: ['clickhouse', 'migrate', 'migration', 'migrating', 'data', 'etl', 'elt', 'clickhouse-local', 'clickhouse-client'] slug: /cloud/migration/clickhouse-local title: 'Migrating to ClickHouse using clickhouse-local' diff --git a/docs/integrations/migration/etl-tool-to-clickhouse.md b/docs/cloud/onboard/02_migrate/01_migration_guides/07_other_methods/03_etl-tool-to-clickhouse.md similarity index 98% rename from docs/integrations/migration/etl-tool-to-clickhouse.md rename to docs/cloud/onboard/02_migrate/01_migration_guides/07_other_methods/03_etl-tool-to-clickhouse.md index f66e6ff2c47..32a0c168c5a 100644 --- a/docs/integrations/migration/etl-tool-to-clickhouse.md +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/07_other_methods/03_etl-tool-to-clickhouse.md @@ -1,6 +1,5 @@ --- sidebar_label: 'Using a 3rd-party ETL Tool' -sidebar_position: 20 keywords: ['clickhouse', 'migrate', 'migration', 'migrating', 'data', 'etl', 'elt', 'clickhouse-local', 'clickhouse-client'] slug: /cloud/migration/etl-tool-to-clickhouse title: 'Using a 3rd-party ETL Tool' diff --git a/docs/integrations/migration/object-storage-to-clickhouse.md b/docs/cloud/onboard/02_migrate/01_migration_guides/07_other_methods/04_object-storage-to-clickhouse.md similarity index 97% rename from docs/integrations/migration/object-storage-to-clickhouse.md rename to docs/cloud/onboard/02_migrate/01_migration_guides/07_other_methods/04_object-storage-to-clickhouse.md index 2f323db04ef..5638fb48571 100644 --- a/docs/integrations/migration/object-storage-to-clickhouse.md +++ 
b/docs/cloud/onboard/02_migrate/01_migration_guides/07_other_methods/04_object-storage-to-clickhouse.md @@ -1,5 +1,5 @@ --- -title: 'Object Storage to ClickHouse Cloud' +title: 'Using object storage' description: 'Moving data from object storage to ClickHouse Cloud' keywords: ['object storage', 's3', 'azure blob', 'gcs', 'migration'] slug: /integrations/migration/object-storage-to-clickhouse diff --git a/docs/cloud/onboard/02_migrate/01_migration_guides/07_other_methods/_category_.json b/docs/cloud/onboard/02_migrate/01_migration_guides/07_other_methods/_category_.json new file mode 100644 index 00000000000..61c592ce8a0 --- /dev/null +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/07_other_methods/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Other...", + "collapsible": true, + "collapsed": true, +} \ No newline at end of file diff --git a/docs/cloud/onboard/02_migrate/01_migration_guides/_category_.json b/docs/cloud/onboard/02_migrate/01_migration_guides/_category_.json new file mode 100644 index 00000000000..aca0c529bce --- /dev/null +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Migration guides", + "collapsible": true, + "collapsed": true, +} \ No newline at end of file diff --git a/docs/cloud/onboard/03_tune/_snippets/_monitoring_table_of_contents.md b/docs/cloud/onboard/03_tune/_snippets/_monitoring_table_of_contents.md new file mode 100644 index 00000000000..e5d813d8226 --- /dev/null +++ b/docs/cloud/onboard/03_tune/_snippets/_monitoring_table_of_contents.md @@ -0,0 +1,3 @@ +| Page | Description | +|------|-------------| +| | | diff --git a/docs/cloud/onboard/03_tune/resource_tour.md b/docs/cloud/onboard/03_tune/resource_tour.md new file mode 100644 index 00000000000..2a4a23c3e64 --- /dev/null +++ b/docs/cloud/onboard/03_tune/resource_tour.md @@ -0,0 +1,54 @@ +--- +slug: /cloud/get-started/cloud/resource-tour +title: 'Resource tour' +keywords: ['clickhouse cloud'] +hide_title: true +--- + +import TableOfContentsBestPractices from '@site/docs/best-practices/_snippets/_table_of_contents.md'; +import TableOfContentsOptimizationAndPerformance from '@site/docs/guides/best-practices/_snippets/_performance_optimizations_table_of_contents.md'; +import TableOfContentsSecurity from '@site/docs/cloud/_snippets/_security_table_of_contents.md'; + +# Resource tour + +This article is intended to provide you with an overview of the resources available +to you in the docs to learn how to get the most out of your ClickHouse Cloud deployment. 
+Explore resource organised by the following topics: + +- [Query optimization techniques and performance tuning](#query-optimization) +- [Scaling strategies and resource management](#scaling) +- [Monitoring](#monitoring) +- [Security best practices and compliance features](#security) +- [Cost optimization and billing](#cost-optimization) +- Troubleshooting common issues (coming soon) +- Production readiness checklist (coming soon) + +Before diving into more specific topics, we recommend you start with our general +ClickHouse best practice guides which cover general best practices to follow when +using ClickHouse: + + + +## Query optimization techniques and performance tuning {#query-optimization} + + + +## Scaling strategies and resource management {#scaling} + +## Monitoring {#monitoring} + +| Page | Description | +|-----------------------------------------------------------------|-------------------------------------------------------------------------------| +| [Advanced dashboard](/cloud/manage/monitor/advanced-dashboard) | Use the built in advanced dashboard to monitor service health and performance | +| [Prometheus integration](/integrations/prometheus) | Use Prometheus to monitor Cloud services | + +## Security {#security} + + + +## Cost optimization and billing {#cost-optimization} + +| Page | Description | +|-----------------------------------------------------|-----------------------------------------------------------------------------------------------------------| +| [Data transfer](/cloud/manage/network-data-transfer)| Understand how ClickHouse Cloud meters data transferred ingress and egress | +| [Notifications](/cloud/notifications) | Set up notifications for your ClickHouse Cloud service. For example, when credit usage passes a threshold | diff --git a/docs/cloud/onboard/index.md b/docs/cloud/onboard/index.md new file mode 100644 index 00000000000..403ef1a094c --- /dev/null +++ b/docs/cloud/onboard/index.md @@ -0,0 +1,46 @@ +--- +slug: /cloud/get-started +title: 'Get started with ClickHouse Cloud' +hide_title: true +--- + +# Get started with ClickHouse Cloud + +New to ClickHouse Cloud and not sure where to begin? In this section of the docs, +we'll walk you through everything you need to get up and running quickly. We've +arranged this getting started section into three subsections to help guide +you through each step of the process as you explore ClickHouse Cloud. + + + +## Discover ClickHouse Cloud {#discover-clickhouse-cloud} + +- Learn about what ClickHouse Cloud is, and how it differs from the open-source version +- Discover the main use-cases of ClickHouse Cloud +- Learn about ClickHouse Cloud pricing + +## Get set up with ClickHouse Cloud {#get-set-up-with-clickhouse-cloud} + +Now that you know what ClickHouse Cloud is, we'll walk you through the process +of getting your data into ClickHouse Cloud, show you the main features available +and point you towards some general best practices you should know. + +Topics include: + +- Migration guides from various platforms +- Cloud architecture + +## Tune your ClickHouse Cloud deployment {#evaluate-clickhouse-cloud} + +Now that your data is in ClickHouse Cloud, we'll walk you through some more advanced +topics to help you get the most out of your ClickHouse Cloud experience and explore +what the platform has to offer. 
+ +Topics include: + +- Query performance +- Monitoring +- Security considerations +- Troubleshooting tips + + \ No newline at end of file diff --git a/docs/cloud/reference/changelog.md b/docs/cloud/reference/01_changelog/01_changelog.md similarity index 99% rename from docs/cloud/reference/changelog.md rename to docs/cloud/reference/01_changelog/01_changelog.md index 8f67d0076de..c792b3f4f93 100644 --- a/docs/cloud/reference/changelog.md +++ b/docs/cloud/reference/01_changelog/01_changelog.md @@ -63,10 +63,10 @@ to get up and running. - New services now store database and table metadata in a central **SharedCatalog**, a new model for coordination and object lifecycles which enables: - - **Cloud-scale DDL**, even under high concurrency - - **Resilient deletion and new DDL operations** - - **Fast spin-up and wake-ups** as stateless nodes now launch with no disk dependencies - - **Stateless compute across both native and open formats**, including Iceberg and Delta Lake + - **Cloud-scale DDL**, even under high concurrency + - **Resilient deletion and new DDL operations** + - **Fast spin-up and wake-ups** as stateless nodes now launch with no disk dependencies + - **Stateless compute across both native and open formats**, including Iceberg and Delta Lake Read more about SharedCatalog in our [blog](https://clickhouse.com/blog/clickhouse-cloud-stateless-compute) @@ -767,12 +767,12 @@ This release upgrades the core database version, adds ability to set up private ### Integrations changes {#integrations-changes-4} * Kafka Connect - * Support async_insert for exactly once (disabled by default) + * Support async_insert for exactly once (disabled by default) * Golang client - * Fixed DateTime binding - * Improved batch insert performance + * Fixed DateTime binding + * Improved batch insert performance * Java client - * Fixed request compression problem + * Fixed request compression problem ### Settings changes {#settings-changes} * `use_mysql_types_in_show_columns` is no longer required. It will be automatically enabled when you connect through the MySQL interface. 
diff --git a/docs/cloud/changelogs/24_02.md b/docs/cloud/reference/01_changelog/02_release_notes/24_02.md similarity index 100% rename from docs/cloud/changelogs/24_02.md rename to docs/cloud/reference/01_changelog/02_release_notes/24_02.md diff --git a/docs/cloud/changelogs/24_05.md b/docs/cloud/reference/01_changelog/02_release_notes/24_05.md similarity index 100% rename from docs/cloud/changelogs/24_05.md rename to docs/cloud/reference/01_changelog/02_release_notes/24_05.md diff --git a/docs/cloud/changelogs/24_06.md b/docs/cloud/reference/01_changelog/02_release_notes/24_06.md similarity index 100% rename from docs/cloud/changelogs/24_06.md rename to docs/cloud/reference/01_changelog/02_release_notes/24_06.md diff --git a/docs/cloud/changelogs/24_08.md b/docs/cloud/reference/01_changelog/02_release_notes/24_08.md similarity index 100% rename from docs/cloud/changelogs/24_08.md rename to docs/cloud/reference/01_changelog/02_release_notes/24_08.md diff --git a/docs/cloud/changelogs/24_10.md b/docs/cloud/reference/01_changelog/02_release_notes/24_10.md similarity index 100% rename from docs/cloud/changelogs/24_10.md rename to docs/cloud/reference/01_changelog/02_release_notes/24_10.md diff --git a/docs/cloud/changelogs/24_12.md b/docs/cloud/reference/01_changelog/02_release_notes/24_12.md similarity index 100% rename from docs/cloud/changelogs/24_12.md rename to docs/cloud/reference/01_changelog/02_release_notes/24_12.md diff --git a/docs/cloud/changelogs/25_04.md b/docs/cloud/reference/01_changelog/02_release_notes/25_04.md similarity index 100% rename from docs/cloud/changelogs/25_04.md rename to docs/cloud/reference/01_changelog/02_release_notes/25_04.md diff --git a/docs/cloud/changelogs/25_06.md b/docs/cloud/reference/01_changelog/02_release_notes/25_06.md similarity index 100% rename from docs/cloud/changelogs/25_06.md rename to docs/cloud/reference/01_changelog/02_release_notes/25_06.md diff --git a/docs/cloud/reference/01_changelog/02_release_notes/_category_.json b/docs/cloud/reference/01_changelog/02_release_notes/_category_.json new file mode 100644 index 00000000000..4eeae460788 --- /dev/null +++ b/docs/cloud/reference/01_changelog/02_release_notes/_category_.json @@ -0,0 +1,6 @@ +{ + "label": "Release notes", + "collapsible": true, + "collapsed": true, + "link": { "type": "doc", "id": "cloud/reference/changelog/release_notes/index" } +} \ No newline at end of file diff --git a/docs/cloud/reference/release-notes-index.md b/docs/cloud/reference/01_changelog/02_release_notes/index.md similarity index 100% rename from docs/cloud/reference/release-notes-index.md rename to docs/cloud/reference/01_changelog/02_release_notes/index.md diff --git a/docs/cloud/reference/01_changelog/_category_.json b/docs/cloud/reference/01_changelog/_category_.json new file mode 100644 index 00000000000..60a9e95ee7e --- /dev/null +++ b/docs/cloud/reference/01_changelog/_category_.json @@ -0,0 +1,6 @@ +{ + "label": "Change logs", + "collapsible": true, + "collapsed": true, + "link": { "type": "doc", "id": "cloud/reference/changelog/index" } +} \ No newline at end of file diff --git a/docs/cloud/reference/changelogs-index.md b/docs/cloud/reference/01_changelog/index.md similarity index 91% rename from docs/cloud/reference/changelogs-index.md rename to docs/cloud/reference/01_changelog/index.md index c23e70f4ea2..cfdb11087f8 100644 --- a/docs/cloud/reference/changelogs-index.md +++ b/docs/cloud/reference/01_changelog/index.md @@ -7,4 +7,4 @@ description: 'Landing page for Cloud changelogs' | Page | 
Description | |---------------------------------------------------------------|-------------------------------------------------| | [Cloud Changelog](/whats-new/cloud) | Changelog for ClickHouse Cloud | -| [Release Notes](/cloud/reference/changelogs/release-notes) | Release notes for all ClickHouse Cloud releases | +| [Release Notes](/cloud/reference/changelogs/release-notes) | Release notes for all ClickHouse Cloud releases | \ No newline at end of file diff --git a/docs/cloud/reference/architecture.md b/docs/cloud/reference/02_architecture.md similarity index 98% rename from docs/cloud/reference/architecture.md rename to docs/cloud/reference/02_architecture.md index 9c3d7cf5f56..6e3294d3a97 100644 --- a/docs/cloud/reference/architecture.md +++ b/docs/cloud/reference/02_architecture.md @@ -1,7 +1,7 @@ --- sidebar_label: 'Architecture' slug: /cloud/reference/architecture -title: 'ClickHouse Cloud Architecture' +title: 'ClickHouse Cloud architecture' description: 'This page describes the architecture of ClickHouse Cloud' --- diff --git a/docs/cloud/manage/billing.md b/docs/cloud/reference/03_billing/01_billing_overview.md similarity index 99% rename from docs/cloud/manage/billing.md rename to docs/cloud/reference/03_billing/01_billing_overview.md index 3745df1d2aa..0d6993702d7 100644 --- a/docs/cloud/manage/billing.md +++ b/docs/cloud/reference/03_billing/01_billing_overview.md @@ -5,7 +5,7 @@ title: 'Pricing' description: 'Overview page for ClickHouse Cloud pricing' --- -import ClickPipesFAQ from './jan2025_faq/_snippets/_clickpipes_faq.md' +import ClickPipesFAQ from '../09_jan2025_faq/_snippets/_clickpipes_faq.md' For pricing information, see the [ClickHouse Cloud Pricing](https://clickhouse.com/pricing#pricing-calculator) page. ClickHouse Cloud bills based on the usage of compute, storage, [data transfer](/cloud/manage/network-data-transfer) (egress over the internet and cross-region), and [ClickPipes](/integrations/clickpipes). 
diff --git a/docs/cloud/manage/billing/marketplace/aws-marketplace-committed.md b/docs/cloud/reference/03_billing/02_marketplace/aws-marketplace-committed.md similarity index 100% rename from docs/cloud/manage/billing/marketplace/aws-marketplace-committed.md rename to docs/cloud/reference/03_billing/02_marketplace/aws-marketplace-committed.md diff --git a/docs/cloud/manage/billing/marketplace/aws-marketplace-payg.md b/docs/cloud/reference/03_billing/02_marketplace/aws-marketplace-payg.md similarity index 100% rename from docs/cloud/manage/billing/marketplace/aws-marketplace-payg.md rename to docs/cloud/reference/03_billing/02_marketplace/aws-marketplace-payg.md diff --git a/docs/cloud/manage/billing/marketplace/azure-marketplace-committed.md b/docs/cloud/reference/03_billing/02_marketplace/azure-marketplace-committed.md similarity index 100% rename from docs/cloud/manage/billing/marketplace/azure-marketplace-committed.md rename to docs/cloud/reference/03_billing/02_marketplace/azure-marketplace-committed.md diff --git a/docs/cloud/manage/billing/marketplace/azure-marketplace-payg.md b/docs/cloud/reference/03_billing/02_marketplace/azure-marketplace-payg.md similarity index 100% rename from docs/cloud/manage/billing/marketplace/azure-marketplace-payg.md rename to docs/cloud/reference/03_billing/02_marketplace/azure-marketplace-payg.md diff --git a/docs/cloud/manage/billing/marketplace/gcp-marketplace-committed.md b/docs/cloud/reference/03_billing/02_marketplace/gcp-marketplace-committed.md similarity index 100% rename from docs/cloud/manage/billing/marketplace/gcp-marketplace-committed.md rename to docs/cloud/reference/03_billing/02_marketplace/gcp-marketplace-committed.md diff --git a/docs/cloud/manage/billing/marketplace/gcp-marketplace-payg.md b/docs/cloud/reference/03_billing/02_marketplace/gcp-marketplace-payg.md similarity index 100% rename from docs/cloud/manage/billing/marketplace/gcp-marketplace-payg.md rename to docs/cloud/reference/03_billing/02_marketplace/gcp-marketplace-payg.md diff --git a/docs/cloud/manage/billing/marketplace/index.md b/docs/cloud/reference/03_billing/02_marketplace/index.md similarity index 100% rename from docs/cloud/manage/billing/marketplace/index.md rename to docs/cloud/reference/03_billing/02_marketplace/index.md diff --git a/docs/cloud/manage/billing/marketplace/overview.md b/docs/cloud/reference/03_billing/02_marketplace/overview.md similarity index 100% rename from docs/cloud/manage/billing/marketplace/overview.md rename to docs/cloud/reference/03_billing/02_marketplace/overview.md diff --git a/docs/cloud/manage/billing/payment-thresholds.md b/docs/cloud/reference/03_billing/03_payment-thresholds.md similarity index 97% rename from docs/cloud/manage/billing/payment-thresholds.md rename to docs/cloud/reference/03_billing/03_payment-thresholds.md index 0c2b6948d0e..2d9ce5f188a 100644 --- a/docs/cloud/manage/billing/payment-thresholds.md +++ b/docs/cloud/reference/03_billing/03_payment-thresholds.md @@ -1,7 +1,7 @@ --- sidebar_label: 'Payment Thresholds' slug: /cloud/billing/payment-thresholds -title: 'Payment Thresholds' +title: 'Payment thresholds' description: 'Payment thresholds and automatic invoicing for ClickHouse Cloud.' 
keywords: ['billing', 'payment thresholds', 'automatic invoicing', 'invoice'] --- diff --git a/docs/cloud/reference/03_billing/04_network-data-transfer.mdx b/docs/cloud/reference/03_billing/04_network-data-transfer.mdx new file mode 100644 index 00000000000..4013e1477b7 --- /dev/null +++ b/docs/cloud/reference/03_billing/04_network-data-transfer.mdx @@ -0,0 +1,56 @@ +--- +sidebar_label: 'Data Transfer' +slug: /cloud/manage/network-data-transfer +title: 'Data Transfer' +description: 'Understand how ClickHouse Cloud meters data transferred ingress and egress' +--- + +import NetworkPricing from '@site/docs/cloud/reference/_snippets/_network_transfer_rates.md'; + +ClickHouse Cloud meters data transferred ingress and egress. +This includes any data in and out of ClickHouse Cloud as well as any intra-region +and cross-region data transfer. This usage is tracked at the service level. Based +on this usage, customers incur data transfer charges that are then added to their +monthly bill. + +ClickHouse Cloud charges for: +- Data egress from ClickHouse Cloud to the public Internet, including to other +regions of other cloud providers. +- Data egress to another region in the same cloud provider. + +There are no charges for intra-region data transfer or Private Link/Private +Service Connect use and data transfer. However, we reserve the right to implement +additional data transfer pricing dimensions if we see usage patterns that impact +our ability to charge users appropriately. + +Data transfer charges vary by Cloud Service Provider (CSP) and region. +Public internet egress pricing is based only on the origin region. +Inter-region (or cross-region) pricing depends on both the origin and destination +regions. + +**Best Practices to minimize Data Transfer Costs** + +There are some patterns to keep in mind when ingressing and egressing data to +minimize data transfer costs. + +1. When ingressing or egressing data from ClickHouse Cloud, use compression where +possible to minimize the amount of data transferred and the associated cost. + +2. Be aware that when doing an INSERT over the native protocol with non-inlined +values (e.g. `INSERT INTO [TABLE] FROM INFILE [FILE] FORMAT NATIVE`), ClickHouse +clients pull metadata from servers to pack the data. If the metadata is larger +than the `INSERT` payload, you might counterintuitively see more egress than +there is ingress from the server perspective. If this is unacceptable, consider +inlining data with `VALUES` syntax or using the HTTP protocol. + +The tables below show how data transfer charges for egress vary across public +internet or cross-region by cloud provider and region. + +:::note +ClickHouse Cloud meters inter-region usage in terms of tiers, Tier 1 through +Tier 4, depending on the origin and destination regions. The table below shows +the tier for each combination of inter-region data transfer. In the Billing usage +screen on ClickHouse Cloud, you will see data transfer usage broken out by tiers.
+::: + + diff --git a/docs/cloud/manage/troubleshooting-billing-issues.md b/docs/cloud/reference/03_billing/05_billing_compliance.md similarity index 100% rename from docs/cloud/manage/troubleshooting-billing-issues.md rename to docs/cloud/reference/03_billing/05_billing_compliance.md diff --git a/docs/cloud/manage/billing/index.md b/docs/cloud/reference/03_billing/index.md similarity index 100% rename from docs/cloud/manage/billing/index.md rename to docs/cloud/reference/03_billing/index.md diff --git a/docs/cloud/reference/supported-regions.md b/docs/cloud/reference/05_supported-regions.md similarity index 98% rename from docs/cloud/reference/supported-regions.md rename to docs/cloud/reference/05_supported-regions.md index f434b8786e1..4086227f4ab 100644 --- a/docs/cloud/reference/supported-regions.md +++ b/docs/cloud/reference/05_supported-regions.md @@ -1,6 +1,6 @@ --- title: 'Supported Cloud Regions' -sidebar_label: 'Supported Cloud Regions' +sidebar_label: 'Supported Cloud regions' keywords: ['aws', 'gcp', 'google cloud', 'azure', 'cloud', 'regions'] description: 'Supported regions for ClickHouse Cloud' slug: /cloud/reference/supported-regions diff --git a/docs/cloud/manage/service-uptime.md b/docs/cloud/reference/06_service-uptime.md similarity index 95% rename from docs/cloud/manage/service-uptime.md rename to docs/cloud/reference/06_service-uptime.md index 3a31e459eaf..33397a626be 100644 --- a/docs/cloud/manage/service-uptime.md +++ b/docs/cloud/reference/06_service-uptime.md @@ -1,7 +1,7 @@ --- sidebar_label: 'Service Uptime and SLA' slug: /cloud/manage/service-uptime -title: 'Service Uptime' +title: 'Service uptime' description: 'Users can now see regional uptimes on the status page and subscribe to alerts on service disruptions.' --- diff --git a/docs/cloud/manage/settings.md b/docs/cloud/reference/08_settings.md similarity index 94% rename from docs/cloud/manage/settings.md rename to docs/cloud/reference/08_settings.md index a766ef59c13..9926c5833cb 100644 --- a/docs/cloud/manage/settings.md +++ b/docs/cloud/reference/08_settings.md @@ -1,7 +1,7 @@ --- -sidebar_label: 'Configuring Settings' +sidebar_label: 'Configuring settings' slug: /manage/settings -title: 'Configuring Settings' +title: 'Configuring settings' description: 'How to configure settings for your ClickHouse Cloud service for a specific user or role' --- diff --git a/docs/cloud/manage/jan2025_faq/_snippets/_clickpipes_faq.md b/docs/cloud/reference/09_jan2025_faq/_snippets/_clickpipes_faq.md similarity index 100% rename from docs/cloud/manage/jan2025_faq/_snippets/_clickpipes_faq.md rename to docs/cloud/reference/09_jan2025_faq/_snippets/_clickpipes_faq.md diff --git a/docs/cloud/manage/jan2025_faq/backup.md b/docs/cloud/reference/09_jan2025_faq/backup.md similarity index 100% rename from docs/cloud/manage/jan2025_faq/backup.md rename to docs/cloud/reference/09_jan2025_faq/backup.md diff --git a/docs/cloud/manage/jan2025_faq/billing.md b/docs/cloud/reference/09_jan2025_faq/billing.md similarity index 100% rename from docs/cloud/manage/jan2025_faq/billing.md rename to docs/cloud/reference/09_jan2025_faq/billing.md diff --git a/docs/cloud/manage/jan2025_faq/dimensions.md b/docs/cloud/reference/09_jan2025_faq/dimensions.md similarity index 94% rename from docs/cloud/manage/jan2025_faq/dimensions.md rename to docs/cloud/reference/09_jan2025_faq/dimensions.md index c4dd9268593..01e4937000f 100644 --- a/docs/cloud/manage/jan2025_faq/dimensions.md +++ b/docs/cloud/reference/09_jan2025_faq/dimensions.md @@ -1,5 +1,5 @@ --- 
-title: 'New Pricing Dimensions' +title: 'New pricing dimensions' slug: /cloud/manage/jan-2025-faq/pricing-dimensions keywords: ['new pricing', 'dimensions'] description: 'Pricing dimensions for data transfer and ClickPipes' @@ -9,7 +9,7 @@ import Image from '@theme/IdealImage'; import clickpipesPricingFaq1 from '@site/static/images/cloud/manage/jan2025_faq/external_clickpipes_pricing_faq_1.png'; import clickpipesPricingFaq2 from '@site/static/images/cloud/manage/jan2025_faq/external_clickpipes_pricing_faq_2.png'; import clickpipesPricingFaq3 from '@site/static/images/cloud/manage/jan2025_faq/external_clickpipes_pricing_faq_3.png'; -import NetworkPricing from '@site/docs/cloud/manage/_snippets/_network_transfer_rates.md'; +import NetworkPricing from '@site/docs/cloud/reference/_snippets/_network_transfer_rates.md'; import ClickPipesFAQ from './_snippets/_clickpipes_faq.md' The following dimensions have been added to the new ClickHouse Cloud pricing. diff --git a/docs/cloud/manage/jan2025_faq/index.md b/docs/cloud/reference/09_jan2025_faq/index.md similarity index 100% rename from docs/cloud/manage/jan2025_faq/index.md rename to docs/cloud/reference/09_jan2025_faq/index.md diff --git a/docs/cloud/manage/jan2025_faq/new_tiers.md b/docs/cloud/reference/09_jan2025_faq/new_tiers.md similarity index 99% rename from docs/cloud/manage/jan2025_faq/new_tiers.md rename to docs/cloud/reference/09_jan2025_faq/new_tiers.md index b90874aedb8..6aa943808d2 100644 --- a/docs/cloud/manage/jan2025_faq/new_tiers.md +++ b/docs/cloud/reference/09_jan2025_faq/new_tiers.md @@ -1,5 +1,5 @@ --- -title: 'Description of New Tiers' +title: 'Description of new tiers' slug: /cloud/manage/jan-2025-faq/new-tiers keywords: ['new tiers', 'features', 'pricing', 'description'] description: 'Description of new tiers and features' diff --git a/docs/cloud/manage/jan2025_faq/plan_migrations.md b/docs/cloud/reference/09_jan2025_faq/plan_migrations.md similarity index 99% rename from docs/cloud/manage/jan2025_faq/plan_migrations.md rename to docs/cloud/reference/09_jan2025_faq/plan_migrations.md index fffdebfe45b..fc5cfc56233 100644 --- a/docs/cloud/manage/jan2025_faq/plan_migrations.md +++ b/docs/cloud/reference/09_jan2025_faq/plan_migrations.md @@ -1,5 +1,5 @@ --- -title: 'Migrating to New Plans' +title: 'Migrating to new plans' slug: /cloud/manage/jan-2025-faq/plan-migrations keywords: ['migration', 'new tiers', 'pricing', 'cost', 'estimation'] description: 'Migrating to new plans, tiers, pricing, how to decide and estimate costs' diff --git a/docs/cloud/manage/jan2025_faq/scaling.md b/docs/cloud/reference/09_jan2025_faq/scaling.md similarity index 100% rename from docs/cloud/manage/jan2025_faq/scaling.md rename to docs/cloud/reference/09_jan2025_faq/scaling.md diff --git a/docs/cloud/manage/jan2025_faq/summary.md b/docs/cloud/reference/09_jan2025_faq/summary.md similarity index 100% rename from docs/cloud/manage/jan2025_faq/summary.md rename to docs/cloud/reference/09_jan2025_faq/summary.md diff --git a/docs/cloud/reference/09_security/_category_.json b/docs/cloud/reference/09_security/_category_.json new file mode 100644 index 00000000000..aed26fa7f7a --- /dev/null +++ b/docs/cloud/reference/09_security/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Security", + "collapsible": true, + "collapsed": true, +} \ No newline at end of file diff --git a/docs/cloud/security/audit-logging.md b/docs/cloud/reference/09_security/audit-logging.md similarity index 100% rename from docs/cloud/security/audit-logging.md rename to 
docs/cloud/reference/09_security/audit-logging.md diff --git a/docs/cloud/reference/09_security/privacy_and_compliance/_category_.json b/docs/cloud/reference/09_security/privacy_and_compliance/_category_.json new file mode 100644 index 00000000000..99beeb3e924 --- /dev/null +++ b/docs/cloud/reference/09_security/privacy_and_compliance/_category_.json @@ -0,0 +1,6 @@ +{ + "label": "Privacy and compliance", + "collapsible": true, + "collapsed": true, + "link": { "type": "doc", "id": "cloud/reference/security/privacy_and_compliance/index" } +} \ No newline at end of file diff --git a/docs/cloud/security/compliance-overview.md b/docs/cloud/reference/09_security/privacy_and_compliance/compliance-overview.md similarity index 95% rename from docs/cloud/security/compliance-overview.md rename to docs/cloud/reference/09_security/privacy_and_compliance/compliance-overview.md index 4653c0f09c1..8b5be8b5766 100644 --- a/docs/cloud/security/compliance-overview.md +++ b/docs/cloud/reference/09_security/privacy_and_compliance/compliance-overview.md @@ -1,9 +1,3 @@ ---- -sidebar_label: 'Security and Compliance' -slug: /cloud/security/security-and-compliance -title: 'Security and Compliance' -description: 'This page describes the security and compliance measures implemented by ClickHouse Cloud to protect customer data.' ---- import BetaBadge from '@theme/badges/BetaBadge'; import EnterprisePlanFeatureBadge from '@theme/badges/EnterprisePlanFeatureBadge'; diff --git a/docs/cloud/security/privacy-compliance-overview.md b/docs/cloud/reference/09_security/privacy_and_compliance/index.md similarity index 100% rename from docs/cloud/security/privacy-compliance-overview.md rename to docs/cloud/reference/09_security/privacy_and_compliance/index.md diff --git a/docs/cloud/security/personal-data-access.md b/docs/cloud/reference/09_security/privacy_and_compliance/personal-data-access.md similarity index 98% rename from docs/cloud/security/personal-data-access.md rename to docs/cloud/reference/09_security/privacy_and_compliance/personal-data-access.md index bcf4514b301..3bdc8ca3302 100644 --- a/docs/cloud/security/personal-data-access.md +++ b/docs/cloud/reference/09_security/privacy_and_compliance/personal-data-access.md @@ -1,7 +1,7 @@ --- -sidebar_label: 'Personal Data Access' +sidebar_label: 'Personal data access' slug: /cloud/security/personal-data-access -title: 'Personal Data Access' +title: 'Personal data access' description: 'As a registered user, ClickHouse allows you to view and manage your personal account data, including contact information.' --- diff --git a/docs/cloud/manage/account-close.md b/docs/cloud/reference/10_account-close.md similarity index 98% rename from docs/cloud/manage/account-close.md rename to docs/cloud/reference/10_account-close.md index ac9a79eeeea..021345d4a94 100644 --- a/docs/cloud/manage/account-close.md +++ b/docs/cloud/reference/10_account-close.md @@ -1,11 +1,12 @@ --- -sidebar_label: 'Delete Account' +sidebar_label: 'Account closure' slug: /cloud/manage/close_account -title: 'Account Close & Deletion' +title: 'Account closure and deletion' description: 'We know there are circumstances that sometimes necessitate account closure. This guide will help you through the process.' --- ## Account closure and deletion {#account-close--deletion} + Our goal is to help you be successful in your project. If you have questions that are not answered on this site or need help evaluating a unique use case, please contact us at [support@clickhouse.com](mailto:support@clickhouse.com). 
diff --git a/docs/cloud/manage/_snippets/_network_transfer_rates.md b/docs/cloud/reference/_snippets/_network_transfer_rates.md similarity index 100% rename from docs/cloud/manage/_snippets/_network_transfer_rates.md rename to docs/cloud/reference/_snippets/_network_transfer_rates.md diff --git a/docs/cloud/security/_category_.yml b/docs/cloud/security/_category_.yml deleted file mode 100644 index b7253753fd5..00000000000 --- a/docs/cloud/security/_category_.yml +++ /dev/null @@ -1,6 +0,0 @@ -label: 'Cloud Security' -collapsible: true -collapsed: true -link: - type: generated-index - title: Cloud Security diff --git a/docs/cloud/security/index.md b/docs/cloud/security/index.md deleted file mode 100644 index b6a2d56ab1b..00000000000 --- a/docs/cloud/security/index.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -slug: /cloud/security -keywords: ['Cloud', 'Security'] -title: 'Overview' -hide_title: true -description: 'Landing page for ClickHouse Cloud Security' ---- - -# ClickHouse Cloud security - -This section delves into security in ClickHouse Cloud and contains the following pages: - -| Page | Description | -|---------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| [Shared Responsibility Model](shared-responsibility-model.md) | Information on the security features offered for each service type. | -| [Cloud Access Management](cloud-access-management/index.md) | Information on access control, authentication, SSO setup, common access management queries and how to invite new users. | -| [Connectivity](connectivity-overview.md) | Information on setting IP filters, private networking, secure access of S3 data and Cloud IP addresses. | -| [Enhanced Encryption](cmek.md) | Data at rest is encrypted by default using cloud provider-managed AES 256 keys. Customers may enable Transparent Data Encryption (TDE) to provide an additional layer of protection for service data. | -| [Audit Logging](audit-logging.md) | A guide to audit logging in ClickHouse Cloud. | -| [Privacy and Compliance](privacy-compliance-overview.md) | Information on security and compliance of ClickHouse Cloud, a guide on how to view and correct your personal information. | diff --git a/docs/guides/best-practices/_snippets/_performance_optimizations_table_of_contents.md b/docs/guides/best-practices/_snippets/_performance_optimizations_table_of_contents.md new file mode 100644 index 00000000000..3cbd396bb55 --- /dev/null +++ b/docs/guides/best-practices/_snippets/_performance_optimizations_table_of_contents.md @@ -0,0 +1,17 @@ +| Topic | Description | +|---------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| [Query optimization guide](/optimize/query-optimization) | Start here for query optimization fundamentals, covering common scenarios and performance techniques to improve query execution speed. | +| [Primary indexes advanced guide](/guides/best-practices/sparse-primary-indexes) | Deep dive into ClickHouse's unique sparse primary indexing system, how it differs from traditional databases, and best practices for optimal indexing strategies. 
| +| [Query parallelism](/optimize/query-parallelism) | Learn how ClickHouse parallelizes query execution using processing lanes and `max_threads` settings, including how to inspect and optimize parallel execution. | +| [Partitioning key](/optimize/partitioning-key) | Master partition key selection to dramatically improve query performance by enabling efficient data segment pruning and avoiding common partitioning pitfalls. | +| [Data skipping indexes](/optimize/skipping-indexes) | Apply secondary indexes strategically to skip irrelevant data blocks and accelerate filtered queries on non-primary key columns. | +| [`PREWHERE` optimization](/optimize/prewhere) | Understand how `PREWHERE` automatically reduces I/O by filtering data before reading unnecessary columns, plus how to monitor its effectiveness. | +| [Bulk inserts](/optimize/bulk-inserts) | Maximize ingestion throughput and reduce resource overhead by batching data insertions effectively. | +| [Asynchronous inserts](/optimize/asynchronous-inserts) | Improve insert performance by leveraging server-side batching to reduce client-side complexity and increase throughput for high-frequency insertions. | +| [Avoid mutations](/optimize/avoid-mutations) | Design append-only workflows that eliminate costly `UPDATE` and `DELETE` operations while maintaining data accuracy and performance. | +| [Avoid nullable columns](/optimize/avoid-nullable-columns) | Reduce storage overhead and improve query performance by using default values instead of nullable columns where possible. | +| [Avoid `OPTIMIZE FINAL`](/optimize/avoidoptimizefinal) | Understand when you should and should not use `OPTIMIZE TABLE FINAL` | +| [Analyzer](/operations/analyzer) | Leverage ClickHouse's new query analyzer to identify performance bottlenecks and optimize query execution plans for better efficiency. | +| [Query profiling](/operations/optimizing-performance/sampling-query-profiler) | Use the sampling query profiler to analyze query execution patterns, identify performance hot spots, and optimize resource usage. | +| [Query cache](/operations/query-cache) | Accelerate frequently executed `SELECT` queries by enabling and configuring ClickHouse's built-in query result caching. | +| [Testing hardware](/operations/performance-test) | Run ClickHouse performance benchmarks on any server without installation to evaluate hardware capabilities. (Not applicable to ClickHouse Cloud) | \ No newline at end of file diff --git a/docs/guides/best-practices/index.md b/docs/guides/best-practices/index.md index 0c52281492f..ef320eaf03c 100644 --- a/docs/guides/best-practices/index.md +++ b/docs/guides/best-practices/index.md @@ -5,26 +5,12 @@ description: 'Overview page of Performance and Optimizations' title: 'Performance and Optimizations' --- -# Performance and optimizations +import TableOfContents from '@site/docs/guides/best-practices/_snippets/_performance_optimizations_table_of_contents.md'; + +# Performance and Optimizations This section contains tips and best practices for improving performance with ClickHouse. We recommend users read [Core Concepts](/parts) as a precursor to this section, which covers the main concepts required to improve performance. 
-| Topic | Description | -|---------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| [Query Optimization Guide](/optimize/query-optimization) | A good place to start for query optimization, this simple guide describes common scenarios of how to use different performance and optimization techniques to improve query performance. | -| [Primary Indexes Advanced Guide](/guides/best-practices/sparse-primary-indexes) | A deep dive into ClickHouse indexing including how it differs from other DB systems, how ClickHouse builds and uses a table's spare primary index and what some of the best practices are for indexing in ClickHouse. | -| [Query Parallelism](/optimize/query-parallelism) | Explains how ClickHouse parallelizes query execution using processing lanes and the max_threads setting. Covers how data is distributed across lanes, how max_threads is applied, when it isn't fully used, and how to inspect execution with tools like EXPLAIN and trace logs. | -| [Partitioning Key](/optimize/partitioning-key) | Delves into ClickHouse partition key optimization. Explains how choosing the right partition key can significantly improve query performance by allowing ClickHouse to quickly locate relevant data segments. Covers best practices for selecting efficient partition keys and potential pitfalls to avoid. | -| [Data Skipping Indexes](/optimize/skipping-indexes) | Explains data skipping indexes as a way to optimize performance. | -| [PREWHERE Optimization](/optimize/prewhere) | Explains how PREWHERE reduces I/O by avoiding reading unnecessary column data. Shows how it's applied automatically, how the filtering order is chosen, and how to monitor it using EXPLAIN and logs. | -| [Bulk Inserts](/optimize/bulk-inserts) | Explains the benefits of using bulk inserts in ClickHouse. | -| [Asynchronous Inserts](/optimize/asynchronous-inserts) | Focuses on ClickHouse's asynchronous inserts feature. It likely explains how asynchronous inserts work (batching data on the server for efficient insertion) and their benefits (improved performance by offloading insert processing). It might also cover enabling asynchronous inserts and considerations for using them effectively in your ClickHouse environment. | -| [Avoid Mutations](/optimize/avoid-mutations) | Discusses the importance of avoiding mutations (updates and deletes) in ClickHouse. It recommends using append-only inserts for optimal performance and suggests alternative approaches for handling data changes. | -| [Avoid nullable columns](/optimize/avoid-nullable-columns) | Discusses why you may want to avoid nullable columns to save space and increase performance. Demonstrates how to set a default value for a column. | -| [Avoid `OPTIMIZE FINAL`](/optimize/avoidoptimizefinal) | Explains how the `OPTIMIZE TABLE ... FINAL` query is resource-intensive and suggests alternative approaches to optimize ClickHouse performance. | -| [Analyzer](/operations/analyzer) | Looks at the ClickHouse Analyzer, a tool for analyzing and optimizing queries. 
Discusses how the Analyzer works, its benefits (e.g., identifying performance bottlenecks), and how to use it to improve your ClickHouse queries' efficiency. | -| [Query Profiling](/operations/optimizing-performance/sampling-query-profiler) | Explains ClickHouse's Sampling Query Profiler, a tool that helps analyze query execution. | -| [Query Cache](/operations/query-cache) | Details ClickHouse's Query Cache, a feature that aims to improve performance by caching the results of frequently executed `SELECT` queries. | -| [Testing Hardware](/operations/performance-test) | How to run a basic ClickHouse performance test on any server without installation of ClickHouse packages. (Not applicable to ClickHouse Cloud) | + \ No newline at end of file diff --git a/docs/integrations/data-ingestion/redshift/_snippets/_migration_guide.md b/docs/integrations/data-ingestion/redshift/_snippets/_migration_guide.md new file mode 100644 index 00000000000..960120aa751 --- /dev/null +++ b/docs/integrations/data-ingestion/redshift/_snippets/_migration_guide.md @@ -0,0 +1,254 @@ +import redshiftToClickhouse from '@site/static/images/integrations/data-ingestion/redshift/redshift-to-clickhouse.png'; +import push from '@site/static/images/integrations/data-ingestion/redshift/push.png'; +import pull from '@site/static/images/integrations/data-ingestion/redshift/pull.png'; +import pivot from '@site/static/images/integrations/data-ingestion/redshift/pivot.png'; +import s3_1 from '@site/static/images/integrations/data-ingestion/redshift/s3-1.png'; +import s3_2 from '@site/static/images/integrations/data-ingestion/redshift/s3-2.png'; +import Image from '@theme/IdealImage'; + +## Introduction {#introduction} + +[Amazon Redshift](https://aws.amazon.com/redshift/) is a popular cloud data warehousing solution that is part of the Amazon Web Services offerings. This guide presents different approaches to migrating data from a Redshift instance to ClickHouse. We will cover three options: + +Redshift to ClickHouse Migration Options + +From the ClickHouse instance standpoint, you can either: + +1. **[PUSH](#push-data-from-redshift-to-clickhouse)** data to ClickHouse using a third party ETL/ELT tool or service + +2. **[PULL](#pull-data-from-redshift-to-clickhouse)** data from Redshift leveraging the ClickHouse JDBC Bridge + +3. **[PIVOT](#pivot-data-from-redshift-to-clickhouse-using-s3)** using S3 object storage using an "Unload then load" logic + +:::note +We used Redshift as a data source in this tutorial. However, the migration approaches presented here are not exclusive to Redshift, and similar steps can be derived for any compatible data source. +::: + +## Push Data from Redshift to ClickHouse {#push-data-from-redshift-to-clickhouse} + +In the push scenario, the idea is to leverage a third-party tool or service (either custom code or an [ETL/ELT](https://en.wikipedia.org/wiki/Extract,_transform,_load#ETL_vs._ELT)) to send your data to your ClickHouse instance. For example, you can use a software like [Airbyte](https://www.airbyte.com/) to move data between your Redshift instance (as a source) and ClickHouse as a destination ([see our integration guide for Airbyte](/integrations/data-ingestion/etl-tools/airbyte-and-clickhouse.md)) + +PUSH Redshift to ClickHouse + +### Pros {#pros} + +* It can leverage the existing catalog of connectors from the ETL/ELT software. +* Built-in capabilities to keep data in sync (append/overwrite/increment logic). 
+* Enable data transformation scenarios (for example, see our [integration guide for dbt](/integrations/data-ingestion/etl-tools/dbt/index.md)). + +### Cons {#cons} + +* Users need to set up and maintain an ETL/ELT infrastructure. +* Introduces a third-party element in the architecture which can turn into a potential scalability bottleneck. + +## Pull Data from Redshift to ClickHouse {#pull-data-from-redshift-to-clickhouse} + +In the pull scenario, the idea is to leverage the ClickHouse JDBC Bridge to connect to a Redshift cluster directly from a ClickHouse instance and perform `INSERT INTO ... SELECT` queries: + +PULL from Redshift to ClickHouse + +### Pros {#pros-1} + +* Generic to all JDBC compatible tools +* Elegant solution to allow querying multiple external data sources from within ClickHouse + +### Cons {#cons-1} + +* Requires a ClickHouse JDBC Bridge instance which can turn into a potential scalability bottleneck + +:::note +Even though Redshift is based on PostgreSQL, using the ClickHouse PostgreSQL table function or table engine is not possible since ClickHouse requires PostgreSQL version 9 or above and the Redshift API is based on an earlier version (8.x). +::: + +### Tutorial {#tutorial} + +To use this option, you need to set up a ClickHouse JDBC Bridge. ClickHouse JDBC Bridge is a standalone Java application that handles JDBC connectivity and acts as a proxy between the ClickHouse instance and the data sources. For this tutorial, we used a pre-populated Redshift instance with a [sample database](https://docs.aws.amazon.com/redshift/latest/dg/c_sampledb.html). + + + +#### Deploy ClickHouse JDBC Bridge {#deploy-clickhouse-jdbc-bridge} + +Deploy the ClickHouse JDBC Bridge. For more details, see our user guide on [JDBC for External Data sources](/integrations/data-ingestion/dbms/jdbc-with-clickhouse.md) + +:::note +If you are using ClickHouse Cloud, you will need to run your ClickHouse JDBC Bridge on a separate environment and connect to ClickHouse Cloud using the [remoteSecure](/sql-reference/table-functions/remote/) function +::: + +#### Configure your Redshift datasource {#configure-your-redshift-datasource} + +Configure your Redshift datasource for ClickHouse JDBC Bridge. For example, `/etc/clickhouse-jdbc-bridge/config/datasources/redshift.json ` + +```json +{ + "redshift-server": { + "aliases": [ + "redshift" + ], + "driverUrls": [ + "https://s3.amazonaws.com/redshift-downloads/drivers/jdbc/2.1.0.4/redshift-jdbc42-2.1.0.4.jar" + ], + "driverClassName": "com.amazon.redshift.jdbc.Driver", + "jdbcUrl": "jdbc:redshift://redshift-cluster-1.ckubnplpz1uv.us-east-1.redshift.amazonaws.com:5439/dev", + "username": "awsuser", + "password": "", + "maximumPoolSize": 5 + } +} +``` + +#### Query your Redshift instance from ClickHouse {#query-your-redshift-instance-from-clickhouse} + +Once ClickHouse JDBC Bridge deployed and running, you can start querying your Redshift instance from ClickHouse + +```sql +SELECT * +FROM jdbc('redshift', 'select username, firstname, lastname from users limit 5') +``` + +```response +Query id: 1b7de211-c0f6-4117-86a2-276484f9f4c0 + +┌─username─┬─firstname─┬─lastname─┐ +│ PGL08LJI │ Vladimir │ Humphrey │ +│ XDZ38RDD │ Barry │ Roy │ +│ AEB55QTM │ Reagan │ Hodge │ +│ OWY35QYB │ Tamekah │ Juarez │ +│ MSD36KVR │ Mufutau │ Watkins │ +└──────────┴───────────┴──────────┘ + +5 rows in set. Elapsed: 0.438 sec. 
+``` + +```sql +SELECT * +FROM jdbc('redshift', 'select count(*) from sales') +``` + +```response +Query id: 2d0f957c-8f4e-43b2-a66a-cc48cc96237b + +┌──count─┐ +│ 172456 │ +└────────┘ + +1 rows in set. Elapsed: 0.304 sec. +``` + +#### Import Data from Redshift to ClickHouse {#import-data-from-redshift-to-clickhouse} + +In the following, we display importing data using an `INSERT INTO ... SELECT` statement + +```sql +# TABLE CREATION with 3 columns +CREATE TABLE users_imported +( + `username` String, + `firstname` String, + `lastname` String +) +ENGINE = MergeTree +ORDER BY firstname +``` + +```response +Query id: c7c4c44b-cdb2-49cf-b319-4e569976ab05 + +Ok. + +0 rows in set. Elapsed: 0.233 sec. +``` + +```sql +INSERT INTO users_imported (*) SELECT * +FROM jdbc('redshift', 'select username, firstname, lastname from users') +``` + +```response +Query id: 9d3a688d-b45a-40f4-a7c7-97d93d7149f1 + +Ok. + +0 rows in set. Elapsed: 4.498 sec. Processed 49.99 thousand rows, 2.49 MB (11.11 thousand rows/s., 554.27 KB/s.) +``` + + + +## Pivot Data from Redshift to ClickHouse using S3 {#pivot-data-from-redshift-to-clickhouse-using-s3} + +In this scenario, we export data to S3 in an intermediary pivot format and, in a second step, load the data from S3 into ClickHouse. + +PIVOT from Redshift using S3 + +### Pros {#pros-2} + +* Both Redshift and ClickHouse have powerful S3 integration features. +* Leverages the existing features such as the Redshift `UNLOAD` command and ClickHouse S3 table function / table engine. +* Scales seamlessly thanks to parallel reads and high throughput capabilities from/to S3 in ClickHouse. +* Can leverage sophisticated and compressed formats like Apache Parquet. + +### Cons {#cons-2} + +* Two steps in the process (unload from Redshift then load into ClickHouse). + +### Tutorial {#tutorial-1} + + + +#### Export data into an S3 bucket using UNLOAD {#export-data-into-an-s3-bucket-using-unload} + +Using Redshift's [UNLOAD](https://docs.aws.amazon.com/redshift/latest/dg/r_UNLOAD.html) feature, export the data into an existing private S3 bucket: + +UNLOAD from Redshift to S3 + +It will generate part files containing the raw data in S3 + +Data in S3 + +#### Create the table in ClickHouse {#create-the-table-in-clickhouse} + +Create the table in ClickHouse: + +```sql +CREATE TABLE users +( + username String, + firstname String, + lastname String +) +ENGINE = MergeTree +ORDER BY username +``` + +Alternatively, ClickHouse can try to infer the table structure using `CREATE TABLE ... EMPTY AS SELECT`: + +```sql +CREATE TABLE users +ENGINE = MergeTree ORDER BY username +EMPTY AS +SELECT * FROM s3('https://your-bucket.s3.amazonaws.com/unload/users/*', '', '', 'CSV') +``` + +This works especially well when the data is in a format that contains information about data types, like Parquet. + +#### Load S3 files into ClickHouse {#load-s3-files-into-clickhouse} + +Load the S3 files into ClickHouse using an `INSERT INTO ... SELECT` statement: + +```sql +INSERT INTO users SELECT * +FROM s3('https://your-bucket.s3.amazonaws.com/unload/users/*', '', '', 'CSV') +``` + +```response +Query id: 2e7e219a-6124-461c-8d75-e4f5002c8557 + +Ok. + +0 rows in set. Elapsed: 0.545 sec. Processed 49.99 thousand rows, 2.34 MB (91.72 thousand rows/s., 4.30 MB/s.) +``` + +:::note +This example used CSV as the pivot format. However, for production workloads we recommend Apache Parquet as the best option for large migrations since it comes with compression and can save some storage costs while reducing transfer times. 
+
+ 
\ No newline at end of file
diff --git a/docs/integrations/data-ingestion/redshift/index.md b/docs/integrations/data-ingestion/redshift/index.md
index 3e936cec37b..217609acecd 100644
--- a/docs/integrations/data-ingestion/redshift/index.md
+++ b/docs/integrations/data-ingestion/redshift/index.md
@@ -7,17 +7,11 @@ keywords: ['Redshift']
show_related_blogs: true
---

-import redshiftToClickhouse from '@site/static/images/integrations/data-ingestion/redshift/redshift-to-clickhouse.png';
-import push from '@site/static/images/integrations/data-ingestion/redshift/push.png';
-import pull from '@site/static/images/integrations/data-ingestion/redshift/pull.png';
-import pivot from '@site/static/images/integrations/data-ingestion/redshift/pivot.png';
-import s3_1 from '@site/static/images/integrations/data-ingestion/redshift/s3-1.png';
-import s3_2 from '@site/static/images/integrations/data-ingestion/redshift/s3-2.png';
-import Image from '@theme/IdealImage';
+import MigrationGuide from '@site/docs/integrations/data-ingestion/redshift/_snippets/_migration_guide.md';

-# Migrating data from Redshift to ClickHouse
+# Migrating Data from Redshift to ClickHouse

-## Related content {#related-content}
+## Related Content {#related-content}