From c672e124c591e84833e85c304d511213e2fc30bf Mon Sep 17 00:00:00 2001 From: Igor Lukanin Date: Mon, 10 Mar 2025 12:58:25 +0100 Subject: [PATCH] docs: New concurrency pages and env vars --- DEPRECATION.md | 7 ++ docs/pages/product/caching/_meta.js | 1 + .../caching/refreshing-pre-aggregations.mdx | 21 +++++ docs/pages/product/configuration/_meta.js | 4 +- .../product/configuration/concurrency.mdx | 90 +++++++++++++++++++ .../product/configuration/data-sources.mdx | 28 ++---- .../configuration/data-sources/aws-athena.mdx | 15 ++-- .../data-sources/aws-redshift.mdx | 4 +- .../configuration/data-sources/clickhouse.mdx | 4 +- .../data-sources/databricks-jdbc.mdx | 4 +- .../configuration/data-sources/druid.mdx | 4 +- .../configuration/data-sources/duckdb.mdx | 4 +- .../data-sources/elasticsearch.mdx | 4 +- .../configuration/data-sources/firebolt.mdx | 4 +- .../data-sources/google-bigquery.mdx | 4 +- .../configuration/data-sources/hive.mdx | 4 +- .../configuration/data-sources/ksqldb.mdx | 4 +- .../data-sources/materialize.mdx | 4 +- .../configuration/data-sources/mongodb.mdx | 4 +- .../configuration/data-sources/ms-sql.mdx | 4 +- .../configuration/data-sources/mysql.mdx | 4 +- .../configuration/data-sources/oracle.mdx | 4 +- .../configuration/data-sources/pinot.mdx | 3 + .../configuration/data-sources/postgres.mdx | 4 +- .../configuration/data-sources/presto.mdx | 4 +- .../configuration/data-sources/questdb.mdx | 4 +- .../configuration/data-sources/risingwave.mdx | 4 +- .../configuration/data-sources/snowflake.mdx | 4 +- .../configuration/data-sources/sqlite.mdx | 4 +- .../configuration/data-sources/trino.mdx | 4 +- .../configuration/data-sources/vertica.mdx | 4 +- .../{advanced => }/multiple-data-sources.mdx | 7 +- .../{advanced => }/multitenancy.mdx | 0 .../configuration/environment-variables.mdx | 40 +++++++-- docs/redirects.json | 14 ++- 35 files changed, 250 insertions(+), 72 deletions(-) create mode 100644 docs/pages/product/caching/refreshing-pre-aggregations.mdx 
create mode 100644 docs/pages/product/configuration/concurrency.mdx rename docs/pages/product/configuration/{advanced => }/multiple-data-sources.mdx (97%) rename docs/pages/product/configuration/{advanced => }/multitenancy.mdx (100%) diff --git a/DEPRECATION.md b/DEPRECATION.md index 3c3268d637fd0..a66d5d6a1282f 100644 --- a/DEPRECATION.md +++ b/DEPRECATION.md @@ -64,6 +64,7 @@ features: | Removed | [`initApp` hook](#initapp-hook) | v0.35.0 | v0.35.0 | | Removed | [`/v1/run-scheduled-refresh` REST API endpoint](#v1run-scheduled-refresh-rest-api-endpoint) | v0.35.0 | v0.36.0 | | Deprecated | [Node.js 18](#nodejs-18) | v0.36.0 | | +| Deprecated | [`CUBEJS_SCHEDULED_REFRESH_CONCURRENCY`](#cubejs_scheduled_refresh_concurrency) | v1.2.7 | | ### Node.js 8 @@ -391,3 +392,9 @@ API](https://cube.dev/docs/product/apis-integrations/orchestration-api) and Node.js 18 reaches [End of Life on April 30, 2025][link-nodejs-eol]. This means no more updates. Please upgrade to Node.js 20 or higher. + +### `CUBEJS_SCHEDULED_REFRESH_CONCURRENCY` + +**Deprecated in Release: v1.2.7** + +This environment variable was renamed to [`CUBEJS_SCHEDULED_REFRESH_QUERIES_PER_APP_ID`](https://cube.dev/docs/reference/configuration/environment-variables#cubejs_scheduled_refresh_queries_per_app_id). Please use the new name. 
\ No newline at end of file diff --git a/docs/pages/product/caching/_meta.js b/docs/pages/product/caching/_meta.js index 1d8700a1c1376..2d7b0cafbb8a3 100644 --- a/docs/pages/product/caching/_meta.js +++ b/docs/pages/product/caching/_meta.js @@ -2,6 +2,7 @@ module.exports = { "getting-started-pre-aggregations": "Getting started with pre-aggregations", "using-pre-aggregations": "Using pre-aggregations", "matching-pre-aggregations": "Matching pre-aggregations", + "refreshing-pre-aggregations": "Refreshing pre-aggregations", "lambda-pre-aggregations": "Lambda pre-aggregations", "running-in-production": "Running in production" } \ No newline at end of file diff --git a/docs/pages/product/caching/refreshing-pre-aggregations.mdx b/docs/pages/product/caching/refreshing-pre-aggregations.mdx new file mode 100644 index 0000000000000..f8e248f1402ea --- /dev/null +++ b/docs/pages/product/caching/refreshing-pre-aggregations.mdx @@ -0,0 +1,21 @@ +# Refreshing pre-aggregations + +_Pre-aggregation refresh_ is the process of building pre-aggregations and updating +them with new data. Pre-aggregation refresh is the responsibility of the _refresh +worker_. 
+ +## Configuration + +You can use the following environment variables to configure the refresh worker +behavior: + +- `CUBEJS_REFRESH_WORKER` (see also `CUBEJS_PRE_AGGREGATIONS_BUILDER`) +- `CUBEJS_PRE_AGGREGATIONS_SCHEMA` +- `CUBEJS_SCHEDULED_REFRESH_TIMEZONES` +- `CUBEJS_DB_QUERY_TIMEOUT` +- `CUBEJS_REFRESH_WORKER_CONCURRENCY` (see also `CUBEJS_CONCURRENCY`) +- `CUBEJS_SCHEDULED_REFRESH_QUERIES_PER_APP_ID` +- `CUBEJS_DROP_PRE_AGG_WITHOUT_TOUCH` + + +[ref-multitenancy]: /product/configuration/multitenancy \ No newline at end of file diff --git a/docs/pages/product/configuration/_meta.js b/docs/pages/product/configuration/_meta.js index e6485428a4797..a183902a526e3 100644 --- a/docs/pages/product/configuration/_meta.js +++ b/docs/pages/product/configuration/_meta.js @@ -1,5 +1,7 @@ module.exports = { "data-sources": "Data sources", "visualization-tools": "Visualization tools", - "advanced": "Advanced" + "multiple-data-sources": "Multiple data sources", + "concurrency": "Concurrency", + "multitenancy": "Multitenancy" } \ No newline at end of file diff --git a/docs/pages/product/configuration/concurrency.mdx b/docs/pages/product/configuration/concurrency.mdx new file mode 100644 index 0000000000000..2b23880fac625 --- /dev/null +++ b/docs/pages/product/configuration/concurrency.mdx @@ -0,0 +1,90 @@ +# Querying concurrency + +All queries to [data APIs][ref-data-apis] are processed asynchronously via a _query +queue_. It allows Cube to optimize the load and increase querying performance. + +## Query queue + +The query queue allows Cube to deduplicate queries to API instances and insulate upstream +data sources from query spikes. It also allows Cube to execute queries to data sources +concurrently for increased performance. + +By default, Cube uses a _single_ query queue for queries from all API instances and +the refresh worker to all configured data sources. 
+ + + +You can read more about the query queue in [this blog post](https://cube.dev/blog/how-you-win-by-using-cube-store-part-1#query-queue-in-cube). + + + +### Multiple query queues + +You can use the [`context_to_orchestrator_id`][ref-context-to-orchestrator-id] +configuration option to route queries to multiple queues based on the security +context. + + + +If you're configuring multiple connections to data sources via the [`driver_factory` +configuration option][ref-driver-factory], you __must__ also configure +`context_to_orchestrator_id` to ensure that queries are routed to the correct queues. + + + +## Data sources + +Cube supports various kinds of [data sources][ref-data-sources], ranging from cloud +data warehouses to embedded databases. Each data source scales differently, +so Cube provides sound defaults for each kind of data source out-of-the-box. + +### Data source concurrency + +By default, Cube uses the following concurrency settings for data sources: + +| Data source | Default concurrency | +| --- | --- | +| [Amazon Athena][ref-athena] | 10 | +| [Amazon Redshift][ref-redshift] | 5 | +| [Apache Pinot][ref-pinot] | 10 | +| [ClickHouse][ref-clickhouse] | 10 | +| [Databricks][ref-databricks] | 10 | +| [Firebolt][ref-firebolt] | 10 | +| [Google BigQuery][ref-bigquery] | 10 | +| [Snowflake][ref-snowflake] | 8 | +| All other data sources | 5 or [less, if specified in the driver][link-github-data-source-concurrency] | + +You can use the `CUBEJS_CONCURRENCY` environment variable to adjust the maximum +number of concurrent queries to a data source. It's recommended to use the default +configuration unless you're sure that your data source can handle more concurrent +queries. + +### Connection pooling + +For data sources that support connection pooling, the maximum number of concurrent +connections to the database can also be set by using the `CUBEJS_DB_MAX_POOL` +environment variable. 
If changing this from the default, you must ensure that the +new value is greater than the number of concurrent connections used by Cube's query +queues and the refresh worker. + +## Refresh worker + +By default, the refresh worker uses the same concurrency settings as API instances. +However, you can override this behavior in the refresh worker +[configuration][ref-preagg-refresh]. + + +[ref-data-apis]: /product/apis-integrations +[ref-data-sources]: /product/configuration/data-sources +[ref-context-to-orchestrator-id]: /reference/configuration/config#context_to_orchestrator_id +[ref-driver-factory]: /reference/configuration/config#driver_factory +[ref-preagg-refresh]: /product/caching/refreshing-pre-aggregations#configuration +[ref-athena]: /product/configuration/data-sources/aws-athena +[ref-clickhouse]: /product/configuration/data-sources/clickhouse +[ref-databricks]: /product/configuration/data-sources/databricks-jdbc +[ref-firebolt]: /product/configuration/data-sources/firebolt +[ref-pinot]: /product/configuration/data-sources/pinot +[ref-redshift]: /product/configuration/data-sources/aws-redshift +[ref-snowflake]: /product/configuration/data-sources/snowflake +[ref-bigquery]: /product/configuration/data-sources/google-bigquery +[link-github-data-source-concurrency]: https://github.com/search?q=repo%3Acube-js%2Fcube+getDefaultConcurrency+path%3Apackages%2Fcubejs-&type=code \ No newline at end of file diff --git a/docs/pages/product/configuration/data-sources.mdx b/docs/pages/product/configuration/data-sources.mdx index d884d6a5b4047..3ec81c2d7b0bc 100644 --- a/docs/pages/product/configuration/data-sources.mdx +++ b/docs/pages/product/configuration/data-sources.mdx @@ -8,8 +8,9 @@ redirect_from: Choose a data source to get started with below. -Note that Cube also supports connecting to [multiple data -sources][ref-config-multi-data-src] out of the box. 
+You can also connect [multiple data sources][ref-config-multi-data-src] at the same +time and adjust the [concurrency settings][ref-data-source-concurrency] for data +sources. ## Data warehouses @@ -251,28 +252,9 @@ users on the [Enterprise Premier](https://cube.dev/pricing) product tier. -## Concurrency and pooling - - - -All Cube database drivers come with presets for concurrency and pooling that -work out-of-the-box. The following information is included as a reference. - - - -For increased performance, Cube uses multiple concurrent connections to -configured data sources. The `CUBEJS_CONCURRENCY` environment variable controls -concurrency settings for query queues and the refresh scheduler as well as the -maximum concurrent connections. - -For databases that support connection pooling, -the maximum number of concurrent connections to the database can also be set by -using the `CUBEJS_DB_MAX_POOL` environment variable; if changing this from the -default, you must ensure that the new value is greater than the number of -concurrent connections used by Cube's query queues and refresh scheduler. 
- [ref-config-multi-data-src]: /product/configuration/advanced/multiple-data-sources [ref-driver-factory]: /reference/configuration/config#driver_factory [ref-duckdb]: /product/configuration/data-sources/duckdb -[link-github-packages]: https://github.com/cube-js/cube/tree/master/packages \ No newline at end of file +[link-github-packages]: https://github.com/cube-js/cube/tree/master/packages +[ref-data-source-concurrency]: /product/configuration/concurrency#data-sources \ No newline at end of file diff --git a/docs/pages/product/configuration/data-sources/aws-athena.mdx b/docs/pages/product/configuration/data-sources/aws-athena.mdx index 7a24fa4fb7cd9..82bc6036288fb 100644 --- a/docs/pages/product/configuration/data-sources/aws-athena.mdx +++ b/docs/pages/product/configuration/data-sources/aws-athena.mdx @@ -1,9 +1,4 @@ ---- -redirect_from: - - /config/databases/aws-athena ---- - -# AWS Athena +# Amazon Athena ## Prerequisites @@ -39,7 +34,7 @@ Configuration in your deployment. -In Cube Cloud, select **AWS Athena** when creating a new deployment and fill in +In Cube Cloud, select **AWS Athena** when creating a new deployment and fill in the required fields: + +Previously, this environment variable was named `CUBEJS_SCHEDULED_REFRESH_CONCURRENCY`. + + + ## `CUBEJS_CUBESTORE_HOST` The hostname of the Cube Store deployment @@ -1038,15 +1070,7 @@ mode](/product/caching/using-pre-aggregations#rollup-only-mode) for details. It can be also set using the [`orchestrator_options.rollupOnlyMode` configuration option](/reference/configuration/config#orchestrator_options). -## `CUBEJS_SCHEDULED_REFRESH_CONCURRENCY` - -How many pre-aggregations refresh worker will build in parallel. 
Please note -changing this param doesn't change queue concurrency and it should be adjusted -accordingly -| Possible Values | Default in Development | Default in Production | -| ---------------------------------------------- | ---------------------- | --------------------- | -| A valid number of concurrent refresh processes | 10 | 10 | ## `CUBEJS_SCHEDULED_REFRESH_TIMEZONES` diff --git a/docs/redirects.json b/docs/redirects.json index 6155b8fb3a157..0eb1d477a6b3b 100644 --- a/docs/redirects.json +++ b/docs/redirects.json @@ -1,4 +1,14 @@ [ + { + "source": "/product/configuration/advanced/multiple-data-sources", + "destination": "/product/configuration/multiple-data-sources", + "permanent": true + }, + { + "source": "/product/configuration/advanced/multitenancy", + "destination": "/product/configuration/multitenancy", + "permanent": true + }, { "source": "/product/workspace/visual-modeler", "destination": "/product/workspace/visual-model", @@ -901,12 +911,12 @@ }, { "source": "/config/multitenancy", - "destination": "/product/configuration/advanced/multitenancy", + "destination": "/product/configuration/multitenancy", "permanent": true }, { "source": "/config/multiple-data-sources", - "destination": "/product/configuration/advanced/multiple-data-sources", + "destination": "/product/configuration/multiple-data-sources", "permanent": true }, {