diff --git a/antora.yml b/antora.yml index f91c30f7..e6e9a5cc 100644 --- a/antora.yml +++ b/antora.yml @@ -24,7 +24,21 @@ asciidoc: cass-migrator-short: 'CDM' dse: 'DataStax Enterprise (DSE)' dse-short: 'DSE' + hcd: 'Hyper-Converged Database (HCD)' + hcd-short: 'HCD' astra-db: 'Astra DB' + astra: 'Astra' + data-api: 'Data API' + db-serverless: 'Serverless (Non-Vector)' + db-serverless-vector: 'Serverless (Vector)' astra-ui: 'Astra Portal' astra-url: 'https://astra.datastax.com' - support-url: 'https://support.datastax.com' \ No newline at end of file + astra-ui-link: '{astra-url}[{astra-ui}^]' + sstable-sideloader: '{astra-db} Sideloader' + devops-api: 'DevOps API' + devops-api-ref-url: 'xref:astra-api-docs:ROOT:attachment$devops-api/index.html' + support-url: 'https://support.datastax.com' + mc: 'Mission Control (MC)' + mc-short: 'MC' + +#TODO: Bring SCB attributes \ No newline at end of file diff --git a/modules/ROOT/nav.adoc b/modules/ROOT/nav.adoc index 43175ced..0def7a1e 100644 --- a/modules/ROOT/nav.adoc +++ b/modules/ROOT/nav.adoc @@ -19,7 +19,6 @@ ** xref:migrate-and-validate-data.adoc[] ** xref:cassandra-data-migrator.adoc[{cass-migrator}] ** xref:dsbulk-migrator.adoc[{dsbulk-migrator}] -** https://docs.datastax.com/en/dsbulk/overview/dsbulk-about.html[{dsbulk-loader}] * Phase 3 ** xref:enable-async-dual-reads.adoc[] * Phase 4 @@ -27,26 +26,50 @@ * Phase 5 ** xref:connect-clients-to-target.adoc[] * References -** Troubleshooting -*** xref:troubleshooting.adoc[] -*** xref:troubleshooting-tips.adoc[] -*** xref:troubleshooting-scenarios.adoc[] +** xref:troubleshooting-tips.adoc[] +** xref:troubleshooting-scenarios.adoc[] ** xref:contributions.adoc[] ** xref:faqs.adoc[] ** xref:glossary.adoc[] -** xref:release-notes.adoc[] +** https://github.com/datastax/zdm-proxy/releases[{product-proxy} release notes] +** https://github.com/datastax/zdm-proxy-automation/releases[{product-automation} release notes] .{cass-migrator} -* xref:cdm-overview.adoc[{cass-migrator-short} overview] -* xref:cdm-steps.adoc[Use {cass-migrator-short} to migrate data] +* xref:cdm-overview.adoc[{cass-migrator}] +* https://github.com/datastax/cassandra-data-migrator/releases[{cass-migrator-short} release notes] .{dsbulk-loader} -* https://docs.datastax.com/en/dsbulk/overview/dsbulk-about.html[{dsbulk-loader}] -* https://docs.datastax.com/en/dsbulk/installing/install.html[Installing {dsbulk-loader}] +* xref:dsbulk:overview:dsbulk-about.adoc[{dsbulk-loader}] +* xref:dsbulk:installing:install.adoc[Installing {dsbulk-loader}] * Loading and unloading data -** https://docs.datastax.com/en/dsbulk/getting-started/simple-load.html[Loading data without a configuration file] -** https://docs.datastax.com/en/dsbulk/getting-started/simple-unload.html[Unloading data without a configuration file] -** https://docs.datastax.com/en/dsbulk/developing/loading-unloading-vector-data.html[Loading and unloading vector data] -** https://docs.datastax.com/en/dsbulk/reference/load.html[Loading data examples] -** https://docs.datastax.com/en/dsbulk/reference/unload.html[Unloading data examples] -* https://docs.datastax.com/en/dsbulk/reference/dsbulk-cmd.html#escaping-and-quoting-command-line-arguments[Escaping and quoting command line arguments] \ No newline at end of file +** xref:dsbulk:getting-started:simple-load.adoc[Loading data without a configuration file] +** xref:dsbulk:getting-started:simple-unload.adoc[Unloading data without a configuration file] +** xref:dsbulk:developing:loading-unloading-vector-data.adoc[Loading and unloading 
vector data]
+** xref:dsbulk:reference:load.adoc[Loading data examples]
+** xref:dsbulk:reference:unload.adoc[Unloading data examples]
+* xref:dsbulk:reference:dsbulk-cmd.adoc#escaping-and-quoting-command-line-arguments[Escaping and quoting command line arguments]
+* https://github.com/datastax/dsbulk/releases[{dsbulk-loader} release notes]
+
+.{sstable-sideloader}
+* xref:sideloader:sideloader-overview.adoc[]
+* xref:sideloader:prepare-sideloader.adoc[]
+* xref:sideloader:migrate-sideloader.adoc[]
+* xref:sideloader:stop-restart-sideloader.adoc[]
+* xref:sideloader:cleanup-sideloader.adoc[]
+* xref:sideloader:troubleshoot-sideloader.adoc[]
+
+.Product-specific migration paths
+* {astra-db}
+** xref:astra-db-serverless:databases:migration-path-serverless.adoc[]
+* {dse}
+** {dse-short} 6.9
+*** xref:6.9@dse:tooling:migration-path-dse.adoc[{dse-short} 6.9 migration tools]
+*** xref:6.9@dse:managing:operations/migrate-data.adoc[Migrate data to {dse-short} 6.9]
+** {dse-short} 6.8
+*** xref:6.8@dse:tooling:migration-path-dse.adoc[{dse-short} 6.8 migration tools]
+*** xref:6.8@dse:managing:operations/migrate-data.adoc[Migrate data to {dse-short} 6.8]
+** {dse-short} 5.1
+*** xref:5.1@dse:managing:operations/migrate-data.adoc[Migrate data to {dse-short} 5.1]
+* {mc}
+** xref:mission-control:migrate:oss-cass-to-mission-control.adoc[Migrate {cass-short} clusters to {mc-short}]
+** xref:mission-control:migrate:dse-to-mission-control.adoc[Migrate {dse-short} clusters to {mc-short}]
\ No newline at end of file
diff --git a/modules/ROOT/pages/.additional-resources.adoc b/modules/ROOT/pages/.additional-resources.adoc
deleted file mode 100644
index d49f2999..00000000
--- a/modules/ROOT/pages/.additional-resources.adoc
+++ /dev/null
@@ -1,13 +0,0 @@
-= Additional resources
-:page-tag: migration,zdm,zero-downtime,zdm-proxy
-ifdef::env-github,env-browser,env-vscode[:imagesprefix: ../images/]
-ifndef::env-github,env-browser,env-vscode[:imagesprefix: ]
-
-This section presents the following:
-
-* xref:glossary.adoc[Glossary]
-* xref:troubleshooting.adoc[Troubleshooting]
-** xref:troubleshooting-tips.adoc[Troubleshooting tips]
-** xref:troubleshooting-scenarios.adoc[Troubleshooting scenarios]
-* xref:contributions.adoc[Contribution guidelines]
-* xref:release-notes.adoc[Release Notes]
diff --git a/modules/ROOT/pages/cassandra-data-migrator.adoc b/modules/ROOT/pages/cassandra-data-migrator.adoc
index 918b6579..6c5f27e6 100644
--- a/modules/ROOT/pages/cassandra-data-migrator.adoc
+++ b/modules/ROOT/pages/cassandra-data-migrator.adoc
@@ -1,49 +1,322 @@
 = {cass-migrator}
-:page-aliases: cdm-parameters.adoc
+:page-aliases: cdm-parameters.adoc, ROOT:cdm-steps.adoc
-Use {cass-migrator} to migrate and validate tables between origin and target {cass-short} clusters, with available logging and reconciliation support.
+//This page was an exact duplicate of cdm-overview.adoc and the (now deleted) cdm-steps.adoc, they are just in different parts of the nav.
-[[cdm-prerequisites]]
-== {cass-migrator} prerequisites
+// tag::body[]
+You can use {cass-migrator} ({cass-migrator-short}) to migrate and validate tables between the origin and target {cass-short} clusters, with optional logging and reconciliation support.
-include::partial$cdm-prerequisites.adoc[]
+{cass-migrator-short} facilitates data transfer by creating multiple jobs that access the {cass-short} cluster concurrently, making it an ideal choice for migrating large datasets.
+It offers extensive configuration options, including logging, reconciliation, performance optimization, and more.
-[[cdm-install-as-container]]
-== Install {cass-migrator} as a Container
+//TODO: Bring over content from the page that introduces the 3 options, and the features, limitations, and performance recommendations in the README https://github.com/datastax/cassandra-data-migrator?tab=readme-ov-file#features
-include::partial$cdm-install-as-container.adoc[]
+== Install {cass-migrator}
-[[cdm-install-as-jar]]
-== Install {cass-migrator} as a JAR file
+{company} recommends that you always install the latest version of {cass-migrator-short} to get the latest features, dependencies, and bug fixes.
-include::partial$cdm-install-as-jar.adoc[]
+[tabs]
+======
+Install as a container::
++
+--
+Get the latest `cassandra-data-migrator` image that includes all dependencies from https://hub.docker.com/r/datastax/cassandra-data-migrator[Docker Hub].
-[[cdm-build-jar-local]]
-== Build {cass-migrator} JAR for local development (optional)
+The container's `assets` directory includes all required migration tools: `cassandra-data-migrator`, `dsbulk`, and `cqlsh`.
+--
-include::partial$cdm-build-jar-local.adoc[]
+Install as a JAR file::
++
+--
+. Install Java 11 or later, which is required by the Spark binaries.
-[[cdm-steps]]
-== Use {cass-migrator}
+. Install https://spark.apache.org/downloads.html[Apache Spark(TM)] version 3.5.x with Scala 2.13 and Hadoop 3.3 or later.
++
+[tabs]
+====
+Single VM::
++
+For one-off migrations, you can install the Spark binary on a single VM where you will run the {cass-migrator-short} job.
++
+. Get the Spark tarball from the Apache Spark archive.
++
+[source,bash,subs="+quotes"]
+----
+wget https://archive.apache.org/dist/spark/spark-3.5.**PATCH**/spark-3.5.**PATCH**-bin-hadoop3-scala2.13.tgz
+----
++
+Replace `**PATCH**` with your Spark patch version.
++
+. Change to the directory where you want to install Spark, and then extract the tarball:
++
+[source,bash,subs="+quotes"]
+----
+tar -xvzf spark-3.5.**PATCH**-bin-hadoop3-scala2.13.tgz
+----
++
+Replace `**PATCH**` with your Spark patch version.
-include::partial$use-cdm-migrator.adoc[]
+Spark cluster::
++
+For large (several terabytes) migrations, complex migrations, and use of {cass-migrator-short} as a long-term data transfer utility, {company} recommends that you use a Spark cluster or Spark Serverless platform.
++
+If you deploy {cass-migrator-short} on a Spark cluster, you must modify your `spark-submit` commands as follows:
++
+* Replace `--master "local[*]"` with the host and port for your Spark cluster, as in `--master "spark://**MASTER_HOST**:**PORT**"`.
+* Remove parameters related to single-VM installations, such as `--driver-memory` and `--executor-memory`.
+====
-[[cdm-validation-steps]]
-== Use {cass-migrator} steps in validation mode
+. Download the latest `cassandra-data-migrator` JAR file image:https://img.shields.io/github/v/release/datastax/cassandra-data-migrator?label=GitHub[alt="Latest cassandra-data-migrator release on GitHub",link="https://github.com/datastax/cassandra-data-migrator/packages"] from the https://github.com/datastax/cassandra-data-migrator[{cass-migrator-short} repository].
-include::partial$cdm-validation-steps.adoc[]
+. Add the `cassandra-data-migrator` dependency to `pom.xml`:
++
+[source,xml,subs="+quotes"]
+----
+<dependency>
+  <groupId>datastax.cdm</groupId>
+  <artifactId>cassandra-data-migrator</artifactId>
+  <version>**VERSION**</version>
+</dependency>
+----
++
+Replace `**VERSION**` with your {cass-migrator-short} version.
-[[cdm-partition-ranges]] -== Migrate or validate specific partition ranges +. Run `mvn install`. -include::partial$cdm-partition-ranges.adoc[] +If you need to build the JAR for local development or your environment only has Scala version 2.12.x, see the alternative installation instructions in the https://github.com/datastax/cassandra-data-migrator?tab=readme-ov-file[{cass-migrator-short} README]. +-- +====== -[[cdm-guardrail-checks]] -== Perform large-field guardrail violation checks +== Configure {cass-migrator-short} -include::partial$cdm-guardrail-checks.adoc[] +. Create a `cdm.properties` file. ++ +If you use a different name, make sure you specify the correct filename in your `spark-submit` commands. -[[cdm-next-steps]] -== Next steps +. Configure the properties for your environment. ++ +In the {cass-migrator-short} repository, you can find a https://github.com/datastax/cassandra-data-migrator/blob/main/src/resources/cdm.properties[sample properties file with default values], as well as a https://github.com/datastax/cassandra-data-migrator/blob/main/src/resources/cdm-detailed.properties[fully annotated properties file]. ++ +{cass-migrator-short} jobs process all uncommented parameters. +Any parameters that are commented out are ignored or use default values. ++ +If you want to reuse a properties file created for a previous {cass-migrator-short} version, make sure it is compatible with the version you are currently using. +Check the https://github.com/datastax/cassandra-data-migrator/releases[{cass-migrator-short} release notes] for possible breaking changes in interim releases. +For example, the 4.x series of {cass-migrator-short} isn't backwards compatible with earlier properties files. -For advanced operations, see documentation at https://github.com/datastax/cassandra-data-migrator[the repository]. +. Store your properties file where it can be accessed while running {cass-migrator-short} jobs using `spark-submit`. + +[#migrate] +== Run a {cass-migrator-short} data migration job + +The following `spark-submit` command migrates one table from the origin to the target cluster, using the configuration in your properties file. +The migration job is specified in the `--class` argument. + +[tabs] +====== +Local installation:: ++ +-- +[source,bash,subs="+quotes,+attributes"] +---- +./spark-submit --properties-file cdm.properties \ +--conf spark.cdm.schema.origin.keyspaceTable="**KEYSPACE_NAME**.**TABLE_NAME**" \ +--master "local[{asterisk}]" --driver-memory 25G --executor-memory 25G \ +--class com.datastax.cdm.job.Migrate cassandra-data-migrator-**VERSION**.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt +---- + +Replace or modify the following, if needed: + +* `--properties-file cdm.properties`: If your properties file has a different name, specify the actual name of your properties file. ++ +Depending on where your properties file is stored, you might need to specify the full or relative file path. + +* `**KEYSPACE_NAME**.**TABLE_NAME**`: Specify the name of the table that you want to migrate and the keyspace that it belongs to. ++ +You can also set `spark.cdm.schema.origin.keyspaceTable` in your properties file using the same format of `**KEYSPACE_NAME**.**TABLE_NAME**`. + +* `--driver-memory` and `--executor-memory`: For local installations, specify the appropriate memory settings for your environment. + +* `**VERSION**`: Specify the full {cass-migrator-short} version that you installed, such as `5.2.1`. 
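+
+A hypothetical, fully substituted version of this command, assuming a keyspace named `test_keyspace`, a table named `test_table`, and {cass-migrator-short} 5.2.1, might look like the following sketch.
+Adjust the names, memory settings, and version to match your environment.
+
+[source,bash]
+----
+# Example only: substitute your own keyspace, table, memory settings, and CDM version
+./spark-submit --properties-file cdm.properties \
+--conf spark.cdm.schema.origin.keyspaceTable="test_keyspace.test_table" \
+--master "local[*]" --driver-memory 25G --executor-memory 25G \
+--class com.datastax.cdm.job.Migrate cassandra-data-migrator-5.2.1.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt
+----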
+--
+
+Spark cluster::
++
+--
+[source,bash,subs="+quotes"]
+----
+./spark-submit --properties-file cdm.properties \
+--conf spark.cdm.schema.origin.keyspaceTable="**KEYSPACE_NAME**.**TABLE_NAME**" \
+--master "spark://**MASTER_HOST**:**PORT**" \
+--class com.datastax.cdm.job.Migrate cassandra-data-migrator-**VERSION**.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt
+----
+
+Replace or modify the following, if needed:
+
+* `--properties-file cdm.properties`: If your properties file has a different name, specify the actual name of your properties file.
++
+Depending on where your properties file is stored, you might need to specify the full or relative file path.
+
+* `**KEYSPACE_NAME**.**TABLE_NAME**`: Specify the name of the table that you want to migrate and the keyspace that it belongs to.
++
+You can also set `spark.cdm.schema.origin.keyspaceTable` in your properties file using the same format of `**KEYSPACE_NAME**.**TABLE_NAME**`.
+
+* `--master`: Provide the URL of your Spark cluster.
+
+* `**VERSION**`: Specify the full {cass-migrator-short} version that you installed, such as `5.2.1`.
+--
+======
+
+This command generates a log file (`logfile_name_**TIMESTAMP**.txt`) instead of logging output to the console.
+
+For additional modifications to this command, see <<advanced>>.
+
+[#cdm-validation-steps]
+== Run a {cass-migrator-short} data validation job
+
+After you migrate data, you can use {cass-migrator-short}'s data validation mode to find inconsistencies between the origin and target tables.
+
+. Use the following `spark-submit` command to run a data validation job using the configuration in your properties file.
+The data validation job is specified in the `--class` argument.
++
+[tabs]
+======
+Local installation::
++
+--
+[source,bash,subs="+quotes,+attributes"]
+----
+./spark-submit --properties-file cdm.properties \
+--conf spark.cdm.schema.origin.keyspaceTable="**KEYSPACE_NAME**.**TABLE_NAME**" \
+--master "local[{asterisk}]" --driver-memory 25G --executor-memory 25G \
+--class com.datastax.cdm.job.DiffData cassandra-data-migrator-**VERSION**.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt
+----
+
+Replace or modify the following, if needed:
+
+* `--properties-file cdm.properties`: If your properties file has a different name, specify the actual name of your properties file.
++
+Depending on where your properties file is stored, you might need to specify the full or relative file path.
+
+* `**KEYSPACE_NAME**.**TABLE_NAME**`: Specify the name of the table that you want to validate and the keyspace that it belongs to.
++
+You can also set `spark.cdm.schema.origin.keyspaceTable` in your properties file using the same format of `**KEYSPACE_NAME**.**TABLE_NAME**`.
+
+* `--driver-memory` and `--executor-memory`: For local installations, specify the appropriate memory settings for your environment.
+
+* `**VERSION**`: Specify the full {cass-migrator-short} version that you installed, such as `5.2.1`.
+--
+
+Spark cluster::
++
+--
+[source,bash,subs="+quotes"]
+----
+./spark-submit --properties-file cdm.properties \
+--conf spark.cdm.schema.origin.keyspaceTable="**KEYSPACE_NAME**.**TABLE_NAME**" \
+--master "spark://**MASTER_HOST**:**PORT**" \
+--class com.datastax.cdm.job.DiffData cassandra-data-migrator-**VERSION**.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt
+----
+
+Replace or modify the following, if needed:
+
+* `--properties-file cdm.properties`: If your properties file has a different name, specify the actual name of your properties file.
++ +Depending on where your properties file is stored, you might need to specify the full or relative file path. + +* `**KEYSPACE_NAME**.**TABLE_NAME**`: Specify the name of the table that you want to validate and the keyspace that it belongs to. ++ +You can also set `spark.cdm.schema.origin.keyspaceTable` in your properties file using the same format of `**KEYSPACE_NAME**.**TABLE_NAME**`. + +* `--master`: Provide the URL of your Spark cluster. + +* `**VERSION**`: Specify the full {cass-migrator-short} version that you installed, such as `5.2.1`. +-- +====== + +. Allow the command some time to run, and then open the log file (`logfile_name_**TIMESTAMP**.txt`) and look for `ERROR` entries. ++ +The {cass-migrator-short} validation job records differences as `ERROR` entries in the log file, listed by primary key values. +For example: ++ +[source,plaintext] +---- +23/04/06 08:43:06 ERROR DiffJobSession: Mismatch row found for key: [key3] Mismatch: Target Index: 1 Origin: valueC Target: value999) +23/04/06 08:43:06 ERROR DiffJobSession: Corrected mismatch row in target: [key3] +23/04/06 08:43:06 ERROR DiffJobSession: Missing target row found for key: [key2] +23/04/06 08:43:06 ERROR DiffJobSession: Inserted missing row in target: [key2] +---- ++ +When validating large datasets or multiple tables, you might want to extract the complete list of missing or mismatched records. +There are many ways to do this. +For example, you can grep for all `ERROR` entries in your {cass-migrator-short} log files or use the `log4j2` example provided in the https://github.com/datastax/cassandra-data-migrator?tab=readme-ov-file#steps-for-data-validation[{cass-migrator-short} repository]. + +=== Run a validation job in AutoCorrect mode + +Optionally, you can run {cass-migrator-short} validation jobs in **AutoCorrect** mode, which offers the following functions: + +* `autocorrect.missing`: Add any missing records in the target with the value from the origin. + +* `autocorrect.mismatch`: Reconcile any mismatched records between the origin and target by replacing the target value with the origin value. ++ +[IMPORTANT] +==== +`TIMESTAMP` has an effect on this function. + +If the `WRITETIME` of the origin record (determined with `.writetime.names`) is earlier than the `WRITETIME` of the target record, then the change doesn't appear in the target cluster. +This comparative state can be challenging to troubleshoot if individual columns or cells were modified in the target cluster. +==== + +* `autocorrect.missing.counter`: By default, counter tables are not copied when missing, unless explicitly set. + +In your `cdm.properties` file, use the following properties to enable (`true`) or disable (`false`) autocorrect functions: + +[source,properties] +---- +spark.cdm.autocorrect.missing false|true +spark.cdm.autocorrect.mismatch false|true +spark.cdm.autocorrect.missing.counter false|true +---- + +The {cass-migrator-short} validation job never deletes records from either the origin or target. +Data validation only inserts or updates data on the target. + +For an initial data validation, consider disabling AutoCorrect so that you can generate a list of data discrepancies, investigate those discrepancies, and then decide whether you want to rerun the validation with AutoCorrect enabled. + +[#advanced] +== Additional {cass-migrator-short} options + +You can modify your properties file or append additional `--conf` arguments to your `spark-submit` commands to customize your {cass-migrator-short} jobs. 
+For example, you can do the following: + +* Check for large field guardrail violations before migrating. +* Use the `partition.min` and `partition.max` parameters to migrate or validate specific token ranges. +* Use the `track-run` feature to monitor progress and rerun a failed migration or validation job from point of failure. + +For all options, see the https://github.com/datastax/cassandra-data-migrator[{cass-migrator-short} repository]. +Specifically, see the https://github.com/datastax/cassandra-data-migrator/blob/main/src/resources/cdm-detailed.properties[fully annotated properties file]. + +== Troubleshoot {cass-migrator-short} + +.Java NoSuchMethodError +[%collapsible] +==== +If you installed Spark as a JAR file, and your Spark and Scala versions aren't compatible with your installed version of {cass-migrator-short}, {cass-migrator-short} jobs can throw exceptions such a the following: + +[source,console] +---- +Exception in thread "main" java.lang.NoSuchMethodError: 'void scala.runtime.Statics.releaseFence()' +---- + +Make sure that your Spark binary is compatible with your {cass-migrator-short} version. +If you installed an earlier version of {cass-migrator-short}, you might need to install an earlier Spark binary. +==== + +.Rerun a failed or partially completed job +[%collapsible] +==== +You can use the `track-run` feature to track the progress of a migration or validation, and then, if necessary, use the `run-id` to rerun a failed job from the last successful migration or validation point. + +For more information, see the https://github.com/datastax/cassandra-data-migrator[{cass-migrator-short} repository] and the https://github.com/datastax/cassandra-data-migrator/blob/main/src/resources/cdm-detailed.properties[fully annotated properties file]. +==== +// end::body[] \ No newline at end of file diff --git a/modules/ROOT/pages/cdm-overview.adoc b/modules/ROOT/pages/cdm-overview.adoc index 11a4d344..ab33c66d 100644 --- a/modules/ROOT/pages/cdm-overview.adoc +++ b/modules/ROOT/pages/cdm-overview.adoc @@ -1,25 +1,3 @@ = {cass-migrator} ({cass-migrator-short}) overview -{cass-migrator} ({cass-migrator-short}) is a tool designed for migrating and validating data between origin and target {cass-reg}-compatible clusters. It facilitates the transfer of data, creating multiple jobs at once that can access the {cass-short} cluster concurrently. This tool is also useful when dealing with large datasets and requires careful configuration to balance performance impact and migration speed. - -The information below explains how to get started with {cass-migrator-short}. Review your prerequisites and decide between the two installation options: as a container or as a JAR file. - -[[cdm-prerequisites]] -== {cass-migrator} prerequisites - -include::partial$cdm-prerequisites.adoc[] - -== {cass-migrator} installation methods - -Both installation methods require attention to version compatibility, especially with the `cdm.properties` files. -Both environments also use `spark-submit` to run the jobs. 
- -[[cdm-install-as-container]] -=== Install {cass-migrator} as a Container - -include::partial$cdm-install-as-container.adoc[] - -[[cdm-install-as-jar]] -=== Install {cass-migrator} as a JAR file - -include::partial$cdm-install-as-jar.adoc[] +include::ROOT:cassandra-data-migrator.adoc[tags=body] \ No newline at end of file diff --git a/modules/ROOT/pages/cdm-steps.adoc b/modules/ROOT/pages/cdm-steps.adoc deleted file mode 100644 index 940615ac..00000000 --- a/modules/ROOT/pages/cdm-steps.adoc +++ /dev/null @@ -1,23 +0,0 @@ -= {cass-migrator} - -Use {cass-migrator} to migrate and validate tables between the origin and target {cass-short} clusters, with available logging and reconciliation support. - -[[cdm-steps]] -== Use {cass-migrator} - -include::partial$use-cdm-migrator.adoc[] - -[[cdm-validation-steps]] -== Use {cass-migrator} steps in validation mode - -include::partial$cdm-validation-steps.adoc[] - -[[cdm--partition-ranges]] -== Migrate or validate specific partition ranges - -include::partial$cdm-partition-ranges.adoc[] - -[[cdm-guardrail-checks]] -== Perform large-field guardrail violation checks - -include::partial$cdm-guardrail-checks.adoc[] diff --git a/modules/ROOT/pages/change-read-routing.adoc b/modules/ROOT/pages/change-read-routing.adoc index 4dbd73ab..a6309f1a 100644 --- a/modules/ROOT/pages/change-read-routing.adoc +++ b/modules/ROOT/pages/change-read-routing.adoc @@ -1,26 +1,22 @@ -= Phase 4: Change read routing to Target += Phase 4: Route reads to the target :page-tag: migration,zdm,zero-downtime,zdm-proxy,read-routing -ifdef::env-github,env-browser,env-vscode[:imagesprefix: ../images/] -ifndef::env-github,env-browser,env-vscode[:imagesprefix: ] -This topic explains how you can configure the {product-proxy} to route all reads to Target instead of Origin. +This topic explains how you can configure the {product-proxy} to route all reads to the target cluster instead of the origin cluster. -//include::partial$lightbox-tip.adoc[] - -image::{imagesprefix}migration-phase4ra9.png["Phase 4 diagram shows read routing on {product-proxy} was switched to Target."] +image::migration-phase4ra9.png["Phase 4 diagram shows read routing on {product-proxy} was switched to the target."] For illustrations of all the migration phases, see the xref:introduction.adoc#_migration_phases[Introduction]. == Steps -You would typically perform these steps once you have migrated all the existing data from Origin, and completed all validation checks and reconciliation if necessary. +You would typically perform these steps once you have migrated all the existing data from the origin cluster, and completed all validation checks and reconciliation if necessary. This operation is a configuration change that can be carried out as explained xref:manage-proxy-instances.adoc#change-mutable-config-variable[here]. [TIP] ==== -If you performed the optional steps described in the prior topic, xref:enable-async-dual-reads.adoc[] -- to verify that your Target cluster was ready and tuned appropriately to handle the production read load -- be sure to disable async dual reads when you're done testing. -If you haven't already, revert `read_mode` in `vars/zdm_proxy_core_config.yml` to `PRIMARY_ONLY` when switching sync reads to Target. +If you performed the optional steps described in the prior topic, xref:enable-async-dual-reads.adoc[] -- to verify that your target cluster was ready and tuned appropriately to handle the production read load -- be sure to disable async dual reads when you're done testing. 
+If you haven't already, revert `read_mode` in `vars/zdm_proxy_core_config.yml` to `PRIMARY_ONLY` when switching sync reads to the target cluster. Example: [source,yml] @@ -28,7 +24,7 @@ Example: read_mode: PRIMARY_ONLY ---- -Otherwise, if you don't disable async dual reads, {product-proxy} instances would continue to send async reads to Origin, which, although harmless, is unnecessary. +If you don't disable async dual reads, {product-proxy} instances continue to send async reads to the origin, which, although harmless, is unnecessary. ==== == Changing the read routing configuration @@ -64,40 +60,42 @@ ansible-playbook rolling_update_zdm_proxy.yml -i zdm_ansible_inventory ---- Wait for the {product-proxy} instances to be restarted by Ansible, one by one. -All instances will now send all reads to Target instead of Origin. -In other words, Target is now the primary cluster, but the {product-proxy} is still keeping Origin up-to-date via dual writes. +All instances will now send all reads to the target cluster instead of the origin cluster. + +At this point, the target cluster becomes the primary cluster, but the {product-proxy} still keeps the origin cluster up-to-date through dual writes. == Verifying the read routing change -Once the read routing configuration change has been rolled out, you may want to verify that reads are correctly sent to Target as expected. +Once the read routing configuration change has been rolled out, you may want to verify that reads are correctly sent to the target cluster, as expected. This is not a required step, but you may wish to do it for peace of mind. [TIP] ==== Issuing a `DESCRIBE` or a read to any system table through the {product-proxy} is *not* a valid verification. -The {product-proxy} handles reads to system tables differently, by intercepting them and always routing them to Origin, in some cases partly populating them at proxy level. +The {product-proxy} handles reads to system tables differently, by intercepting them and always routing them to the origin, in some cases partly populating them at proxy level. -This means that system reads are *not representative* of how the {product-proxy} routes regular user reads: even after you switched the configuration to read from Target as the primary cluster, all system reads will still go to Origin. +This means that system reads are *not representative* of how the {product-proxy} routes regular user reads. +Even after you switched the configuration to read the target cluster as the primary cluster, all system reads still go to the origin. Although `DESCRIBE` requests are not system requests, they are also generally resolved in a different way to regular requests, and should not be used as a means to verify the read routing behavior. - ==== Verifying that the correct routing is taking place is a slightly cumbersome operation, due to the fact that the purpose of the {product-short} process is to align the clusters and therefore, by definition, the data will be identical on both sides. For this reason, the only way to do a manual verification test is to force a discrepancy of some test data between the clusters. To do this, you could consider using the xref:connect-clients-to-proxy.adoc#_themis_client[Themis sample client application]. -This client application connects directly to Origin, Target and the {product-proxy}, inserts some test data in its own table and allows you to view the results of reads from each source. -Please refer to its README for more information. 
+This client application connects directly to the origin cluster, the target cluster, and the {product-proxy}. +It inserts some test data in its own table, and then you can view the results of reads from each source. +Refer to the Themis README for more information. Alternatively, you could follow this manual procedure: * Create a small test table on both clusters, for example a simple key/value table (it could be in an existing keyspace, or in one that you create specifically for this test). For example `CREATE TABLE test_keyspace.test_table(k TEXT PRIMARY KEY, v TEXT);`. -* Use `cqlsh` to connect *directly to Origin*. -Insert a row with any key, and with a value specific to Origin, for example `INSERT INTO test_keyspace.test_table(k, v) VALUES ('1', 'Hello from Origin!');`. -* Now, use `cqlsh` to connect *directly to Target*. -Insert a row with the same key as above, but with a value specific to Target, for example `INSERT INTO test_keyspace.test_table(k, v) VALUES ('1', 'Hello from Target!');`. -* Now, use `cqlsh` to connect to the {product-proxy} (see xref:connect-clients-to-proxy.adoc#_connecting_cqlsh_to_the_zdm_proxy[here] for how to do this) and issue a read request for this test table: `SELECT * FROM test_keyspace.test_table WHERE k = '1';`. +* Use `cqlsh` to connect *directly to the origin cluster*. +Insert a row with any key, and with a value specific to the origin cluster, for example `INSERT INTO test_keyspace.test_table(k, v) VALUES ('1', 'Hello from the origin cluster!');`. +* Now, use `cqlsh` to connect *directly to the target cluster*. +Insert a row with the same key as above, but with a value specific to the target cluster, for example `INSERT INTO test_keyspace.test_table(k, v) VALUES ('1', 'Hello from the target cluster!');`. +* Now, use `cqlsh` to xref:connect-clients-to-proxy.adoc#_connecting_cqlsh_to_the_zdm_proxy[connect to the {product-proxy}], and then issue a read request for this test table: `SELECT * FROM test_keyspace.test_table WHERE k = '1';`. The result will clearly show you where the read actually comes from. diff --git a/modules/ROOT/pages/components.adoc b/modules/ROOT/pages/components.adoc index a2acaa17..1350062d 100644 --- a/modules/ROOT/pages/components.adoc +++ b/modules/ROOT/pages/components.adoc @@ -1,7 +1,5 @@ = Components :page-tag: migration,zdm,zero-downtime,zdm-proxy,components -ifdef::env-github,env-browser,env-vscode[:imagesprefix: ../images/] -ifndef::env-github,env-browser,env-vscode[:imagesprefix: ] The main component of the {company} {product} product suite is **{product-proxy}**, which by design is a simple and lightweight proxy that handles all the real-time requests generated by your client applications. @@ -16,15 +14,16 @@ The {product-proxy} itself doesn't have any capability to migrate data or knowle == Role of {product-proxy} -We created {product-proxy} to function between the application and both databases (Origin and Target). +{company} created {product-proxy} to function between the application and both the origin and target databases. The databases can be any CQL-compatible data store, such as {cass-reg}, {dse}, and {astra-db}. The proxy always sends every write operation (Insert, Update, Delete) synchronously to both clusters at the desired Consistency Level: * If the write is successful in both clusters, it returns a successful acknowledgement to the client application. 
* If the write fails on either cluster, the failure is passed back to the client application so that it can retry it as appropriate, based on its own retry policy. -This design ensures that new data is always written to both clusters, and that any failure on either cluster is always made visible to the client application. -{product-proxy} also sends all reads to the primary cluster (initially Origin, and later Target) and returns the result to the client application. +This design ensures that new data is always written to both clusters, and that any failure on either cluster is always made visible to the client application. +{product-proxy} also sends all reads to the primary cluster, and then returns the result to the client application. +The primary cluster is initially the origin cluster, and you change it to the target cluster at the end of the migration process. {product-proxy} is designed to be highly available. It can be scaled horizontally, so typical deployments are made up of a minimum of 3 servers. {product-proxy} can be restarted in a rolling fashion, for example, to change configuration for different phases of the migration. @@ -37,32 +36,32 @@ Unless it is for a demo or local testing environment, a {product-proxy} deployme The term {product-proxy} indicates the whole deployment, and {product-proxy} instance refers to an individual proxy process in the deployment. ==== -== Key features of {product-proxy} +=== Key features of {product-proxy} -* Allows you to lift-and-shift existing application code from **Origin** to **Target** with a simple change of a connection string. +* Allows you to lift-and-shift existing application code from your origin cluster to your target cluster by changing only the connection string. -* Reduces risks to upgrades and migrations by decoupling Origin from Target, and allowing there to be an explicit cut-over point once you're satisfied with Target. +* Reduces risks to upgrades and migrations by decoupling the origin cluster from the target cluster, and allowing you to determine an explicit cut-over point once you're ready to commit to using the target cluster permanently. * Bifurcates writes synchronously to both clusters during the migration process. * Returns (for read operations) the response from the primary cluster, which is its designated source of truth. -During a migration, Origin is typically the primary cluster. -Near the end of the migration, you'll shift the primary cluster to be Target. +During a migration, the primary cluster is typically the origin cluster. +Near the end of the migration, you shift the primary cluster to be the target cluster. -* Can be configured to also read asynchronously from Target. -This capability is called **Asynchronous Dual Reads** (also known as **Read Mirroring**) and allows you to observe what read latencies and throughput Target can achieve under the actual production load. -** Results from the asynchronous reads executed on Target are not sent back to the client application. -** This design implies that failure on asynchronous reads from Target does not cause an error on the client application. +* Can be configured to also read asynchronously from the target cluster. +This capability is called **Asynchronous Dual Reads** (also known as **Read Mirroring**), and it allows you to observe what read latencies and throughput the target cluster can achieve under the actual production load. +** Results from the asynchronous reads executed on the target cluster are not sent back to the client application. 
+** This design implies that a failure on asynchronous reads from the target cluster does not cause an error on the client application. ** Asynchronous dual reads can be enabled and disabled dynamically with a rolling restart of the {product-proxy} instances. [NOTE] ==== -When using Asynchronous Dual Reads, any additional read load on Target may impact its ability to keep up with writes. +When using Asynchronous Dual Reads, any additional read load on the target cluster may impact its ability to keep up with writes. This behavior is expected and desired. -The idea is to mimic the full read and write load on Target so there are no surprises during the last migration phase; that is, after cutting over completely to Target. +The idea is to mimic the full read and write load on the target cluster so there are no surprises during the last migration phase; that is, after cutting over completely to the target cluster. ==== -== {product-utility} and {product-automation} +=== {product-utility} and {product-automation} https://www.ansible.com/[Ansible] is a suite of software tools that enables infrastructure as code. It is open source and its capabilities include software provisioning, configuration management, and application deployment functionality. @@ -92,24 +91,24 @@ Other technologies such as Apache Spark(TM) can be used to write your own custom [TIP] ==== -An important **prerequisite** to use {cass-migrator} is that you already have the matching schema on Target. +To use {cass-migrator}, the schema on your origin and target clusters must match. ==== Use {cass-migrator} to: -* Migrate your data from any CQL-supported Origin to any CQL-supported Target. +* Migrate your data from any CQL-supported origin cluster to any CQL-supported target cluster. Examples of databases that support CQL are {cass-reg}, {dse}, and {astra-db}. * Validate migration accuracy and performance using examples that provide a smaller, randomized data set. * Preserve internal `writetime` timestamps and Time To Live (TTL) values. * Take advantage of advanced data types (Sets, Lists, Maps, UDTs). -* Filter records from the Origin data, using {cass-short}'s internal `writetime` timestamp. +* Filter records from the origin cluster's data, using {cass-short}'s internal `writetime` timestamp. * Use SSL Support, including custom cipher algorithms. {cass-migrator} is designed to: -* Connect to and compare your Target database with Origin. +* Connect to and compare your target database/cluster with the origin database/cluster. * Report differences in a detailed log file. -* Optionally reconcile any missing records and fix any data inconsistencies in Target by enabling `autocorrect` in a config file. +* Optionally reconcile any missing records and fix any data inconsistencies in the target cluster by enabling `autocorrect` in a config file. === {dsbulk-migrator} diff --git a/modules/ROOT/pages/connect-clients-to-proxy.adoc b/modules/ROOT/pages/connect-clients-to-proxy.adoc index d3c1de61..1d1e3a92 100644 --- a/modules/ROOT/pages/connect-clients-to-proxy.adoc +++ b/modules/ROOT/pages/connect-clients-to-proxy.adoc @@ -1,8 +1,6 @@ = Connect your client applications to {product-proxy} :navtitle: Connect client applications to {product-proxy} :page-tag: migration,zdm,zero-downtime,zdm-proxy,connect-apps -ifdef::env-github,env-browser,env-vscode[:imagesprefix: ../images/] -ifndef::env-github,env-browser,env-vscode[:imagesprefix: ] The {product-proxy} is designed to be similar to a conventional {cass-reg} cluster. 
You communicate with it using the CQL query language used in your existing client applications. @@ -13,7 +11,7 @@ On this page, we explain how to connect your client applications to a {cass-shor We then move on to discuss how this process changes when connecting to a {product-proxy}. We conclude by describing two sample client applications that serve as real-world examples of how to build a client application that works effectively with {product-proxy}. -You can use the provided sample client applications, in addition to your own, as a quick way to validate that the deployed {product-proxy} is reading and writing data from the expected Origin and Target clusters. +You can use the provided sample client applications, in addition to your own, as a quick way to validate that the deployed {product-proxy} is reading and writing data from the expected origin and target clusters. Finally, we will explain how to connect the `cqlsh` command-line client to the {product-proxy}. @@ -103,19 +101,19 @@ This is disabled by default in all drivers, but if it was enabled in your client The credentials provided by the client application are used when forwarding its requests. However, the client application has no notion that there are two clusters involved: from its point of view, it talks to just one cluster as usual. -For this reason, the {product-proxy} will only use the client application credentials when forwarding requests to one cluster (typically Target), and it will resort to using the credentials in its own configuration to forward requests to the other cluster (typically Origin). +For this reason, the {product-proxy} will only use the client application credentials when forwarding requests to one cluster (typically the target), and it will resort to using the credentials in its own configuration to forward requests to the other cluster (typically the origin). -This means that, if your {product-proxy} is configured with an Origin or Target cluster with **user authentication enabled**, your client application has to provide credentials when connecting to the proxy: +This means that, if your {product-proxy} is configured with an origin or target cluster with **user authentication enabled**, your client application has to provide credentials when connecting to the proxy: -* If both clusters require authentication, your client application must pass the credentials for Target. -This is also the case if only Target requires authentication but Origin does not. -* If Origin requires authentication but Target does not, your client application must supply credentials for Origin. +* If both clusters require authentication, your client application must pass the credentials for the target. +This is also the case if authentication is required by the target only, but not the origin. +* If the origin requires authentication but the target does not, then your client application must supply credentials for the origin. * If neither cluster requires authentication, no credentials are needed. [cols="1,1,1"] |=== -|Auth enabled on Origin -|Auth enabled on Target +|Auth enabled on the origin +|Auth enabled on the target |Client application credentials |Yes @@ -141,14 +139,13 @@ image::zdm-proxy-credential-usage.png[{product-proxy} credentials usage, 550] === A note on the Secure Connect Bundle -If your {product-proxy} is configured to use {astra-db} as an Origin or Target, your client application **does not need** to provide a Secure Connect Bundle (SCB) when connecting to the proxy. 
+If your {product-proxy} is configured to use {astra-db} as the origin or target cluster, then your client application **does not need** to provide a Secure Connect Bundle (SCB) when connecting to the proxy. It will, however, have to supply an {astra-db} application token's client ID and client secret as a username and password (respectively). - == Sample client applications The documentation for the {company} drivers provides information about how to connect these drivers to your {cass-short} cluster or {product-proxy} and how to use them to issue queries, update data and perform other actions. -In addition to the smaller code samples provided in the documentation, we also provide a few sample client applications which demonstrate the use of the {company} Java driver to interact with {product-proxy} as well as Origin and Target for that proxy. +In addition to the smaller code samples provided in the documentation, we also provide a few sample client applications which demonstrate the use of the {company} Java driver to interact with {product-proxy} as well as the origin and target for that proxy. === {product-demo} @@ -162,9 +159,9 @@ You can find the details of building and running {product-demo} in the https://g https://github.com/absurdfarce/themis[Themis] is a Java command-line client application that allows you to insert randomly-generated data into some combination of these three sources: -* Directly into Origin -* Directly into Target -* Into the {product-proxy}, and subsequently on to Origin and Target +* Directly into the origin +* Directly into the target +* Into the {product-proxy}, and subsequently on to the origin and target The client application can then be used to query the inserted data. This allows you to validate that the {product-proxy} is reading and writing data from the expected sources. @@ -183,7 +180,7 @@ Using CQLSH to connect to a {product-proxy} instance is very easy: * Download CQLSH for free from https://downloads.datastax.com/#cqlsh[here] on a machine that has connectivity to the {product-proxy} instances: ** To connect to the {product-proxy}, any version is fine. -** The {astra-db}-compatible version additionally supports connecting directly to an {astra-db} cluster by passing the cluster's Secure Connect Bundle and valid credentials. +** The {astra}-compatible version additionally supports connecting directly to an {astra-db} cluster by passing the cluster's Secure Connect Bundle and valid credentials. * Install it by uncompressing the archive: `tar -xvf cqlsh-<...>.tar.gz`. * Navigate to the `cqlsh-<...>/bin` directory, for example `cd cqlsh-astra/bin`. 
* Launch CQLSH: diff --git a/modules/ROOT/pages/connect-clients-to-target.adoc b/modules/ROOT/pages/connect-clients-to-target.adoc index a64f5509..534c5557 100644 --- a/modules/ROOT/pages/connect-clients-to-target.adoc +++ b/modules/ROOT/pages/connect-clients-to-target.adoc @@ -1,8 +1,6 @@ -= Step 5: Connect your client applications directly to Target -:navtitle: Phase 5: Connect client applications directly to Target += Phase 5: Connect your client applications directly to the target +:navtitle: Phase 5: Connect client applications directly to the target :page-tag: migration,zdm,zero-downtime,zdm-proxy,connect-apps,target -ifdef::env-github,env-browser,env-vscode[:imagesprefix: ../images/] -ifndef::env-github,env-browser,env-vscode[:imagesprefix: ] At this point in our migration phases, we've completed: @@ -10,41 +8,51 @@ At this point in our migration phases, we've completed: * Phase 2: Migrated and validated our data with {cass-migrator} and/or {dsbulk-migrator}. -* Phase 3: Enabled async data reads (an optional step) to check that Target can handle the read/write traffic. +* Phase 3: Optionally enabled async data reads to check that the target cluster can handle the full production workload of read/write traffic. -* Phase 4: Changed read routing to Target. +* Phase 4: Changed read routing to the target cluster. -Now we're ready to perform Phase 5, in which we will configure our client applications to connect directly to Target. -The way you do this varies based on whether your Target is {astra-db}, {cass-reg}, or {dse} cluster. +In Phase 5 you will configure your client applications to connect directly to the target cluster. +How you do this depends on whether your target cluster is {astra-db}, {cass-reg}, or {dse-short}. -//include::partial$lightbox-tip.adoc[] - -image::{imagesprefix}migration-phase5ra.png[Phase 5 diagram shows apps no longer using proxy and instead connected directly to Target.] +image::migration-phase5ra.png[In Phase 5, your apps no longer using the proxy and, instead, connect directly to the target.] //For illustrations of all the migration phases, see the xref:introduction.adoc#_migration_phases[Introduction]. == Configuring your driver to connect to a generic CQL cluster -If your Target is a generic CQL cluster (such as {cass-short} or {dse-short}), you can connect your client application to it in a similar way as you previously connected it to Origin, but with the appropriate contact points and any additional configuration that your Target may require. +If your target cluster is a generic CQL cluster, such as {cass-short} or {dse-short}, then you can connect your client application to it in a similar way as you previously connected it to the origin cluster, but with the appropriate contact points and any additional configuration that your target cluster may require. For further information, please refer to the documentation of the driver language and version that you are using. == Configuring your driver to connect to {astra-db} -To connect to {astra-db}, you need: +//TODO: You can use an AstraCS token and the literal string `token` instead of clientID and client secret. -* The ClientID and Client Secret from an {astra-db} application token with *Organization Administrator* permissions for the organization to which your {astra-db} database belongs: -** Note: You will already have used these credentials when you configured the {product-proxy} to connect to your {astra-db} database as Target. 
-** For more information on creating credentials (tokens), see https://docs.datastax.com/en/astra/astra-db-vector/administration/manage-application-tokens.html[here]. -* The Secure Connect Bundle (SCB) for your {astra-db} database: -** This is a zip archive containing connection metadata and files to automatically enable Mutual TLS encryption between your client application and {astra-db}. -** There is one SCB for each {astra-db} database (or one for each region of an {astra-db} multi-region database). -** The SCB **does not contain** your DB credentials. +To connect to {astra-db}, you need: -include::partial$tip-scb.adoc[] +* The ClientID and Client Secret from an {astra-db} application token with *Organization Administrator* permissions for the organization to which your {astra-db} database belongs. ++ +You will already have used these credentials when you configured the {product-proxy} to connect to your {astra-db} database as the target cluster. +For more information on creating credentials (tokens), see xref:astra-db-serverless:administration:manage-application-tokens.adoc[]. + +* Your {astra-db} database's Secure Connect Bundle (SCB). ++ +The SCB is a zip file that contains TLS encryption certificates and other metadata required to connect to your database. +Databases can have one or more SCBs. +For more information, see xref:astra-db-serverless:drivers:secure-connect-bundle.adoc[]. ++ +[IMPORTANT] +==== +The SCB contains sensitive information that establishes a connection to your database, including key pairs and certificates. +Treat is as you would any other sensitive values, such as passwords or tokens. +==== -Make sure you choose a driver language and version that is compatible with {astra-db}. +* Recommended: A driver language and version that is compatible with {astra-db}. For more information, see xref:datastax-drivers:compatibility:driver-matrix.adoc[]. +If your client application uses an old version of a driver without built-in SCB support, {company} strongly recommends upgrading to a compatible driver to simplify configuration and get the latest features and bug fixes. +However, you can still connect to {astra-db} for this migration by using https://github.com/datastax/cql-proxy[CQL Proxy] or extracting the SCB archive and using the individual files to enable mTLS in your driver's configuration. + // The SCB support was made available beginning the following versions in the drivers: // // * https://docs.datastax.com/en/developer/cpp-driver/latest/changelog/#2-14-0[Beginning `2.14.0` of {company} C++ Driver]. @@ -59,18 +67,11 @@ For more information, see xref:datastax-drivers:compatibility:driver-matrix.adoc // // Based on this, follow the instructions in the relevant section below. -[TIP] -==== -You **do not need to upgrade your client application driver** to enable it to connect to {astra-db}. - -If your client application uses an old version of the driver without built-in SCB support, upgrading it would make the configuration easier, but it is not required and can be also done at a later time if desired. -==== - -=== Drivers with built-in support for the Secure Connect Bundle +If your driver has built-in support for the {astra-db} Secure Connect Bundle (SCB), the changes to enable your application to connect to {astra-db} are minimal. -The great news is that, if your driver has built-in support for the {astra-db} Secure Connect Bundle (SCB), the changes to enable your application to connect to {astra-db} are minimal. 
+//Recalling the xref:connect-clients-to-proxy.adoc#_connecting_company_drivers_to_cassandra[pseudocode to enable your client application to connect to the proxy], here it is how your code needs to change to connect directly to {astra-db}: -Recalling the xref:connect-clients-to-proxy.adoc#_connecting_company_drivers_to_cassandra[pseudocode] to enable your client application to connect to the proxy, here it is how it needs to change to connect directly to {astra-db}: +The following pseudocode provides guidance on how you might change your driver's code to connect directly to {astra-db}: [source] ---- @@ -97,7 +98,8 @@ print(release_version) ---- As noted before, this pseudocode is just a guideline to illustrate the changes that are needed. -For the specific syntax that applies to your driver, please refer to the documentation for your driver language and version: +For the specific syntax that applies to your driver, see the following documentation: +//TODO: Bring migration steps to this page instead of on the astra db pages where they don't seem to belong. * https://docs.datastax.com/en/astra-serverless/docs/connect/drivers/connect-cplusplus.html[C++ driver]. @@ -109,30 +111,9 @@ For the specific syntax that applies to your driver, please refer to the documen * https://docs.datastax.com/en/astra-serverless/docs/connect/drivers/connect-python.html[Python driver]. -That's it! Your client application is now able to connect directly to your {astra-db} database. -=== Drivers without support for the Secure Connect Bundle - -It is possible to configure older or community-contributed drivers to connect to {astra-db} even if they lack built-in SCB support. - -To do so, you will need to extract the files from the SCB and use them to enable Mutual TLS in the configuration of your driver. -Please see https://docs.datastax.com/en/astra-serverless/docs/connect/drivers/legacy-drivers.html[here] for detailed instructions for each driver. - -Alternatively, you could also consider using https://www.datastax.com/blog/easily-connect-apache-cassandra-workloads-to-datastaxs-serverless-dbaas-with-our-cql-proxy[CQL Proxy], which is an open-source lightweight proxy that abstracts away all {astra-db}-specific connection configuration from your client application. - -=== A word on the cloud-native drivers - -Now that your client application is running on {astra-db}, you can take advantage of many additional features and APIs that {astra-db} offers such as gRPC, GraphQL, Document REST APIs and many more. -To access these features, you may wish to consider moving to a cloud-native driver. -This can be done at any time, as part of the future development and evolution of your client application. - -Here are the cloud-native drivers currently available: - -* https://docs.datastax.com/en/astra-serverless/docs/connect/drivers/connect-java.html#_connecting_with_java_cloud_native_driver[Java cloud-native driver]. -* https://docs.datastax.com/en/astra-serverless/docs/connect/drivers/connect-nodejs.html#_connecting_with_node_js_cloud_native_driver[Node.js cloud-native driver]. - == Phase 5 of migration completed -Until this point, in case of any issues, you could have abandoned the migration and rolled back to connect directly to Origin at any time. -From this point onward, the clusters will diverge, and Target is the source of truth for your client applications and data. 
\ No newline at end of file +Until this point, in case of any issues, you could have abandoned the migration and rolled back to connect directly to the origin cluster at any time. +From this point onward, the clusters will diverge, and the target cluster becomes the source of truth for your client applications and data. \ No newline at end of file diff --git a/modules/ROOT/pages/contributions.adoc b/modules/ROOT/pages/contributions.adoc index 70a7209e..14e76d96 100644 --- a/modules/ROOT/pages/contributions.adoc +++ b/modules/ROOT/pages/contributions.adoc @@ -1,8 +1,7 @@ = Contribution guidelines :page-tag: migration,zdm,zero-downtime,zdm-proxy,contributions -ifdef::env-github,env-browser,env-vscode[:imagesprefix: ../images/] -ifndef::env-github,env-browser,env-vscode[:imagesprefix: ] +//TODO: remove this. There is a contribution guide on GH. https://github.com/datastax/zdm-proxy/blob/main/CONTRIBUTING.md {company} {product} ({product-short}) provides a simple and reliable way for users to migrate an existing {cass-reg} or {dse} cluster to {astra-db}, or to any {cass-short} or {dse-short} cluster, without any interruption of service to the client applications and data. The {product-proxy} is open source software (OSS). We welcome contributions from the developer community via Pull Requests on a fork, for evaluation by the {product-short} team. @@ -26,8 +25,7 @@ Refer to the https://cla.datastax.com/[CLA terms] and, if you agree, indicate yo The overall procedure: -. Start on the open-source public repo, https://github.com/datastax/zdm-proxy/. -. Fork the repo by clicking the Fork button in the GitHub UI. +. Fork the https://github.com/datastax/zdm-proxy/[{product-proxy} open-source public repo]. . Make your changes locally on your fork. Git commit and push only to your fork. . Wait for CI to run successfully in GitHub Actions before submitting a PR. . Submit a Pull Request (PR) with your forked updates. diff --git a/modules/ROOT/pages/create-target.adoc b/modules/ROOT/pages/create-target.adoc index 18c0e753..32f189b6 100644 --- a/modules/ROOT/pages/create-target.adoc +++ b/modules/ROOT/pages/create-target.adoc @@ -1,39 +1,28 @@ = Create the target environment for your migration :navtitle: Create target environment for migration :page-tag: migration,zdm,zero-downtime,zdm-proxy,target -ifdef::env-github,env-browser,env-vscode[:imagesprefix: ../images/] -ifndef::env-github,env-browser,env-vscode[:imagesprefix: ] -In this topic, we'll see how to create and prepare a new cluster to be used as Target. +You must create and prepare a new cluster to be the target for your migration. This section covers in detail the steps to prepare an {astra-db} Serverless database, and also outlines how to create and prepare a different cluster, which could be for example {cass-short} 4.0.x or {dse-short} 6.8.x. -== Overview +== Using an {astra-db} database as the target -If you intend to use {astra-db} as Target for the migration, you will need to: +If you intend to use {astra-db} as the target for the migration, you will need to: * Create an {astra-db} Serverless database. * Retrieve its Secure Connect Bundle (SCB) and upload it to the application instances. * Create {astra-db} access credentials for your database. * Create the client application schema. -To use a generic {cass-short} or {dse-short} cluster, you will have to: - -* Provision the infrastructure for your new cluster. -* Create the cluster with the desired version of {cass-short} or {dse-short}. 
-* Configure the cluster according to your requirements.
-* Create the client application schema.
-
-== Using an {astra-db} database as Target
-
=== Prerequisites
-* An active {astra-url}[{astra-db} account^]
+* An active {astra-url}[{astra} account^]
=== Create an {astra-db} Serverless database
Log into the {astra-ui} and create an {astra-db} Serverless database.
-You can start with a Free plan, but consider upgrading during your migration project to an {astra-db} Pay As You Go or Enterprise plan, to take advantage of additional functionality -- such as Exporting Metrics to external third-party applications, Bring Your Own Keys, and other features.
+You can start with a Free plan, but consider upgrading during your migration project to an {astra} Pay As You Go or Enterprise plan, to take advantage of additional functionality -- such as Exporting Metrics to external third-party applications, Bring Your Own Keys, and other features.
The Pay As You Go and Enterprise plans have many benefits over the Free plan, such as the ability to lift rate limiting, and avoiding hibernation timeouts.
@@ -51,19 +40,20 @@ Save the generate token and credentials (Client ID, Client Secret, and Token) in
=== Get the Secure Connect Bundle and upload to client instances
-Your database's https://docs.datastax.com/en/astra/astra-db-vector/drivers/secure-connect-bundle.html#download-the-secure-connect-bundle[Secure Connect Bundle] (SCB) is a zip file that contains the TLS encryption certificates and other metadata to connect to your database.
-It will be needed by:
+//TODO: Bring SCB attributes
-* Your client application, to connect directly to {astra-db} near the end of the migration;
-* {cass-migrator} or {dsbulk-migrator}, to migrate and validate data into {astra-db}.
+xref:astra-db-serverless:drivers:secure-connect-bundle.adoc[Download your {astra-db} database's Secure Connect Bundle (SCB)].
+The SCB is a zip file that contains TLS encryption certificates and other metadata required to connect to your database.
-Note that the credentials are **not contained** in the SCB.
-
-// * The {company} Bulk Migrator to import the existing data into {astra-db}
+[IMPORTANT]
+====
+The SCB contains sensitive information that establishes a connection to your database, including key pairs and certificates.
+Treat it as you would any other sensitive values, such as passwords or tokens.
+====
-include::partial$tip-scb.adoc[]
+Your client application uses the SCB to connect directly to {astra-db} near the end of the migration, and {cass-migrator} or {dsbulk-migrator} use the SCB to migrate and validate data in {astra-db}.
-To copy the SCB to your client application instance, use `scp`:
+Use `scp` to copy the SCB to your client application instance:
[source,bash]
----
@@ -72,28 +62,39 @@ scp -i secure-connect-.zip @:/health/liveness
-http://:/health/readiness
+http://**ZDM_PROXY_PRIVATE_IP**:**METRICS_PORT**/health/liveness
+http://**ZDM_PROXY_PRIVATE_IP**:**METRICS_PORT**/health/readiness
----
Readiness expanded GET format:
@@ -210,26 +223,21 @@ Readiness expanded GET format:
curl -G "http://{{ hostvars[inventory_hostname]['ansible_default_ipv4']['address'] }}:{{ metrics_port }}/health/readiness"
----
-The default port for metrics collection is `14001`.
-Optionally, you may have overridden this port when you deployed the {product-proxy} specifying a custom, non-default port that was set by changing the value of the configuration variable `metrics_port`.
-See xref:deploy-proxy-monitoring.adoc#_ports[this section] for more information. +The default port for metrics collection is `14001`. +You can override this port if you deploy the {product-proxy} with `metrics_port` set to a non-default port. +For more information, see <>. +Readiness example: -[tabs] -==== -Readiness example:: -+ --- [source,bash] ---- curl -G "http://172.18.10.40:14001/health/readiness" ---- --- -+ -Result:: -+ --- -[source,bash] + +.Result +[%collapsible] +==== +[source,json] ---- { "OriginStatus":{ @@ -247,7 +255,6 @@ Result:: "Status":"UP" } ---- --- ==== === Check {product-proxy} instances via docker logs @@ -302,7 +309,7 @@ If the {product-proxy} instances fail to start up due to mistakes in the configu [NOTE] ==== -With the exception of the Origin and Target credentials and the `primary_cluster` variable, which can all be changed for existing deployments in a rolling fashion, all cluster connection configuration variables are considered immutable and can only be changed by recreating the deployment. +With the exception of the origin credentials, target credentials, and the `primary_cluster` variable, which can all be changed for existing deployments in a rolling fashion, all cluster connection configuration variables are considered immutable and can only be changed by recreating the deployment. If you wish to change any of the cluster connection configuration variables (other than credentials and `primary_cluster`) on an existing deployment, you will need to re-run the `deploy_zdm_proxy.yml` playbook. This playbook can be run as many times as necessary. @@ -320,7 +327,7 @@ It includes the following components, all deployed as Docker containers: * Prometheus node exporter, which runs on each {product-proxy} host and makes OS- and host-level metrics available to Prometheus. * Prometheus server, to collect metrics from the {product-proxy} process, its Golang runtime and the Prometheus node exporter. -* Grafana, to visualize all these metrics in three preconfigured dashboards (see xref:troubleshooting-tips.adoc#how-to-leverage-metrics[this section] of the troubleshooting tips for details). +* Grafana, to visualize all these metrics in three preconfigured dashboards (see xref:ROOT:metrics.adoc[]). After running the playbook described here, you will have a fully configured monitoring stack connected to your {product-proxy} deployment. @@ -349,7 +356,7 @@ ubuntu@52772568517c:~$ === Configure the Grafana credentials -Edit the file `zdm_monitoring_config.yml`, located in `zdm-proxy-automation/ansible/vars`: +Edit the file `zdm_monitoring_config.yml`, stored at `zdm-proxy-automation/ansible/vars`: * `grafana_admin_user`: leave unchanged (defaults to `admin`) * `grafana_admin_password`: set to the password of your choice @@ -374,5 +381,5 @@ Login with: [TIP] ==== -Details about the metrics you can observe are available in xref:troubleshooting-tips.adoc#how-to-leverage-metrics[this section] of the troubleshooting tips. +Details about the metrics you can observe are available in xref:ROOT:metrics.adoc[]. 
==== diff --git a/modules/ROOT/pages/deployment-infrastructure.adoc b/modules/ROOT/pages/deployment-infrastructure.adoc index c3a60552..7c2ef01a 100644 --- a/modules/ROOT/pages/deployment-infrastructure.adoc +++ b/modules/ROOT/pages/deployment-infrastructure.adoc @@ -1,14 +1,12 @@ = Deployment and infrastructure considerations :page-tag: migration,zdm,zero-downtime,zdm-proxy,deploy,infrastructure -ifdef::env-github,env-browser,env-vscode[:imagesprefix: ../images/] -ifndef::env-github,env-browser,env-vscode[:imagesprefix: ] == Choosing where to deploy the proxy A typical {product-proxy} deployment is made up of multiple proxy instances. A minimum of three proxy instances is recommended for any deployment apart from those for demo or local testing purposes. -All {product-proxy} instances must be reachable by the client application and must be able to connect to your Origin and Target clusters. +All {product-proxy} instances must be reachable by the client application and must be able to connect to your origin and target clusters. The {product-proxy} process is lightweight, requiring only a small amount of resources and no storage to persist state (apart from logs). The {product-proxy} should be deployed close to your client application instances. @@ -18,7 +16,7 @@ If you have a multi-DC cluster with multiple set of client application instances Here's a typical deployment showing connectivity between client applications, {product-proxy} instances, and clusters: -image::{imagesprefix}zdm-during-migration3.png[Connectivity between client applications, proxy instances, and clusters.] +image::zdm-during-migration3.png[Connectivity between client applications, proxy instances, and clusters.] == Infrastructure requirements @@ -62,7 +60,7 @@ For example, you can use four VMs that are the equivalent of an AWS m5.4xlarge, Next, run {dsbulk-migrator} or {cass-migrator} in parallel on each VM with each one responsible for migrating around 5TB of data. If there is one super large table (e.g. 15 TB of 20 TB is in one table), you can choose to migrate this table in three parts on three separate VMs in parallel by splitting the full token range into three parts and migrating the rest of the tables on the fourth VM. -* Ensure that your Origin and Target clusters can handle high traffic from {cass-migrator} or {dsbulk-migrator} in addition to the live traffic from your application. +* Ensure that your origin and target clusters can handle high traffic from {cass-migrator} or {dsbulk-migrator} in addition to the live traffic from your application. * Test any migration in a lower environment before you plan to do it in production. @@ -84,10 +82,10 @@ The {product-proxy} machines should not be directly accessible by external machi The only direct access to these machines should be from the jumphost. ==== -The {product-proxy} machines must be able to connect to the Origin and Target cluster nodes: +The {product-proxy} machines must be able to connect to the origin and target cluster nodes: * For self-managed clusters ({cass} or {dse-short}), connectivity is needed to the {cass-short} native protocol port (typically 9042). -* For {astra-db}, you will need to ensure outbound connectivity to the {astra-db} endpoint indicated in the Secure Connect Bundle. +* For {astra-db}, you will need to ensure outbound connectivity to the {astra} endpoint indicated in the Secure Connect Bundle. Connectivity over Private Link is also supported. 
The connectivity requirements for the jumphost / monitoring machine are: diff --git a/modules/ROOT/pages/dsbulk-migrator.adoc b/modules/ROOT/pages/dsbulk-migrator.adoc index e33eb90f..7238fd5c 100644 --- a/modules/ROOT/pages/dsbulk-migrator.adoc +++ b/modules/ROOT/pages/dsbulk-migrator.adoc @@ -66,12 +66,12 @@ When doing a live migration, the options are used to effectively configure the { the clusters. When generating a migration script, most options serve as default values in the generated scripts. -Note however that, even when generating scripts, this tool still needs to access the Origin cluster +Note however that, even when generating scripts, this tool still needs to access the origin cluster in order to gather metadata about the tables to migrate. When generating a DDL file, only a few options are meaningful. Because the standard {dsbulk-loader} is not used, and the import cluster is never contacted, import options and {dsbulk-loader}-related options are ignored. -The tool still needs to access the Origin cluster in order to gather metadata about the keyspaces and tables for which to generate DDL statements. +The tool still needs to access the origin cluster in order to gather metadata about the keyspaces and tables for which to generate DDL statements. [[dsbulk-migrator-reference]] == {dsbulk-migrator} reference @@ -113,7 +113,7 @@ The default is to use an external {dsbulk-loader} command. | | `--export-bundle=PATH` -| The path to a secure connect bundle to connect to the Origin cluster, if that cluster is a {company} {astra-db} cluster. +| The path to a secure connect bundle to connect to the origin cluster, if that cluster is a {company} {astra-db} cluster. Options `--export-host` and `--export-bundle` are mutually exclusive. | @@ -130,7 +130,7 @@ Short options are not supported. | | `--export-host=HOST[:PORT]` -| The host name or IP and, optionally, the port of a node from the Origin cluster. +| The host name or IP and, optionally, the port of a node from the origin cluster. If the port is not specified, it will default to `9042`. This option can be specified multiple times. Options `--export-host` and `--export-bundle` are mutually exclusive. @@ -155,7 +155,7 @@ The default is `-1` (export the entire table). | | `--export-password` -| The password to use to authenticate against the Origin cluster. +| The password to use to authenticate against the origin cluster. Options `--export-username` and `--export-password` must be provided together, or not at all. Omit the parameter value to be prompted for the password interactively. @@ -169,7 +169,7 @@ This is an advanced setting; you should rarely need to modify the default value. | | `--export-username=STRING` -| The username to use to authenticate against the Origin cluster. +| The username to use to authenticate against the origin cluster. Options `--export-username` and `--export-password` must be provided together, or not at all. | `-h` @@ -178,7 +178,7 @@ Options `--export-username` and `--export-password` must be provided together, o | | `--import-bundle=PATH` -| The path to a secure connect bundle to connect to the Target cluster, if it's a {company} {astra-db} cluster. +| The path to a Secure Connect Bundle to connect to a target {astra-db} cluster. Options `--import-host` and `--import-bundle` are mutually exclusive. | @@ -201,7 +201,7 @@ Short options are not supported. | | `--import-host=HOST[:PORT]` -| The host name or IP and, optionally, the port of a node from the Target cluster. 
+| The host name or IP and, optionally, the port of a node on the target cluster. If the port is not specified, it will default to `9042`. This option can be specified multiple times. Options `--import-host` and `--import-bundle` are mutually exclusive. @@ -226,13 +226,13 @@ Failed records will appear in a `load.bad` file in the {dsbulk-loader} operation | | `--import-password` -| The password to use to authenticate against the Target cluster. +| The password to use to authenticate against the target cluster. Options `--import-username` and `--import-password` must be provided together, or not at all. Omit the parameter value to be prompted for the password interactively. | | `--import-username=STRING` -| The username to use to authenticate against the Target cluster. Options `--import-username` and `--import-password` must be provided together, or not at all. +| The username to use to authenticate against the target cluster. Options `--import-username` and `--import-password` must be provided together, or not at all. | `-k` | `--keyspaces=REGEX` @@ -308,7 +308,7 @@ The data directory will be created if it does not exist. | | `--export-bundle=PATH` -| The path to a secure connect bundle to connect to the Origin cluster, if that cluster is a {company} {astra-db} cluster. +| The path to a secure connect bundle to connect to the origin cluster, if that cluster is a {company} {astra-db} cluster. Options `--export-host` and `--export-bundle` are mutually exclusive. | @@ -325,7 +325,7 @@ Short options are not supported. | | `--export-host=HOST[:PORT]` -| The host name or IP and, optionally, the port of a node from the Origin cluster. +| The host name or IP and, optionally, the port of a node from the origin cluster. If the port is not specified, it will default to `9042`. This option can be specified multiple times. Options `--export-host` and `--export-bundle` are mutually exclusive. @@ -350,7 +350,7 @@ The default is `-1` (export the entire table). | | `--export-password` -| The password to use to authenticate against the Origin cluster. +| The password to use to authenticate against the origin cluster. Options `--export-username` and `--export-password` must be provided together, or not at all. Omit the parameter value to be prompted for the password interactively. @@ -365,7 +365,7 @@ You should rarely need to modify the default value. | | `--export-username=STRING` -| The username to use to authenticate against the Origin cluster. +| The username to use to authenticate against the origin cluster. Options `--export-username` and `--export-password` must be provided together, or not at all. | `-h` @@ -374,7 +374,7 @@ Options `--export-username` and `--export-password` must be provided together, o | | `--import-bundle=PATH` -| The path to a secure connect bundle to connect to the Target cluster, if it's a {company} {astra-db} cluster. +| The path to a Secure Connect Bundle to connect to a target {astra-db} cluster. Options `--import-host` and `--import-bundle` are mutually exclusive. | @@ -397,7 +397,7 @@ Short options are not supported. | | `--import-host=HOST[:PORT]` -| The host name or IP and, optionally, the port of a node from the Target cluster. +| The host name or IP and, optionally, the port of a node on the target cluster. If the port is not specified, it will default to `9042`. This option can be specified multiple times. Options `--import-host` and `--import-bundle` are mutually exclusive. 
@@ -422,13 +422,13 @@ Failed records will appear in a `load.bad` file in the {dsbulk-loader} operation | | `--import-password` -| The password to use to authenticate against the Target cluster. +| The password to use to authenticate against the target cluster. Options `--import-username` and `--import-password` must be provided together, or not at all. Omit the parameter value to be prompted for the password interactively. | | `--import-username=STRING` -| The username to use to authenticate against the Target cluster. +| The username to use to authenticate against the target cluster. Options `--import-username` and `--import-password` must be provided together, or not at all. | `-k` @@ -481,25 +481,25 @@ The data directory will be created if it does not exist. | | `--export-bundle=PATH` -| The path to a secure connect bundle to connect to the Origin cluster, if that cluster is a {company} {astra-db} cluster. +| The path to a secure connect bundle to connect to the origin cluster, if that cluster is a {company} {astra-db} cluster. Options `--export-host` and `--export-bundle` are mutually exclusive. | | `--export-host=HOST[:PORT]` -| The host name or IP and, optionally, the port of a node from the Origin cluster. +| The host name or IP and, optionally, the port of a node from the origin cluster. If the port is not specified, it will default to `9042`. This option can be specified multiple times. Options `--export-host` and `--export-bundle` are mutually exclusive. | | `--export-password` -| The password to use to authenticate against the Origin cluster. +| The password to use to authenticate against the origin cluster. Options `--export-username` and `--export-password` must be provided together, or not at all. Omit the parameter value to be prompted for the password interactively. | | `--export-username=STRING` -| The username to use to authenticate against the Origin cluster. +| The username to use to authenticate against the origin cluster. Options `--export-username` and `--export-password` must be provided together, or not at all. | `-h` @@ -554,7 +554,7 @@ Do not use these values in your environment. === Generate migration script -Generate a migration script to migrate from an existing Origin cluster to a Target {astra-db} cluster: +Generate a migration script to migrate from an existing origin cluster to a target {astra-db} cluster: [source,bash] ---- @@ -572,8 +572,7 @@ Generate a migration script to migrate from an existing Origin cluster to a Targ === Migrate live using external {dsbulk-loader} install -Migrate live from an existing Origin cluster to a Target {astra-db} cluster using an external {dsbulk-loader} installation. -Passwords will be prompted interactively: +Perform a live migration from an existing origin cluster to a target {astra-db} cluster using an external {dsbulk-loader} installation: [source,bash] ---- @@ -589,11 +588,11 @@ Passwords will be prompted interactively: --import-password # password will be prompted ---- +Passwords are prompted interactively. + === Migrate live using embedded {dsbulk-loader} install -Migrate live from an existing Origin cluster to a Target {astra-db} cluster using the embedded {dsbulk-loader} installation. -Passwords will be prompted interactively. -In this example, additional {dsbulk-loader} options are passed. 
+Perform a live migration from an existing origin cluster to a target {astra-db} cluster using the embedded {dsbulk-loader} installation:
[source,bash]
----
@@ -613,15 +612,16 @@ In this example, additional {dsbulk-loader} options are passed.
--import-dsbulk-option "--executor.maxPerSecond=1000"
----
-[NOTE]
-====
-In the example above, you must use the `dsbulk-migrator--embedded-dsbulk.jar` fat jar.
-Otherwise, an error will be raised because no embedded {dsbulk-loader} can be found.
-====
+Passwords are prompted interactively.
+
+The preceding example passes additional {dsbulk-loader} options and requires the `dsbulk-migrator--embedded-dsbulk.jar` fat jar.
+Otherwise, an error is raised because no embedded {dsbulk-loader} can be found.
-=== Generate DDL to recreate Origin schema in Target
+=== Generate DDL to recreate the origin schema on the target cluster
-Generate DDL files to recreate the Origin schema in a Target {astra-db} cluster:
+Generate DDL files to recreate the origin schema on a target {astra-db} cluster:
[source,bash]
----
diff --git a/modules/ROOT/pages/enable-async-dual-reads.adoc b/modules/ROOT/pages/enable-async-dual-reads.adoc
index 6bd63622..8c60b71c 100644
--- a/modules/ROOT/pages/enable-async-dual-reads.adoc
+++ b/modules/ROOT/pages/enable-async-dual-reads.adoc
@@ -1,20 +1,16 @@
= Phase 3: Enable asynchronous dual reads
:page-tag: migration,zdm,zero-downtime,zdm-proxy,async-reads
-ifdef::env-github,env-browser,env-vscode[:imagesprefix: ../images/]
-ifndef::env-github,env-browser,env-vscode[:imagesprefix: ]
In this phase, you can optionally enable asynchronous dual reads.
-The idea is to test performance and verify that Target can handle your application's live request load before cutting over from Origin to Target.
+This optional phase introduces an additional check to make sure that the target can handle the load without timeouts or unacceptable latencies.
+You would typically perform this step once you have migrated all the existing data from the origin cluster and completed all validation checks and reconciliation, if necessary.
====

== Asynchronous Dual Reads mode

-When using the {product-proxy}, all writes are synchronously sent to both Origin and Target.
+When using the {product-proxy}, all writes are synchronously sent to both the origin and target clusters.
Reads operate differently: with the default read mode, reads are only sent to the primary cluster (Origin by default).

-Before changing the read routing so that reads are routed to Target (phase 4), you may want to temporarily send the reads to both clusters, to make sure that Target can handle the full workload of reads and writes.
+In Phase 4, you will change the read routing so that reads are routed to the target.
+Before you do this, you might want to temporarily send the reads to both clusters to make sure that the target can handle the full workload of reads and writes.

-If you set the proxy's read mode configuration variable (`read_mode`) to `DUAL_ASYNC_ON_SECONDARY`, then asynchronous dual reads will be enabled.
+If you set the proxy's `read_mode` configuration variable to `DUAL_ASYNC_ON_SECONDARY`, then asynchronous dual reads will be enabled.
That change will result in reads being additionally sent to the secondary cluster.

The proxy will return the read response to the client application as soon as the primary cluster's response arrives.
The secondary cluster's response will only be used to track metrics.
There will be no impact to the client application if the read fails on the secondary cluster, or if the read performance on the secondary cluster is degraded.

-Therefore, this feature can be used as a safer way to test the full workload on Target before making the switch to set Target as the primary cluster (phase 4).
+Therefore, you can use this feature as a safer way to test the full workload on the target before setting the target as the primary cluster in Phase 4.

[NOTE]
====
In some cases the additional read requests can cause the write requests to fail or timeout on that cluster.
-This means that, while this feature provides a way to route read requests to Target with a lower chance of having impact on the client application, it doesn't completely eliminate that chance.
+This means that, while this feature provides a way to route read requests to the target with a lower chance of having impact on the client application, it doesn't completely eliminate that chance.
====

[[_validating_performance_and_error_rate]]
@@ -79,9 +76,6 @@ For more, see xref:metrics.adoc#_asynchronous_read_requests_metrics[Asynchronous

== Reminder to switch off async dual reads

-[TIP]
-====
-Once you are satisfied that your Target cluster is ready and tuned appropriately to handle the production read load, you can decide to switch your sync reads to Target.
+Once you are satisfied that your target cluster is ready and tuned appropriately to handle the production read load, you can switch your sync reads to the target permanently.
At this point, be sure to also disable async dual reads by reverting `read_mode` in `vars/zdm_proxy_core_config.yml` to `PRIMARY_ONLY`.
-This step is explained in more detail in the xref:change-read-routing.adoc[next topic].
-====
+For more information and instructions, see xref:change-read-routing.adoc[].
diff --git a/modules/ROOT/pages/faqs.adoc b/modules/ROOT/pages/faqs.adoc index a6734eff..d1659f2a 100644 --- a/modules/ROOT/pages/faqs.adoc +++ b/modules/ROOT/pages/faqs.adoc @@ -1,11 +1,11 @@ = Frequently Asked Questions :navtitle: FAQs :page-tag: migration,zdm,zero-downtime,zdm-proxy,faq -ifdef::env-github,env-browser,env-vscode[:imagesprefix: ../images/] -ifndef::env-github,env-browser,env-vscode[:imagesprefix: ] If you're new to the {company} {product} features, these FAQs are for you. +//TODO: Eliminate redundancies in these FAQs and the Glossary. + == What is meant by {product}? {product} ({product-short}) means the ability for you to reliably migrate client applications and data between CQL clusters with no interruption of service. @@ -27,13 +27,11 @@ However, it is important to emphasize that the {product-proxy} can be freely use See the diagrams of the {product-short} xref:introduction.adoc#_migration_phases[migration phases]. -== Do you offer an interactive self-guided lab to help me learn about {product-short} migrations at my own pace? - -Yes! Here's a fun way to learn. +== Do you have a demo of {product-short}? -include::partial$interactive-lab.adoc[] +Yes, you can use the {product-short} interactive lab to see how the migration process works. -The interactive lab spans the pre-migration prerequisites and each of the five key migration phases. +For more information, see xref:ROOT:introduction.adoc#lab[{product} interactive lab]. == What components are provided with {product-short}? @@ -45,10 +43,10 @@ To simplify its setup, the suite includes the {product-utility}. This interactive utility creates a Docker container acting as the Ansible Control Host. The Ansible playbooks constitute the {product-automation}. * **{cass-migrator}** is designed to: -** Connect to your clusters and compare the data between Origin and Target. +** Connect to your clusters and compare the data between the origin and target clusters. ** Report differences in a detailed log file. -** Reconcile any missing records and fix any data inconsistencies between Origin and Target by enabling `autocorrect` in a configuration file. -* **{dsbulk-migrator}** is provided to migrate smaller amounts of data from Origin to Target. +** Use AutoCorrect mode to reconcile any missing records and fix any data inconsistencies between the origin and target. +* **{dsbulk-migrator}** is provided to migrate smaller amounts of data from the origin to the target. * Well-defined steps in this migration documentation, organized as a sequence of phases. == What exactly is {product-proxy}? @@ -68,9 +66,9 @@ Bottom line: You want to migrate your critical database infrastructure without r == Which releases of {cass-short} or {dse-short} are supported for migrations? -include::partial$supported-releases.adoc[] +include::ROOT:partial$supported-releases.adoc[] -include::partial$migration-scenarios.adoc[] +include::ROOT:partial$migration-scenarios.adoc[] == Does {product-short} migrate clusters? @@ -90,7 +88,7 @@ The suite of {product} tools from {company} is free and open-sourced. == Is there support available if I have questions or issues during our migration? -{product-proxy} and related software tools in the migration suite include technical assistance by {support-url}[{company} Support] for {dse-short} and Luna subscribers, and {astra-db} users who are on an Enterprise plan. 
+{product-proxy} and related software tools in the migration suite include technical assistance by {support-url}[{company} Support] for {dse-short} and Luna subscribers, and {astra} users who are on an Enterprise plan. Free and Pay As You Go plan users do not have support access and must raise questions in the {astra-ui} chat. https://www.datastax.com/products/luna[Luna] is a subscription to the {cass} support and expertise at {company}. @@ -134,8 +132,9 @@ For TLS details, see xref:tls.adoc[]. == How does {product-proxy} handle Lightweight Transactions (LWTs)? {product-proxy} handles LWTs as write operations. -The proxy sends the LWT to Origin and Target clusters concurrently, and waits for a response from both. -{product-proxy} will return a `success` status to the client if both Origin and Target send successful acknowledgements, or otherwise will return a `failure` status if one or both do not return an acknowledgement. +The proxy sends the LWT to the origin and target clusters concurrently, and waits for a response from both. +{product-proxy} will return a `success` status to the client if both the origin and target clusters send successful acknowledgements. +Otherwise, it will return a `failure` status if one or both do not return an acknowledgement. What sets LWTs apart from regular writes is that they are conditional. For important details, including the client context for a returned `applied` flag, see xref:feasibility-checklists.adoc#_lightweight_transactions_and_the_applied_flag[Lightweight Transactions and the `applied` flag]. @@ -152,7 +151,7 @@ This way, each client application instance can connect to all {product-proxy} in This deployment model gives maximum resilience and failure tolerance guarantees and allows the client application driver to continue using the same load balancing and retry mechanisms that it would normally use. -Conversely, deploying a single {product-proxy} instance would undermine this resilience mechanism and create a single point of failure, which could affect the client applications if one or more nodes of the underlying clusters (Origin or Target) go offline. +Conversely, deploying a single {product-proxy} instance would undermine this resilience mechanism and create a single point of failure, which could affect the client applications if one or more nodes of the underlying origin or target clusters go offline. In a sidecar deployment, each client application instance would be connecting to a single {product-proxy} instance, and would therefore be exposed to this risk. For more information, see xref:deployment-infrastructure.adoc#_choosing_where_to_deploy_the_proxy[Choosing where to deploy the proxy]. diff --git a/modules/ROOT/pages/feasibility-checklists.adoc b/modules/ROOT/pages/feasibility-checklists.adoc index 785b4cbd..4402a43e 100644 --- a/modules/ROOT/pages/feasibility-checklists.adoc +++ b/modules/ROOT/pages/feasibility-checklists.adoc @@ -1,7 +1,5 @@ = Feasibility checks :page-tag: migration,zdm,zero-downtime,zdm-proxy,feasibility -ifdef::env-github,env-browser,env-vscode[:imagesprefix: ../images/] -ifndef::env-github,env-browser,env-vscode[:imagesprefix: ] Before starting your migration, refer to the following considerations to ensure that your client application workload and xref:glossary.adoc#origin[**Origin**] are suitable for this {product} process. 
@@ -11,7 +9,7 @@ Before starting your migration, refer to the following considerations to ensure [NOTE] ==== -include::partial$supported-releases.adoc[] +include::ROOT:partial$supported-releases.adoc[] ==== {product-proxy} technically doesn't support `v5`. @@ -35,23 +33,23 @@ This means that {product-proxy} supports migrations of the following cluster ver [TIP] ==== -Ensure that you test your client application with Target (connected directly without the {product-proxy}) before the migration process begins. +Ensure that you test your client application on the target (connected directly, without the {product-proxy}) before the migration process begins. ==== == Schema/keyspace compatibility -{product-proxy} does not modify or transform CQL statements besides the optional feature that replaces `now()` functions with timestamp literals. +{product-proxy} does not modify or transform CQL statements besides the optional feature that replaces `now()` functions with timestamp literals. See <> for more information about this feature. A CQL statement that your client application sends to {product-proxy} must be able to succeed on both clusters. -This means that any keyspace that your client application uses must exist on both Origin and Target with the same name (although they can have different replication strategies and durable writes settings). +This means that any keyspace that your client application uses must exist on both the origin and target clusters with the same name (although they can have different replication strategies and durable writes settings). Table names must also match. The schema doesn't have to be an exact match as long as the CQL statements can be executed successfully on both clusters. -For example, if a table has 10 columns but your client application only uses 5 of those columns then you could create that table on Target with just those 5 columns. +For example, if a table has 10 columns but your client application only uses 5 of those columns then you could create that table on the target with just those 5 columns. You can also change the primary key in some cases. -For example, if your compound primary key is `PRIMARY KEY (A, B)` and you always provide parameters for the `A` and `B` columns in your CQL statements then you could change the key to `PRIMARY KEY (B, A)` when creating the schema on Target because your CQL statements will still run successfully. +For example, if your compound primary key is `PRIMARY KEY (A, B)` and you always provide parameters for the `A` and `B` columns in your CQL statements then you could change the key to `PRIMARY KEY (B, A)` when creating the schema on the target because your CQL statements will still run successfully. == Considerations for {astra-db} migrations @@ -69,22 +67,26 @@ Read-only applications require special handling only if you are using {product-p [TIP] ==== -If you have an existing {product-proxy} deployment, you can check which version you are running as explained xref:troubleshooting-tips.adoc#_how_to_identify_the_zdm_proxy_version[here]. To find out how to upgrade an existing {product-proxy} deployment, please see xref:manage-proxy-instances.adoc#_upgrade_the_proxy_version[here]. +If you have an existing {product-proxy} deployment, you can xref:ROOT:troubleshooting-tips.adoc#check-version[check your {product-proxy} version]. + +For upgrade instructions, see xref:ROOT:manage-proxy-instances.adoc#_upgrade_the_proxy_version[Upgrade the proxy version]. 
==== -==== *Versions older than 2.1.0* +//TODO: combine the below 2 sections to only use 2.1.0 or later. +//Reconcile with troubleshooting-scenarios.adoc in case this issue is also described there. +==== Versions older than 2.1.0 If a client application only sends `SELECT` statements to a database connection then you may find that {product-proxy} terminates these read-only connections periodically, which may result in request errors if the driver is not configured to retry these requests in these conditions. This happens because {astra-db} terminates idle connections after some inactivity period (usually around 10 minutes). -If {astra-db} is your Target and a client connection is only sending read requests to the {product-proxy}, then the {astra-db} connection that is paired to that client connection will remain idle and will be eventually terminated. +If {astra-db} is your target, and a client connection is only sending read requests to the {product-proxy}, then the {astra-db} connection that is paired to that client connection will remain idle and will be eventually terminated. -A potential workaround is to not connect these read-only client applications to {product-proxy}, but you need to ensure that these client applications switch reads to Target at any point after all the data has been migrated and all validation and reconciliation has completed. +A potential workaround is to not connect these read-only client applications to {product-proxy}, but you need to ensure that these client applications switch reads to the target at any point after all the data has been migrated and all validation and reconciliation has completed. Another work around is to implement a mechanism in your client application that creates a new `Session` periodically to avoid the {astra-db} inactivity timeout. You can also implement some kind of meaningless write request that the application sends periodically to make sure the {astra-db} connection doesn't idle. -==== *Version 2.1.0 and newer* +==== Version 2.1.0 and newer This issue is solved in version 2.1.0 of the {product-proxy}, which introduces periodic heartbeats to keep alive idle cluster connections. We strongly recommend using version 2.1.0 (or newer) to benefit from this improvement, especially if you have a read-only workload. @@ -103,7 +105,8 @@ For more information on how to handle non-deterministic functions please refer t Given that there are two separate clusters involved, the state of each cluster may be different. For conditional writes, this may create a divergent state for a time. -It may not make a difference in many cases, but if non-idempotent operations are used, we recommend a reconciliation phase in the migration before and after switching reads to rely on Target (setting Target as the primary cluster). + +If non-idempotent operations are used, {company} recommends adding a reconciliation phase to your migration before and after Phase 4, where you switch reads to the target. For details about using the {cass-migrator}, see xref:migrate-and-validate-data.adoc[]. @@ -116,29 +119,29 @@ Some application workloads can tolerate inconsistent data in some cases (especia === Lightweight Transactions and the `applied` flag {product-proxy} handles LWTs as write operations. -The proxy sends the LWT to Origin and Target clusters concurrently, and waits for a response from both. 
-{product-proxy} will return a `success` status to the client if both Origin and Target send successful acknowledgements, or otherwise will return a `failure` status if one or both do not return an acknowledgement. +The proxy sends the LWT to the origin and target clusters concurrently, and then waits for a response from both. +{product-proxy} will return a `success` status to the client if both the origin and target send successful acknowledgements. +Otherwise, it will return a `failure` status if one or both do not return an acknowledgement. What sets LWTs apart from regular writes is that they are conditional. In other words, a LWT can appear to have been successful (its execution worked as expected). However, the change will be applied only if the LWT's condition was met. Whether the condition was met depends on the state of the data on the cluster. -In a migration, the clusters will not be in sync until all existing data has been imported into Target. +In a migration, the clusters will not be in sync until all existing data has been imported into the target. Up to that point, an LWT's condition can be evaluated differently on each side, leading to a different outcome even though the LWT was technically successful on both sides. The response that a cluster sends after executing a LWT includes a flag called `applied`. This flag tells the client whether the LWT update was actually applied. The status depends on the condition, which in turn depends on the state of the data. -When {product-proxy} receives a response from both Origin and Target, each response would have its own `applied` flag. +When {product-proxy} receives a response from both the origin and target, each response would have its own `applied` flag. However, {product-proxy} can only return a *single response* to the client. Recall that the client has no knowledge that there are two clusters behind the proxy. Therefore, {product-proxy} returns the `applied` flag from the cluster that is *currently used as primary*. If your client has logic that depends on the `applied` flag, be aware that during the migration, you will only have visibility of the flag coming from the primary cluster; that is, the cluster to which synchronous reads are routed. -To reiterate, {product-proxy} only returns the `applied` value from the primary cluster, which is the cluster from where read results are returned to the client application (by default, Origin). -This means that when you set Target as your primary cluster, the `applied` value returned to the client application will come from Target. - +To reiterate, {product-proxy} only returns the `applied` value from the primary cluster, which is the cluster from where read results are returned to the client application. By default, this is the origin cluster. +This means that when you set the target cluster as your primary cluster, then the `applied` value returned to the client application will come from the target cluster. == Advanced workloads ({dse-short}) @@ -146,17 +149,16 @@ This means that when you set Target as your primary cluster, the `applied` value {product-proxy} handles all {dse-short} Graph requests as write requests even if the traversals are read-only. There is no special handling for these requests, so you need to take a look at the traversals that your client application sends and determine whether the traversals are idempotent. If the traversals are non-idempotent then the reconciliation step is needed. 
-Keep in mind that our recommended tools for data migration and reconciliation are CQL-based, so they can be used for migrations where Origin is a database that uses the new {dse-short} Graph engine released with {dse-short} 6.8, but *cannot be used for the old Graph engine* that older {dse-short} versions relied on.
+Keep in mind that our recommended tools for data migration and reconciliation are CQL-based, so they can be used for migrations where the origin cluster is a database that uses the new {dse-short} Graph engine released with {dse-short} 6.8, but *cannot be used for the old Graph engine* that older {dse-short} versions relied on.

See <> for more information about non-idempotent operations.

=== Search

-Read-only Search workloads can be moved directly from Origin to Target without {product-proxy} being involved.
+Read-only Search workloads can be moved directly from the origin to the target without {product-proxy} being involved.

If your client application uses Search and also issues writes, or if you need the read routing capabilities from {product-proxy}, then you can connect your search workloads to it as long as you are using the {company} drivers to submit these queries.
This approach means the queries are regular CQL `SELECT` statements, so {product-proxy} handles them as regular read requests.
-If you use the HTTP API then you can either modify your applications to use the CQL API instead or you will have to move those applications directly from Origin to Target when the migration is complete if that is acceptable.
-
+If you use the HTTP API then you can either modify your applications to use the CQL API instead or you will have to move those applications directly from the origin to the target when the migration is complete if that is acceptable.

== Client compression

@@ -168,7 +170,6 @@ This kind of compression is disabled by default on all of our {company} drivers

This is *NOT* related to storage compression which you can configure on a table by table basis with the `compression` table property.
Storage/table compression does not affect the client application or {product-proxy} in any way.
-
== Authenticator and Authorizer configuration

{product-proxy} supports the following cluster authenticator configurations:

@@ -181,12 +182,12 @@ Storage/table compression does not affect the client application or {product-pro

While the authenticator has to be supported, the *authorizer* does not affect client applications or {product-proxy} so you should be able to use any kind of authorizer configuration on both of your clusters.

-The authentication configuration on each cluster can be different between Origin and Target, as the {product-proxy} treats them independently.
+The authentication configuration on each cluster can be different between the origin and target clusters, as the {product-proxy} treats them independently.

[[cql-function-replacement]]
== Server-side non-deterministic functions in the primary key

-Statements with functions like `now()` and `uuid()` will result in data inconsistency between Origin and Target because the values are computed at cluster level.
+Statements with functions like `now()` and `uuid()` will result in data inconsistency between the origin and target clusters because the values are computed at the cluster level.

If these functions are used for columns that are not part of the primary key, you may find it acceptable to have different values in the two clusters depending on your application business logic.
However, if these columns are part of the primary key, the data migration phase will not be successful as there will be data inconsistencies between the two clusters and they will never be in sync.
diff --git a/modules/ROOT/pages/glossary.adoc b/modules/ROOT/pages/glossary.adoc
index 4860555b..231cbba4 100644
--- a/modules/ROOT/pages/glossary.adoc
+++ b/modules/ROOT/pages/glossary.adoc
@@ -1,7 +1,5 @@
= Glossary
:page-tag: migration,zdm,zero-downtime,glossary
-ifdef::env-github,env-browser,env-vscode[:imagesprefix: ../images/]
-ifndef::env-github,env-browser,env-vscode[:imagesprefix: ]

Here are a few terms used throughout the {company} {product} documentation and code.

@@ -17,7 +15,8 @@ For details about the playbooks available in {product-automation}, see:
[[_asynchronous_dual_reads]]
== Asynchronous dual reads

-An optional testing phase in which reads are sent to both Origin and Target, enabling you to check that the intended Target of your migration can handle the full workload of reads and writes before finalizing the migration and moving off the {product-proxy} instances.
+An optional testing phase in which reads are sent to both the origin and target clusters.
+This lets you check that the target cluster/database can handle the full workload of reads and writes before you finalize the migration and move off the {product-proxy} instances.
For details, see xref:enable-async-dual-reads.adoc[].

== CQL

For details, see https://docs.datastax.com/en/astra/astra-db-vector/cql/develop-

== Dual-write logic

-{product-proxy} handles your client application's real-time write requests and forwards them to two {cass-short}-based clusters (Origin and Target) simultaneously.
+{product-proxy} handles your client application's real-time write requests and forwards them to two {cass-short}-based origin and target clusters simultaneously.
The dual-write logic in {product-proxy} means that you do not need to modify your client application to perform dual writes manually during a migration: {product-proxy} takes care of it for you.

See the diagram in the xref:introduction.adoc#migration-workflow[workflow introduction].

@@ -42,8 +41,8 @@ Your existing {cass-short}-based cluster, whether it's {cass-reg}, {dse}, or {as

The cluster that is currently considered the "primary" source of truth.
While writes are always sent to both clusters, the primary cluster is the one to which all synchronous reads are always sent, and their results are returned to the client application.
-During a migration, Origin is typically the primary cluster.
-Near the end of the migration, you'll shift the primary cluster to be Target.
+During a migration, the origin cluster is typically the primary cluster.
+Near the end of the migration, you shift the primary cluster to be the target cluster.

For more, see <>.

@@ -67,9 +66,9 @@ See xref:glossary.adoc#_asynchronous_dual_reads[Asynchronous dual reads].

During a migration, the secondary cluster is the one that is currently **not** the source of truth.

-When using the {product-proxy}, all writes are synchronously sent to both Origin and Target.
+When using the {product-proxy}, all writes are synchronously sent to both the origin and target clusters.
Reads operate differently: with the default read mode, reads are only sent to the primary cluster (Origin by default).
-In Phase 3 of a migration, you may (optionally) want to temporarily send the reads to both clusters, to make sure that Target can handle the full workload of reads and writes.
+In Phase 3 of a migration, you can optionally send the reads to both clusters temporarily if you want to verify that the target cluster can handle the full workload of reads and writes.

If you set the proxy's read mode configuration variable (`read_mode`) to `DUAL_ASYNC_ON_SECONDARY`, then asynchronous dual reads are enabled.
That change results in reads being additionally sent to the secondary cluster.
diff --git a/modules/ROOT/pages/index.adoc b/modules/ROOT/pages/index.adoc
index a9566e02..3d3915ec 100644
--- a/modules/ROOT/pages/index.adoc
+++ b/modules/ROOT/pages/index.adoc
@@ -1,24 +1,48 @@
= Introduction to data migration
:page-tag: migration,zdm,zero-downtime,zdm-proxy, introduction
-ifdef::env-github,env-browser,env-vscode[:imagesprefix: ../images/]
-ifndef::env-github,env-browser,env-vscode[:imagesprefix: ]

-Enterprises today want to reliably migrate mission-critical client applications and data to cloud environments with zero downtime or near zero downtime during the migration.
+Enterprises today want to reliably migrate mission-critical client applications and data across environments with little or no downtime during the migration.

-{company} has developed a set of thoroughly tested self-service tools to walk you through well-defined migration options.
-These tools provide features that help you migrate your data from any {cass-short} origin ({cass-reg}, {dse}, or {astra-db}) to any {cass-short} target ({cass-reg}, {dse}, or {astra-db}).
+{company} has developed a set of thoroughly tested self-service tools that can help you migrate your data from any {cass-short} origin cluster to any {cass-short}-compatible target.

-== Migration process and tools
+Compatible origin and target clusters include {cass-reg}, {dse}, {hcd}, and {astra-db}.

-A migration is a workflow that encompasses the lifecycle of uploading and importing your data to the selected databases.
-{company} can migrate all data, however critical, with acceptable or zero downtime.
-When the migration is complete, the data is present in the new database and all client applications connect exclusively to the new database. The old database becomes obsolete and can be removed.
+When the migration is complete, the data is present in the new database, and you can update your client applications to connect exclusively to the new database.
+The old database becomes obsolete and can be removed.

-The migration tools are:
+Available migration tools include:

-* https://docs.datastax.com/en/data-migration/introduction.html[{product}] ({product-short}): Comprised of {product-proxy}, {product-proxy}, and {product-automation}, you can continue to run your current application and migrate data from the Origin to the Target database without any downtime.
+* xref:ROOT:introduction.adoc[{product}] ({product-short}): Comprised of {product-proxy}, {product-utility}, and {product-automation}, you can continue to run your current application and migrate data from the origin to the target database without any downtime.
{product-proxy} helps to manage the activity in transition.
-* xref:cassandra-data-migrator.adoc[{cass-migrator}]: It can be used in conjunction with the {product-proxy} for a migration with zero downtime. It can also be used on its own for migrations with acceptable downtime.
-* https://docs.datastax.com/en/dsbulk/overview/dsbulk-about.html[{dsbulk-migrator}]: In addition to loading and unloading CSV and JSON data, {dsbulk-loader} can transfer data between databases.
-It can read data from a table from your origin database and write it to a table in your target database. -It can be used as an alternative to {cass-migrator} ({cass-migrator-short}). + +* xref:ROOT:cassandra-data-migrator.adoc[{cass-migrator}]: It can be used in conjunction with the {product-proxy} for a migration with zero downtime. It can also be used on its own for migrations with acceptable downtime. + +* xref:ROOT:dsbulk-migrator.adoc[{dsbulk-migrator}]: {dsbulk-migrator} is an extension of {dsbulk-loader}. +In addition to loading and unloading CSV and JSON data, {dsbulk-migrator} can transfer data between databases. +It can read data from a table from your origin database and write it to a table in your target database. +It can be used as an alternative to {cass-migrator} ({cass-migrator-short}). + +* xref:sideloader:sideloader-overview.adoc[{sstable-sideloader}]: {sstable-sideloader} is a service running in {astra-db} that directly imports data from snapshot backups that you've uploaded to {astra-db} from an existing {cass-reg}, {dse}, or {hcd} cluster. + +//// +* https://github.com/datastax/zdm-proxy-automation[{product-automation}] repo for Ansible-based {product-proxy} automation. +//{product-automation} 2.3.0, which enables ansible scripts and terraform to work with both Ubuntu and RedHat-family Linux distributions. + +* https://github.com/datastax/dsbulk-migrator[{dsbulk-migrator}] repo for migration of smaller data quantities. + +* https://github.com/datastax/cassandra-data-migrator[{cass-migrator}] repo for migration of larger data quantities and where detailed verifications and reconciliation options are needed. +//// + +//// +Possible related content: + +https://docs.datastax.com/en/dse/6.8/tooling/migration-path-dse.html + +https://docs.datastax.com/en/dse/6.9/tooling/migration-path-dse.html + +MC only: + +https://docs.datastax.com/en/mission-control/migrate/oss-cass-to-mission-control.html + +https://docs.datastax.com/en/mission-control/migrate/dse-to-mission-control.html +//// \ No newline at end of file diff --git a/modules/ROOT/pages/introduction.adoc b/modules/ROOT/pages/introduction.adoc index 4209b7b4..95360ea2 100644 --- a/modules/ROOT/pages/introduction.adoc +++ b/modules/ROOT/pages/introduction.adoc @@ -1,114 +1,102 @@ = Introduction to {product} :navtitle: Introduction +:description: Before you begin, learn about migration concepts, software components, and the sequence of operations. :page-tag: migration,zdm,zero-downtime,zdm-proxy,introduction -ifdef::env-github,env-browser,env-vscode[:imagesprefix: ../images/] -ifndef::env-github,env-browser,env-vscode[:imagesprefix: ] -{product} provides a simple and reliable way for you to migrate applications from any CQL-based cluster (https://cassandra.apache.org/_/index.html[{cass-reg}], https://www.datastax.com/products/datastax-enterprise[{dse}], https://www.datastax.com/products/datastax-astra[{astra-db}], or any type of CQL-based database) to any other CQL-based cluster, without any interruption of service to the client applications and data. +{product} provides a simple and reliable way for you to migrate applications from a CQL-based cluster to another CQL-based cluster with little or no downtime and minimal interruption of service to your client applications and data. -* You can move your application to {astra-db}, {dse-short}, or {cass-short} with no downtime and with minimal configuration changes. -* Your clusters are kept in sync at all times by a dual-write logic configuration. 
-* You can xref:rollback.adoc[roll back] at any point, for complete peace of mind.
+include::ROOT:partial$supported-releases.adoc[]
-include::partial$note-downtime.adoc[]
+{product-short} keeps your clusters in sync at all times through a dual-write logic configuration, and you can xref:rollback.adoc[roll back] at any point.
-[TIP]
+[IMPORTANT]
====
-The {product} process requires you to be able to perform rolling restarts of your client applications during the migration.
+* True zero downtime migration is only possible if your database meets the xref:ROOT:feasibility-checklists.adoc[minimum requirements].
+If your database doesn't meet these requirements, you can still complete the migration, but downtime might be necessary to finish the migration.
-This is standard practice for client applications that are deployed over multiple instances and is a widely used approach to roll out releases and configuration changes.
+* The {product} process requires you to be able to perform rolling restarts of your client applications during the migration.
+This is standard practice for client applications that are deployed over multiple instances, and it is a widely used approach to roll out releases and configuration changes.
====
-== Supported releases
-
-include::partial$supported-releases.adoc[]
-
== Migration scenarios
-include::partial$migration-scenarios.adoc[]
-
-[TIP]
-====
-An important migration prerequisite is that you already have the matching schema on Target.
-A CQL statement that your client application sends to {product-proxy} must be able to succeed on both Origin and Target clusters.
-This means that any keyspace that your client application uses must exist on both Origin and Target with the same name.
-Table names must also match.
-For more, see xref:feasibility-checklists.adoc#_schemakeyspace_compatibility[Schema/keyspace compatibility].
-====
+include::ROOT:partial$migration-scenarios.adoc[]
== Migration phases
-First, a couple of key terms used throughout the {product-short} documentation and software components:
-
-* **Origin:** This cluster is your existing {cass-short}-based environment, whether it's {cass}, {dse-short}, or {astra-db}.
-
-* **Target:** This cluster is the new environment to which you want to migrate client applications and data.
-
-=== Migration diagram
+A migration project includes preparation for the migration and five migration phases.
-Discover the migration concepts, software components, and sequence of operations.
+The following sections describe the major events in each phase and how your client applications perform read and write operations on your origin and target clusters during each phase.
-Your migration project occurs through a sequence of phases, which matches the structure of the {product} documentation.
+The _origin_ is your existing {cass-short}-based environment, which can be {cass}, {dse-short}, or {astra-db}.
+The _target_ is your new {cass-short}-based environment where you want to migrate your data and client applications.
-The highlighted components in each phase emphasize how your client applications perform read and write operations on your Origin and Target clusters.
-
-==== Pre-migration client application operations
+=== Pre-migration client application operations
Here's a look at a pre-migration from a high-level view.
At this point, your client applications are performing read/write operations with an existing CQL-compatible database such as {cass}, {dse-short}, or {astra-db}.
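Because the same schema must already exist on both clusters before Phase 1 begins (see the schema compatibility tip that follows), it can help to confirm that every keyspace and table your client applications use is present on both sides. The following sketch uses `cqlsh` with placeholder contact points and a placeholder keyspace name; for an {astra-db} target, connect with its Secure Connect Bundle instead.

[source,bash]
----
# Compare the keyspace definition on the origin and target clusters (placeholders shown).
cqlsh <origin_contact_point> -e "DESCRIBE KEYSPACE my_keyspace"
cqlsh <target_contact_point> -e "DESCRIBE KEYSPACE my_keyspace"
----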
image:pre-migration0ra9.png["Pre-migration environment."]
-'''
+[TIP]
+====
+For the migration to succeed, the origin and target clusters must have matching schemas.
-==== Phase 1: Deploy {product-proxy} and connect client applications
+A CQL statement that your client application sends to {product-proxy} must be able to succeed on both the origin and target clusters.
+
+This means that any keyspace that your client application uses must exist on both the origin and target clusters with the same name.
+The table names, column names, and data types must also match.
+For more information, see xref:feasibility-checklists.adoc#_schemakeyspace_compatibility[Schema/keyspace compatibility].
+====
+
+=== Phase 1: Deploy {product-proxy} and connect client applications
In this first phase, deploy the {product-proxy} instances and connect client applications to the proxies.
This phase activates the dual-write logic.
-Writes are bifurcated (sent to both Origin and Target), while reads are executed on Origin only.
+Writes are bifurcated (sent to both the origin and target), while reads are executed on the origin only.
image:migration-phase1ra9.png["Migration Phase 1."]
-'''
-
-==== Phase 2: Migrate data
+=== Phase 2: Migrate data
In this phase, migrate existing data using {cass-migrator} or {dsbulk-loader}.
Validate that the migrated data is correct, while continuing to perform dual writes.
image:migration-phase2ra9a.png["Migration Phase 2."]
-'''
-
-==== Phase 3: Enable asynchronous dual reads
+=== Phase 3: Enable asynchronous dual reads
In this phase, you can optionally enable asynchronous dual reads.
-The idea is to test performance and verify that Target can handle your application's live request load before cutting over from Origin to Target.
+The idea is to test performance and verify that the target cluster can handle your application's live request load before cutting over from the origin to the target permanently.
image:migration-phase3ra9.png["Migration Phase 3."]
-'''
+=== Phase 4: Route reads to the target cluster
-==== Phase 4: Route reads to Target
+In this phase, read routing on the {product-proxy} is switched to the target cluster so that all reads are executed on the target.
+Writes are still sent to both clusters.
-In this phase, read routing on the {product-proxy} is switched to Target so that all reads are executed on it, while writes are still sent to both clusters.
-In other words, Target becomes the primary cluster.
+At this point, the target becomes the primary cluster.
image:migration-phase4ra9.png["Migration Phase 4."]
-'''
+=== Phase 5: Connect directly to the target cluster
-==== Phase 5: Connect directly to Target
+In this phase, move your client applications off the {product-proxy} and connect them directly to the target cluster.
-In this phase, move your client applications off the {product-proxy} and connect the apps directly to Target.
-Once that happens, the migration is complete.
+Once this happens, the migration is complete, and you now exclusively use the target cluster.
image:migration-phase5ra9.png["Migration Phase 5."]
-'''
+[#lab]
+== {product} interactive lab
+
+As a companion to the {product-short} documentation, you can use the https://www.datastax.com/dev/zdm[{product} interactive lab] to try the entire migration process in a demo environment.
-== A fun way to learn: {product} Interactive Lab
+The lab only requires a GitHub account and a supported browser.
+All browsers except Safari are supported.
-include::partial$interactive-lab.adoc[] +You don't need to install anything because the lab uses a pre-configured GitPod environment. -The interactive lab spans the pre-migration prerequisites and each of the five key migration phases illustrated above. +This lab provides an interactive, detailed walkthrough of the migration process, including pre-migration preparation and each of the five migration phases. +The lab describes and demonstrates all steps and automation required to prepare for and complete a migration from any supported origin database to any supported target database. \ No newline at end of file diff --git a/modules/ROOT/pages/manage-proxy-instances.adoc b/modules/ROOT/pages/manage-proxy-instances.adoc index a15d1f70..c6a3f039 100644 --- a/modules/ROOT/pages/manage-proxy-instances.adoc +++ b/modules/ROOT/pages/manage-proxy-instances.adoc @@ -1,7 +1,5 @@ = Manage your {product-proxy} instances :page-tag: migration,zdm,zero-downtime,zdm-proxy -ifdef::env-github,env-browser,env-vscode[:imagesprefix: ../images/] -ifndef::env-github,env-browser,env-vscode[:imagesprefix: ] In this topic, we'll learn how to perform simple operations on your {product-proxy} deployment with no interruption to its availability: @@ -56,19 +54,22 @@ This playbook restarts each proxy container one by one, without impacting the av .. If unsuccessful, it repeats the check for six times at 5-second intervals and eventually interrupts the whole process if the check still fails. .. If successful, it waits for a configurable interval and then moves on to the next container. -The pause between the restart of each {product-proxy} instance defaults to 10 seconds. If you wish to change this value, you can edit `vars/zdm_playbook_internal_config.yml` (located in `zdm-proxy-automation/ansible/vars`) and set it to the desired number of seconds. +The pause between the restart of each {product-proxy} instance defaults to 10 seconds. +To change this value, you can set the desired number of seconds in `zdm-proxy-automation/ansible/vars/zdm_playbook_internal_config.yml`. [TIP] ==== To check the state of your {product-proxy} instances, you have a couple of options. -See xref:deploy-proxy-monitoring.adoc#_indications_of_success_on_origin_and_target_clusters[Indications of success on Origin and Target clusters]. +See xref:deploy-proxy-monitoring.adoc#_indications_of_success_on_origin_and_target_clusters[Indications of success on origin and target clusters]. ==== +[#access-the-proxy-logs] == Access the proxy logs To confirm that the {product-proxy} instances are operating normally, or investigate any issue, you can view or collect their logs. -[[_view_the_logs]] +You can view the logs for a single proxy instance, or you can use a playbook to systematically retrieve logs from all instances and package them in a zip archive for later inspection. + === View the logs The {product-proxy} runs as a Docker container on each proxy host. @@ -81,7 +82,6 @@ docker container logs zdm-proxy-container To leave the logs open and continuously output the latest log messages, append the `--follow` (or `-f`) option to the command above. -[[_collect_the_logs]] === Collect the logs You can easily retrieve the logs of all {product-proxy} instances using a dedicated playbook (`collect_zdm_proxy_logs.yml`). @@ -114,8 +114,8 @@ Commonly changed variables, located in `vars/zdm_proxy_core_config.yml`: * `primary_cluster`: ** This variable determines which cluster is currently considered the xref:glossary.adoc#_primary_cluster[primary cluster]. 
-At the start of the migration, the primary cluster is Origin, as it contains all the data. -In Phase 4 of the migration, once all the existing data has been transferred and any validation/reconciliation step has been successfully executed, you can switch the primary cluster to be Target. +At the start of the migration, the primary cluster is the origin cluster because it contains all of the data. +In Phase 4 of the migration, once all the existing data has been transferred and any validation/reconciliation step has been successfully executed, you can switch the primary cluster to be the target cluster. ** Valid values: `ORIGIN`, `TARGET`. * `read_mode`: ** This variable determines how reads are handled by the {product-proxy}. @@ -124,16 +124,16 @@ In Phase 4 of the migration, once all the existing data has been transferred and *** `DUAL_ASYNC_ON_SECONDARY`: reads are sent synchronously to the primary cluster and also asynchronously to the secondary cluster. See xref:enable-async-dual-reads.adoc[]. ** Typically, when choosing `DUAL_ASYNC_ON_SECONDARY` you will want to ensure that `primary_cluster` is still set to `ORIGIN`. -When you are ready to use Target as the primary cluster, you should revert `read_mode` to `PRIMARY_ONLY`. +When you are ready to use the target cluster as the primary cluster, revert `read_mode` to `PRIMARY_ONLY`. * `log_level`: ** Defaults to `INFO`. ** Only set to `DEBUG` if necessary and revert to `INFO` as soon as possible, as the extra logging can have a slight performance impact. Other, rarely changed variables: -* Origin username/password, in `vars/zdm_proxy_cluster_config.yml`) -* Target username/password, in `vars/zdm_proxy_cluster_config.yml`) -* Advanced configuration variables, located in `vars/zdm_proxy_advanced_config.yml`: +* Origin username/password in `vars/zdm_proxy_cluster_config.yml` +* Target username/password in `vars/zdm_proxy_cluster_config.yml` +* Advanced configuration variables in `vars/zdm_proxy_advanced_config.yml`: ** `zdm_proxy_max_clients_connections`: *** Maximum number of client connections that the {product-proxy} should accept. Each client connection results in additional cluster connections and causes the allocation of several in-memory structures, so this variable can be tweaked to cap the total number on each instance. @@ -153,7 +153,7 @@ Note that, in this case, the {product-proxy} will not return any result or error *** Defaults to `10000` ms. If your client application has a higher client-side timeout because it is expected to generate requests that take longer to complete, you need to increase this timeout accordingly. ** `origin_connection_timeout_ms` and `target_connection_timeout_ms`: -*** Timeout (in ms) when attempting to establish a connection from the proxy to Origin or Target. +*** Timeout (in ms) when attempting to establish a connection from the proxy to the origin or the target. *** Defaults to `30000` ms. ** `async_handshake_timeout_ms`: *** Timeout (in ms) when performing the initialization (handshake) of a proxy-to-secondary cluster connection that will be used solely for asynchronous dual reads. @@ -161,7 +161,7 @@ If your client application has a higher client-side timeout because it is expect This has no impact on the handling of synchronous requests: the {product-proxy} will continue to handle all synchronous reads and writes normally. *** Defaults to `4000` ms. ** `heartbeat_interval_ms`: -*** Frequency (in ms) with which heartbeats will be sent on cluster connections (i.e. 
all control and request connections to Origin and Target). +*** Frequency (in ms) with which heartbeats will be sent on cluster connections (i.e. all control and request connections to the origin and the target). Heartbeats keep idle connections alive. *** Defaults to `30000` ms. ** `metrics_enabled`: @@ -180,9 +180,9 @@ If you have a custom driver configuration with a higher value, you should change Deprecated variables, which will be removed in a future {product-proxy} release: * `forward_client_credentials_to_origin`: -** Whether the credentials provided by the client application are for Origin. +** Whether the credentials provided by the client application are for the origin cluster. ** Boolean value. -Defaults to `false` (the client application is expected to pass Target credentials), can be set to `true` if the client passes credentials for Origin instead. +Defaults to `false` (the client application is expected to pass the target credentials), can be set to `true` if the client passes credentials for the origin cluster instead. To change any of these variables, edit the desired values in `vars/zdm_proxy_core_config.yml`, `vars/zdm_proxy_cluster_config.yml` (credentials only) and/or `vars/zdm_proxy_advanced_config.yml` (mutable variables only, as listed above). @@ -211,7 +211,7 @@ Make sure you collect the logs prior to this operation if you want to keep them. .. If successful, it waits for 10 seconds and then moves on to the next container. The pause between the restart of each {product-proxy} instance defaults to 10 seconds. -If you wish to change this value, you can edit `vars/zdm_playbook_internal_config.yml` (located in `zdm-proxy-automation/ansible/vars`) and set it to the desired number of seconds. +To change this value, you can set the desired number of seconds in `zdm-proxy-automation/ansible/vars/zdm_playbook_internal_config.yml`. [NOTE] ==== @@ -251,16 +251,16 @@ To perform an upgrade, change the version tag number to the desired version in ` [source,bash] ---- -zdm_proxy_image::{imagesprefix} datastax/zdm-proxy:x.y.z +zdm_proxy_image: datastax/zdm-proxy:x.y.z ---- -Replace x.y.z with the version you would like to upgrade to. +Replace `x.y.z` with the version you would like to upgrade to. {product-proxy} example: [source,bash] ---- -zdm_proxy_image::{imagesprefix} datastax/zdm-proxy:2.1.0 +zdm_proxy_image: datastax/zdm-proxy:2.1.0 ---- Then run the same playbook as above, with the following command: diff --git a/modules/ROOT/pages/metrics.adoc b/modules/ROOT/pages/metrics.adoc index 17fd29d0..45904200 100644 --- a/modules/ROOT/pages/metrics.adoc +++ b/modules/ROOT/pages/metrics.adoc @@ -1,7 +1,5 @@ = Leverage metrics provided by {product-proxy} :page-tag: migration,zdm,zero-downtime,metrics -ifdef::env-github,env-browser,env-vscode[:imagesprefix: ../images/] -ifndef::env-github,env-browser,env-vscode[:imagesprefix: ] This topic provides detailed information about the metrics captured by the {product-proxy} and explains how to interpret the metrics. @@ -29,7 +27,7 @@ There are three groups of metrics in this dashboard: * Node level metrics * Asynchronous read requests metrics -image::{imagesprefix}zdm-grafana-proxy-dashboard1.png[Grafana dashboard shows three categories of {product-short} metrics for the proxy.] +image::zdm-grafana-proxy-dashboard1.png[Grafana dashboard shows three categories of {product-short} metrics for the proxy.] 
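If you prefer to check the raw data behind these dashboards, each {product-proxy} instance exposes its metrics over HTTP on the metrics port (typically 14001, as noted elsewhere in this guide). The endpoint path below is an assumption based on the usual Prometheus convention, and the host name is a placeholder.

[source,bash]
----
# Fetch the first few Prometheus-format metrics lines from a proxy instance.
curl -s http://<zdm_proxy_host>:14001/metrics | head -n 20
----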
=== Proxy-level metrics @@ -57,9 +55,9 @@ You can set the interval in the `Error Rate interval` dashboard variable at the ** Read Failure Rate: one `cluster` label with two settings, `origin` and `target`. The label that contains data depends on which cluster is currently considered the primary, the same as the latency and throughput metrics explained above. ** Write Failure Rate: one `failed_on` label with three settings, `origin`, `target`, and `both`. -*** `failed_on=origin`: the write request failed on Origin ONLY. -*** `failed_on=target`: the write request failed on Target ONLY. -*** `failed_on=both`: the write request failed on BOTH clusters. +*** `failed_on=origin`: the write request failed on the origin only. +*** `failed_on=target`: the write request failed on the target only. +*** `failed_on=both`: the write request failed on both the origin and target clusters. * Request Failure Counters: Number of total request failures (resets when the {product-proxy} instance is restarted) ** Connect Failure Counters: the same labels as the connect failure rate. @@ -72,17 +70,17 @@ To see error metrics by error type, see the node-level error metrics on the next === Node-level metrics * Latency: metrics on this bucket are not split by request type like the proxy level latency metrics so writes and reads are mixed together: -** Origin: latency measured by the {product-proxy} up to the point it received a response from the Origin connection. -** Target: latency measured by the {product-proxy} up to the point it received a response from the Target connection. +** Origin: latency measured by the {product-proxy} up to the point it received a response from the origin connection. +** Target: latency measured by the {product-proxy} up to the point it received a response from the target connection. * Throughput: same as node level latency metrics, reads and writes are mixed together. -* Number of connections per Origin node and per Target node. +* Number of connections per origin node and per target node. * Number of Used Stream Ids: -** Tracks the total number of used xref:manage-proxy-instances.adoc#zdm_proxy_max_stream_ids[stream ids] ("request ids") per connection type (Origin, Target and Async). +** Tracks the total number of used xref:manage-proxy-instances.adoc#zdm_proxy_max_stream_ids[stream ids] ("request ids") per connection type (`Origin`, `Target`, and `Async`). -* Number of errors per error type per Origin node and per Target node. +* Number of errors per error type per origin node and per target node. Possible values for the `error` type label: + ** `error=client_timeout` @@ -104,8 +102,8 @@ These metrics track: * Latency. * Throughput. -* Number of dedicated connections per node for async reads: whether it's Origin or Target connections depends on the {product-proxy} configuration. -That is, if the primary cluster is Origin, then the asynchronous reads are sent to Target. +* Number of dedicated connections per node for async reads: whether it's origin or target connections depends on the {product-proxy} configuration. +That is, if the primary cluster is the origin cluster, then the asynchronous reads are sent to the target cluster. * Number of errors per error type per node. === Insights via the {product-proxy} metrics @@ -121,7 +119,7 @@ Some examples of problems manifesting on these metrics: This dashboard in Grafana is not as important as the {product-proxy} dashboard. However, it may be useful to troubleshoot performance issues. 
Here you can see memory usage, Garbage Collection (GC) duration, open fds (file descriptors - useful to detect leaked connections), and the number of goroutines: -image::{imagesprefix}zdm-golang-dashboard.png[Golang metrics dashboard example is shown.] +image::zdm-golang-dashboard.png[Golang metrics dashboard example is shown.] Some examples of problem areas on these Go runtime metrics: diff --git a/modules/ROOT/pages/migrate-and-validate-data.adoc b/modules/ROOT/pages/migrate-and-validate-data.adoc index e9dc712b..67d2b0fa 100644 --- a/modules/ROOT/pages/migrate-and-validate-data.adoc +++ b/modules/ROOT/pages/migrate-and-validate-data.adoc @@ -1,90 +1,54 @@ = Phase 2: Migrate and validate data :page-tag: migration,zdm,zero-downtime,validate-data -ifdef::env-github,env-browser,env-vscode[:imagesprefix: ../images/] -ifndef::env-github,env-browser,env-vscode[:imagesprefix: ] -This topic introduces two open-source data migration tools that you can use during Phase 2 of your migration project. +In Phase 2 of data migration, you migrate data from the origin to the target, and then validate the migrated data. -For full details, see these topics: - -* xref:cassandra-data-migrator.adoc[{cass-migrator}] -* xref:dsbulk-migrator.adoc[{dsbulk-migrator}] - -These tools provide sophisticated features that help you migrate your data from any {cass-short} **Origin** ({cass}, {dse-short}, or {astra-db}) to any {cass-short} **Target** ({cass}, {dse-short}, or {astra-db}). - -//include::partial$lightbox-tip.adoc[] - -image::{imagesprefix}migration-phase2ra.png[Phase 2 diagram shows using tools to migrate data from Origin to Target.] +image::migration-phase2ra.png[In ZDM Phase 2, you migrate data from the origin cluster to the target cluster.] //For illustrations of all the migration phases, see the xref:introduction.adoc#_migration_phases[Introduction]. -== What's the difference between these data migration tools? - -In general: +This topic introduces data migration tools that you can use during Phase 2 of your migration project: -* {cass-migrator} ({cass-migrator-short}) is the best choice to migrate large data quantities, and where detailed logging, data verifications, table column renaming (if needed), and reconciliation options are provided. +{cass-migrator} ({cass-migrator-short}):: +Best for migrating large amounts of data and for migrations that need support for detailed logging, data verification, table column renaming, and reconciliation. -* {dsbulk-migrator} leverages {dsbulk-loader} to perform the data migration, and provides new commands specific to migrations. {dsbulk-migrator} is ideal for simple migration of smaller data quantities, and where data validation (other than post-migration row counts) is not necessary. +{dsbulk-migrator}:: +Extends {dsbulk-loader} with migration-specific commands. Best for simple migration of smaller amounts of data quantities, and migrations that don't require support for data validation during the migration. -== Open-source repos with essential data migration tools +{sstable-sideloader}:: +Exclusively for migrations from a {cass-reg}, {dse}, or {hcd} cluster to an {astra-db} database. +You can use {cass-migrator-short} to validate data after the migration. -Refer to the following GitHub repos: +[[cass-migrator-key-features]] +== {cass-migrator} -* https://github.com/datastax/cassandra-data-migrator[{cass-migrator}] repo. 
+{cass-migrator-short} offers extensive functionality and configuration options to support large and complex migrations as well as post-migration data validation. -* https://github.com/datastax/dsbulk-migrator[{dsbulk-migrator}] repo. +For more information, see xref:ROOT:cassandra-data-migrator.adoc[] and the https://github.com/datastax/cassandra-data-migrator[{cass-migrator} repository]. -A number of helpful assets are provided in each repo. +[[dsbulk-migrator-key-features]] +== {dsbulk-migrator} -In particular, the {cass-migrator-short} repo provides two configuration templates, with embedded comments and default values, which you can customize to match your data migration's requirements: +{dsbulk-migrator}, which is based on {dsbulk-loader}, is best for migrating smaller amounts of data or when you can shard data from table rows into more manageable quantities. -* https://github.com/datastax/cassandra-data-migrator/blob/main/src/resources/cdm.properties[cdm.properties] provides a subset of configuration options with commonly required settings. +{dsbulk-migrator} provides the following commands: -* https://github.com/datastax/cassandra-data-migrator/blob/main/src/resources/cdm-detailed.properties[cdm-detailed.properties] with all available options. +* `migrate-live`: Start a live data migration using the embedded version of {dsbulk-loader} or your own {dsbulk-loader} installation. +A live migration means that the data migration starts immediately and is performed by this migrator tool through the specified {dsbulk-loader} installation. -[[cass-migrator-key-features]] -== {cass-migrator} features - -{cass-migrator-short} offers functionalities like bulk export, import, data conversion, mapping of column names between Origin and Target, and validation. -The {cass-migrator-short} capabilities are extensive: - -* Automatic detection of each table's schema - column names, types, keys, collections, UDTs, and other schema items. -* Validation - Log partitions range-level exceptions, use the exceptions file as input for rerun operations. -* Supports migration of Counter tables. -* Preserves writetimes and Time To Live (TTL). -* Validation of advanced data types - Sets, Lists, Maps, UDTs. -* Filter records from Origin using writetimes, and/or CQL conditions, and/or a list of token ranges. -* Guardrail checks, such as identifying large fields. -* Fully containerized support - Docker and Kubernetes friendly. -* SSL support - including custom cipher algorithms. -* Migration/validation from and to Azure Cosmos {cass-short}. -* Validate migration accuracy and performance using a smaller randomized data-set. -* Support for adding custom fixed writetime. - -With new or enhanced capabilities in recent https://github.com/datastax/cassandra-data-migrator/packages/1832128[{cass-migrator-short} v4.x releases]. - -* Column names can differ between Origin and Target. -* UDTs can be migrated from Origin to Target, even when the keyspace names differ. -* Predefined Codecs allow for data type conversion between Origin and Target; you can add custom Codecs. -* Separate Writetime and TTL configuration supported. Writetime columns can differ from TTL columns. -* A subset of columns can be specified with Writetime and TTL: Not all eligible columns need to be used to compute the Origin value. -* Automatic `RandomPartitioner` min/max: Partition min/max values no longer need to be manually configured. 
-* You can populate Target columns with constant values: New columns can be added to the Target table, and populated with constant values.
-* Expand Origin Map Column into Target rows: A Map in Origin can be expanded into multiple rows in Target when the Map key is part of the Target primary key.
-
-For extensive usage and reference details, see xref:cassandra-data-migrator.adoc[{cass-migrator}].
+* `generate-script`: Generate a migration script that you can execute to perform a data migration with your own {dsbulk-loader} installation.
+This command _doesn't_ trigger the migration; it only generates the migration script that you must then execute.
-[[dsbulk-migrator-key-features]]
-== {dsbulk-migrator} features
+* `generate-ddl`: Read the schema from the origin, and then generate CQL files to recreate it in your target {astra-db} database.
-{dsbulk-migrator}, which is based on {dsbulk-loader}, is best for migrating smaller amounts of data, and/or when you can shard data from table rows into more manageable quantities.
+For more information, see xref:ROOT:dsbulk-migrator.adoc[] and the https://github.com/datastax/dsbulk-migrator[{dsbulk-migrator} repository].
-{dsbulk-migrator} provides the following commands:
+== {sstable-sideloader}
-* `migrate-live` starts a live data migration using a pre-existing {dsbulk-loader} installation, or alternatively, the embedded {dsbulk-loader} version. A "live" migration means that the data migration will start immediately and will be performed by this migrator tool through the desired {dsbulk-loader} installation.
+{sstable-sideloader} is a service running in {astra-db} that directly imports data from snapshot backups that you've uploaded to {astra-db} from an existing {cass-short}, {dse-short}, or {hcd-short} cluster.
-* `generate-script` generates a migration script that, once executed, will perform the desired data migration, using a pre-existing {dsbulk-loader} installation. Please note: this command does not actually migrate the data; it only generates the migration script.
+Because it imports data directly, {sstable-sideloader} can offer several advantages over CQL-based tools like {dsbulk-migrator} and {cass-migrator}, including faster, more cost-effective data loading, and minimal performance impacts on your origin cluster and target database.
-* `generate-ddl` reads the schema from Origin and generates CQL files to recreate it in an {astra-db} cluster used as Target.
+{sstable-sideloader} uses the {astra} {devops-api}, your cloud provider's CLI, and `nodetool`.
-For extensive usage and reference details, see xref:dsbulk-migrator.adoc[{dsbulk-migrator}].
+For more information, see xref:sideloader:sideloader-overview.adoc[].
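As a rough illustration of where `nodetool` fits in, a {sstable-sideloader} migration starts from snapshot backups taken on the origin nodes. The snapshot tag and keyspace name below are placeholders; follow the {sstable-sideloader} documentation linked above for the exact snapshot, upload, and import procedure.

[source,bash]
----
# Create a snapshot of the keyspace on an origin node (run on each node).
nodetool snapshot -t zdm_example_snapshot my_keyspace
----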
\ No newline at end of file diff --git a/modules/ROOT/pages/phase1.adoc b/modules/ROOT/pages/phase1.adoc index 4f23c9d6..d2a61a01 100644 --- a/modules/ROOT/pages/phase1.adoc +++ b/modules/ROOT/pages/phase1.adoc @@ -1,7 +1,5 @@ = Phase 1: Deploy {product-proxy} and connect client applications :page-tag: migration,zdm,zero-downtime,deploy,zdm-proxy,connect-apps -ifdef::env-github,env-browser,env-vscode[:imagesprefix: ../images/] -ifndef::env-github,env-browser,env-vscode[:imagesprefix: ] This section presents the following: @@ -11,8 +9,6 @@ This section presents the following: * xref:connect-clients-to-proxy.adoc[] * xref:manage-proxy-instances.adoc[] -//include::partial$lightbox-tip.adoc[] - -image::{imagesprefix}migration-phase1ra.png[Phase 1 diagram shows deployed {product-proxy} instances, client app connections to proxies, and Target is setup.] +image::migration-phase1ra.png[Phase 1 diagram shows deployed {product-proxy} instances, client app connections to proxies, and the target cluster is setup.] //For illustrations of all the migration phases, see the xref:introduction.adoc#_migration_phases[Introduction]. diff --git a/modules/ROOT/pages/preliminary-steps.adoc b/modules/ROOT/pages/preliminary-steps.adoc index 068dcde3..356d8aa9 100644 --- a/modules/ROOT/pages/preliminary-steps.adoc +++ b/modules/ROOT/pages/preliminary-steps.adoc @@ -1,7 +1,5 @@ = Preliminary steps :page-tag: migration,zdm,zero-downtime,preliminary-steps -ifdef::env-github,env-browser,env-vscode[:imagesprefix: ../images/] -ifndef::env-github,env-browser,env-vscode[:imagesprefix: ] Before starting your migration, verify that you met the prerequisites and performed the preliminary tasks as documented in: diff --git a/modules/ROOT/pages/release-notes.adoc b/modules/ROOT/pages/release-notes.adoc deleted file mode 100644 index a8d6d14f..00000000 --- a/modules/ROOT/pages/release-notes.adoc +++ /dev/null @@ -1,127 +0,0 @@ -= {company} {product} Release Notes -:navtitle: Release notes -:page-tag: migration,zdm,zero-downtime,release-notes -ifdef::env-github,env-browser,env-vscode[:imagesprefix: ../images/] -ifndef::env-github,env-browser,env-vscode[:imagesprefix: ] - -== {product-automation} 2.3.0 update - -**03 February 2023** - -Released {product-automation} 2.3.0, which enables ansible scripts and terraform to work with both Ubuntu and RedHat-family Linux distributions. -Documentation updates included the following in the xref:deployment-infrastructure.adoc#_machines[Machines] section of the Deployment and infrastructure considerations topic: - -"Ubuntu Linux 20.04 or newer, RedHat Family Linux 7 or newer" - -== {product-automation} 2.2.0 update - -**31 January 2023** - -Starting in version 2.2.0 of the {product-automation}, we added the `zdm_proxy_cluster_config.yml` file to contain all the configuration variables for Origin and Target. -Prior to version 2.2.0, the variables were in the `zdm_proxy_core_config.yml` file. - -[TIP] -==== -This change is backward compatible. -If you previously populated the variables in `zdm_proxy_core_config.yml`, these variables will be honored and take precedence over any variables in `zdm_proxy_cluster_config.yml`, if both files are present. -==== - -We encourage existing 2.x {product-short} users to upgrade to the 2.3.0 version of {product-automation}. -To do so, simply `git pull` the `main` branch of https://github.com/datastax/zdm-proxy-automation from within the Ansible Control Host container. 
-You can also check out a https://github.com/datastax/zdm-proxy-automation/releases/tag/v2.3.0[specific tag], such as 2.3.0. - -For more about the YML files used to configure access to your clusters, see xref:deploy-proxy-monitoring.adoc#_configure_the_zdm_proxy[this topic]. - -[NOTE] -==== -The latest {product-proxy} version is 2.1.0. -The latest {product-automation} version is 2.3.1. -==== - -If you are using a {product-automation} version up to and including 2.1.0, please use `zdm_proxy_core_config.yml` to configure access to your clusters. - -== {product-short} 2.1.0 release - -**13 January 2023** - -The {product-short} 2.1.0 release adds {product-proxy} heartbeat functionality and provides several bug fixes. - -The periodic heartbeat feature in 2.1.0 has been implemented to keep alive idle cluster connections. - -By default, {product-proxy} now sends heartbeats after 30 seconds of inactivity on a cluster connection. -You can tune the heartbeat interval with the Ansible configuration variable `heartbeat_insterval_ms`, or by directly setting the `ZDM_HEARTBEAT_INTERVAL_MS` environment variable if you do not use the {product-automation}. - -{company} strongly recommends that you use version 2.1.0 (or newer) to benefit from this improvement, especially if you have a read-only workload. - -To verify which {product-proxy} version you're running, see this xref:troubleshooting-tips.adoc#_how_to_identify_the_zdm_proxy_version[topic]. - -To find out how to upgrade an existing {product-proxy} deployment, see xref:manage-proxy-instances.adoc#_upgrade_the_proxy_version[Upgrade the proxy version]. - -=== {product-proxy} 2.1.0 changes - -For the latest information about {product-proxy} new features and other changes, please refer to these GitHub-hosted documents in the open-source {product-proxy} repo: - -* https://github.com/datastax/zdm-proxy/blob/main/RELEASE_NOTES.md[RELEASE_NOTES] - -* https://github.com/datastax/zdm-proxy/blob/main/CHANGELOG/CHANGELOG-2.1.md[CHANGELOG 2.1] - -=== {product-short} 2.1.0 documentation updates - -The following topics have been updated for the 2.1.0 release: - -* xref:feasibility-checklists.adoc#_read_only_applications[Feasibility checks for read-only applications]. -See the notes indicating that this issue is solved by the {product-proxy} 2.1.0 release. - -* xref:manage-proxy-instances.adoc#change-mutable-config-variable[Change a mutable configuration variable]. -See the `heartbeat_interval_ms` and `zdm-proxy_max_stream_ids` information. - -* xref:troubleshooting-scenarios.adoc#_async_read_timeouts_stream_id_map_exhausted[Async read timeouts]. -See the clarification in the *Workaround* section indicating that this issue is solved by the {product-proxy} 2.1.0 release. - -* xref:metrics.adoc#_node_level_metrics[Node-level metrics]. -See the "Number of Used Stream Ids" section. - - -== {product-short} 2.0.0 release - -**18 October 2022** - -=== {product-proxy} 2.0.0 changes - -This 2.0.0 version marks the public release of the self-service {company} {product} product suite. - -The following GitHub repos are public. -You are welcome to read the source and submit feedback via GitHub Issues per repo. - -* https://github.com/datastax/zdm-proxy[{product-proxy}] open-source repo: in addition to sending feedback, you may submit Pull Requests (PRs) for potential inclusion, provided you accept the https://cla.datastax.com/[{company} Contributor License Agreement (CLA)]. -For more information, see xref:contributions.adoc[]. 
- -* https://github.com/datastax/zdm-proxy-automation[{product-automation}] repo for Ansible-based {product-proxy} automation. - -* https://github.com/datastax/dsbulk-migrator[{dsbulk-migrator}] repo for migration of smaller data quantities. - -* https://github.com/datastax/cassandra-data-migrator[{cass-migrator}] repo for migration of larger data quantities and where detailed verifications and reconciliation options are needed. - -include::partial$note-downtime.adoc[] - -For the latest information about {product-proxy} new features and other changes, please refer to the GitHub-hosted https://github.com/datastax/zdm-proxy/blob/main/RELEASE_NOTES.md[RELEASE_NOTES] in the open-source {product-proxy} repo. -The document includes CHANGELOG links for each {product-proxy} `N.n` release. - -[TIP] -==== -The {product} process requires you to be able to perform rolling restarts of your client applications during the migration. -This is standard practice for client applications that are deployed over multiple instances and is a widely used approach to roll out releases and configuration changes. -==== - - -=== {product-short} 2.0.0 documentation updates - -Starting with the 2.0.0 version on 18-Oct-2022, the {product} documentation set is available online, starting xref:introduction.adoc[here]. - -== Supported releases - -include::partial$supported-releases.adoc[] - -== Migration scenarios - -include::partial$migration-scenarios.adoc[] diff --git a/modules/ROOT/pages/rollback.adoc b/modules/ROOT/pages/rollback.adoc index 2cd26090..10c6daff 100644 --- a/modules/ROOT/pages/rollback.adoc +++ b/modules/ROOT/pages/rollback.adoc @@ -1,20 +1,17 @@ = Understand the rollback options :navtitle: Understand rollback options :page-tag: migration,zdm,zero-downtime,rollback -ifdef::env-github,env-browser,env-vscode[:imagesprefix: ../images/] -ifndef::env-github,env-browser,env-vscode[:imagesprefix: ] -At any point during the migration process until the very last phase, if you hit any unexpected issue and need to (in effect) "rollback" the migration, you can always easily revert your client applications to connect directly to Origin. +At any point during the migration process until the very last phase, if you hit any unexpected issue and need to (in effect) "rollback" the migration, you can always easily revert your client applications to connect directly to the origin cluster. The migration can be started from scratch once the issue has been addressed. -//include::partial$lightbox-tip-all-phases.adoc[] +image::migration-all-phases.png[Migration phases from start to finish.] -image::{imagesprefix}migration-all-phases.png[Migration phases from start to finish.] +After moving your client applications off the {product-proxy} instances (Phase 5), writes are no longer sent to both the origin and target clusters. +The data on origin cluster is no longer kept up-to-date, and you lose this seamless rollback option. +This is the point at which you commit to using the target cluster permanently. +The {product-proxy} deployment can be destroyed, and the origin cluster is no longer needed by the client applications that have been migrated. -After moving your client applications off the {product-proxy} instances (Phase 5), writes are no longer sent to both Origin and Target clusters: the data on Origin is no longer kept up-to-date, and you lose this seamless rollback option. -This is the point at which you commit to using Target permanently. 
-The {product-proxy} deployment can be destroyed, and Origin is no longer needed by the client applications that have been migrated. - -However, should you decide to move back to Origin at a later point, or move to a new cluster entirely, you can simply execute the same migration process. -In this case, the new Origin will now be the former Target, and the new Target will be whatever cluster you wish to migrate to (which could even be the former Origin). +However, should you decide to move back to the origin cluster later, or if you want to move to a new cluster entirely, you can rerun the same migration process. +In this case, you use your original target cluster as the new origin cluster, and you set the new target cluster to whatever cluster you want to migrate to (which could even be the original ancestor origin cluster). diff --git a/modules/ROOT/pages/setup-ansible-playbooks.adoc b/modules/ROOT/pages/setup-ansible-playbooks.adoc index a4082bfd..6ae409e1 100644 --- a/modules/ROOT/pages/setup-ansible-playbooks.adoc +++ b/modules/ROOT/pages/setup-ansible-playbooks.adoc @@ -1,7 +1,5 @@ = Set up the {product-automation} with {product-utility} :page-tag: migration,zdm,zero-downtime,zdm-automation,zdm-proxy,ansible -ifdef::env-github,env-browser,env-vscode[:imagesprefix: ../images/] -ifndef::env-github,env-browser,env-vscode[:imagesprefix: ] This page explains how to use the {product-utility} to set up the Ansible Control Host container for the {product-automation}. @@ -20,7 +18,7 @@ This is a Golang (Go) executable program that runs anywhere. This utility prompts you for a few configuration values, with helpful embedded explanations and error handling, then automatically creates the Ansible Control Host container ready for you to use. From this container, you will be able to easily configure and run the {product-automation} Ansible playbooks. -image::{imagesprefix}docker-container-and-zdm-utility.png[{product-proxy} connections from Docker container created by {product-utility}] +image::docker-container-and-zdm-utility.png[{product-proxy} connections from Docker container created by {product-utility}] == Prerequisites @@ -221,7 +219,7 @@ They report connection or protocol errors, but do not give you enough informatio Metrics, however, provide especially helpful data and the graphs show you how they vary over time. The monitoring stack ships with preconfigured Grafana dashboards that are automatically set up as part of the monitoring deployment. -For details about the metrics you can observe in these preconfigured Grafana dashboards, see xref:troubleshooting-tips.adoc#how-to-leverage-metrics[this section] of the troubleshooting tips. +For details about the metrics you can observe in these preconfigured Grafana dashboards, see xref:ROOT:metrics.adoc[]. ==== + You can choose to deploy the monitoring stack on the jumphost or on a different machine, as long as it can connect to the {product-proxy} instances over TCP on ports 9100 (to collect host-level metrics) and on the port on which the {product-proxy} exposes its own metrics, typically 14001. @@ -241,7 +239,7 @@ At this point, the {product-utility}: * Presents a summary of the configuration thus far, and prompts you to Continue. Example: -image::{imagesprefix}zdm-go-utility-results3.png[A summary of the configuration provided is displayed in the terminal] +image::zdm-go-utility-results3.png[A summary of the configuration provided is displayed in the terminal] If you agree, enter `Y` to proceed. 
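Before deploying the monitoring stack, you can confirm that the machine you plan to use can reach the {product-proxy} instances on the required ports: 9100 for host-level metrics and, typically, 14001 for the proxy's own metrics. A minimal sketch with a placeholder host name:

[source,bash]
----
# Check TCP connectivity from the monitoring machine to one proxy instance.
nc -zv <zdm_proxy_host> 9100
nc -zv <zdm_proxy_host> 14001
----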
@@ -251,7 +249,7 @@ The {product-utility} now: * Creates, configures and starts the Ansible Control Host container. * Displays a message. Example: -image::{imagesprefix}zdm-go-utility-success3.png[Ansible Docker container success messages] +image::zdm-go-utility-success3.png[Ansible Docker container success messages] [NOTE] ==== diff --git a/modules/ROOT/pages/tls.adoc b/modules/ROOT/pages/tls.adoc index 349eebab..05ac7e64 100644 --- a/modules/ROOT/pages/tls.adoc +++ b/modules/ROOT/pages/tls.adoc @@ -1,8 +1,6 @@ = Configure Transport Layer Security (TLS) :navtitle: Configure Transport Layer Security :page-tag: migration,zdm,zero-downtime,tls,transport-layer,zdm-proxy -ifdef::env-github,env-browser,env-vscode[:imagesprefix: ../images/] -ifndef::env-github,env-browser,env-vscode[:imagesprefix: ] {product-proxy} supports proxy-to-cluster and application-to-proxy TLS encryption. @@ -16,7 +14,7 @@ See the information here in this topic, and then refer to the {product-automatio * All TLS configuration is optional. Enable TLS between the {product-proxy} and any cluster that requires it, and/or between your client application and the {product-proxy} if required. -* Proxy-to-cluster TLS can be configured between the {product-proxy} and Origin/Target (either or both) as desired. +* Proxy-to-cluster TLS can be configured between the {product-proxy} and either or both the origin and target clusters, as desired. Each set of configurations is independent of the other. When using proxy-to-cluster TLS, the {product-proxy} acts as the TLS client and the cluster as the TLS server. One-way TLS and Mutual TLS are both supported and can be enabled depending on each cluster's requirements. @@ -29,7 +27,7 @@ This is done through the Secure Connect Bundle (SCB) and does not require any ex [[_retrieving_files_from_a_jks_keystore]] == Retrieving files from a JKS keystore -If you are already using TLS between your client application and Origin, the files needed to configure TLS will already be used in the client application's configuration (TLS client files) and Origin's configuration (TLS Server files). +If you are already using TLS between your client application and the origin cluster, then the files needed to configure TLS will already be used in the client application's configuration (TLS client files) and the origin's configuration (TLS Server files). In some cases, these files may be contained in a JKS keystore. The {product-proxy} does not accept a JKS keystore, requiring the raw files instead. @@ -64,7 +62,7 @@ The files required to configure proxy-to-cluster TLS are: [TIP] ==== -If your Origin cluster requires TLS, your client application will already be using these files in its configuration to connect to it. +If your origin cluster requires TLS, your client application will already be using these files in its configuration to connect to it. ==== [NOTE] @@ -74,32 +72,49 @@ All files must be in plain-text, non-binary format. === Prepare the TLS files and copy them to the Ansible Control Host container -For each self-managed cluster requiring TLS (Origin and/or Target), execute the following steps: +For each self-managed origin or target cluster that requires TLS, do the following: -* If your TLS files are in a JKS keystore, extract them as plain text (see xref:tls.adoc#_retrieving_files_from_a_jks_keystore[]). -* Upload the following files to the jumphost: -** For one-way TLS, only the server CA. -** For Mutual TLS, the server CA, the client cert and the client key. 
-* From a shell on the jumphost, copy the files to the relevant TLS directory into the Ansible Control Host container:
-** For Origin: `docker cp zdm-ansible-container:/home/ubuntu/origin_tls_files`
-** For Target: `docker cp zdm-ansible-container:/home/ubuntu/target_tls_files`
+. If your TLS files are in a JKS keystore, extract them as plain text (see xref:tls.adoc#_retrieving_files_from_a_jks_keystore[]).
+
+. Upload the following files to the jumphost:
++
+* For one-way TLS, upload only the server CA.
+* For Mutual TLS, upload the server CA, the client cert, and the client key.
-There is a set of TLS configuration variables for Origin and a separate set for Target, so that they can be configured independently as desired.
+. From a shell on the jumphost, copy the files into the relevant TLS directory in the Ansible Control Host container:
++
+* For origin clusters, run: `docker cp <path_to_your_tls_files> zdm-ansible-container:/home/ubuntu/origin_tls_files`
+* For target clusters, run: `docker cp <path_to_your_tls_files> zdm-ansible-container:/home/ubuntu/target_tls_files`
=== Configure TLS
-Here is how to do it:
+There are separate TLS configuration variables for origin and target clusters so that you can configure these independently, if needed.
-* Ensure that you have a shell open to the container.
-If you do not, you can open it with `docker exec -it zdm-ansible-container bash`.
-* From this shell, uncomment the relevant variables and edit their values in the configuration file `zdm-proxy-automation/ansible/vars/zdm_proxy_custom_tls_config.yml` as follows.
-** For proxy-to-Origin TLS configuration:
- *** `origin_tls_user_dir_path`: uncomment and leave to its preset value of `/home/ubuntu/origin_tls_files`.
- *** `origin_tls_server_ca_filename`: filename (without path) of the Server CA.
- *** `origin_tls_client_cert_filename`: filename (without path) of the Client cert. This is for Mutual TLS only, leave unset otherwise.
- *** `origin_tls_client_key_filename`: filename (without path) of the Client key.
- For Mutual TLS only, leave unset otherwise.
-** The configuration for proxy-to-Target TLS is done in the same way but using the Target-specific configuration variables (`target_tls_*`).
+. Open a shell to the container:
++
+[source,bash]
+----
+docker exec -it zdm-ansible-container bash
+----
+
+. Find the custom TLS configuration file at `zdm-proxy-automation/ansible/vars/zdm_proxy_custom_tls_config.yml`.
+
+. Uncomment and set the following variables in the custom TLS configuration file for the proxy-to-origin TLS configuration:
++
+* `origin_tls_user_dir_path`: uncomment and leave at its preset value of `/home/ubuntu/origin_tls_files`.
+* `origin_tls_server_ca_filename`: filename (without path) of the Server CA.
+* `origin_tls_client_cert_filename`: filename (without path) of the Client cert. This is for Mutual TLS only, leave unset otherwise.
+* `origin_tls_client_key_filename`: filename (without path) of the Client key.
+For Mutual TLS only, leave unset otherwise.
+
+. Uncomment and set the variables for the proxy-to-target TLS configuration:
++
+* `target_tls_user_dir_path`: uncomment and leave at its preset value of `/home/ubuntu/target_tls_files`.
+* `target_tls_server_ca_filename`: filename (without path) of the Server CA.
+* `target_tls_client_cert_filename`: filename (without path) of the Client cert.
+This is for Mutual TLS only, leave unset otherwise.
+* `target_tls_client_key_filename`: filename (without path) of the Client key.
+For Mutual TLS only, leave unset otherwise.
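If any of the files referenced in these steps exist only inside a JKS keystore, export them as plain-text PEM files before uploading them, as described in the JKS section earlier on this page. The following is a minimal sketch using standard `keytool` and `openssl` commands; keystore and output file names are placeholders, and you are prompted for the keystore passwords.

[source,bash]
----
# Convert the JKS keystore to PKCS12, then extract the certificate and key as PEM files.
keytool -importkeystore -srckeystore client.jks -destkeystore client.p12 -deststoretype PKCS12
openssl pkcs12 -in client.p12 -nokeys -out client_cert.pem
openssl pkcs12 -in client.p12 -nocerts -nodes -out client_key.pem
----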
== Application-to-proxy TLS @@ -116,11 +131,8 @@ All these files are required for one-way and Mutual TLS. [TIP] ==== -If your Origin cluster currently requires TLS, it will already be using these files for its own TLS configuration. -==== +If your origin cluster currently requires TLS, it will already be using these files for its own TLS configuration. -[TIP] -==== All files must be in plain-text, non-binary format. ==== diff --git a/modules/ROOT/pages/troubleshooting-scenarios.adoc b/modules/ROOT/pages/troubleshooting-scenarios.adoc index 42fda6bf..2b3326f6 100644 --- a/modules/ROOT/pages/troubleshooting-scenarios.adoc +++ b/modules/ROOT/pages/troubleshooting-scenarios.adoc @@ -1,14 +1,12 @@ = Troubleshooting scenarios :page-tag: migration,zdm,zero-downtime,zdm-proxy,troubleshooting -ifdef::env-github,env-browser,env-vscode[:imagesprefix: ../images/] -ifndef::env-github,env-browser,env-vscode[:imagesprefix: ] -Refer the following troubleshooting scenarios for information about resolving common migration issues. -Each section presents: +//TODO: use same format as driver troubleshooting. +//TODO: Remove or hide issues that have been resolved by a later release. -* Symptoms -* Cause -* Solution or Workaround +This page provides troubleshooting advice for specific issues or error messages related to {product}. + +Each section includes symptoms, causes, and suggested solutions or workarounds. == Configuration changes are not being applied by the automation @@ -166,8 +164,8 @@ This error means that at least one of these three sets of credentials is incorre === Solution or Workaround -If the authentication error is preventing the proxy from starting then it's either the Origin or Target credentials that are incorrect or have insufficient permissions. -The log message shows whether it is the Target or Origin handshake that is failing. +If the authentication error is preventing the proxy from starting then it's either the origin or target credentials that are incorrect or have insufficient permissions. +The log message shows whether it is the origin or target handshake that is failing. If the proxy is able to start up -- that is, this message can be seen in the logs: @@ -324,7 +322,7 @@ Restart the client application to force an immediate reconnect. If you expect {product-proxy} instances to go down frequently, change the reconnection policy on the driver so that the interval between reconnection attempts has a shorter limit. -== Error with {astra-db} DevOps API when using the {product-automation} +== Error with {astra} DevOps API when using the {product-automation} === Symptoms @@ -339,7 +337,7 @@ Connection failure: Remote end closed connection without response", "redirected" === Cause -The {astra-db} DevOps API is likely temporarily unavailable. +The {astra} DevOps API is likely temporarily unavailable. === Solution or Workaround @@ -362,21 +360,25 @@ metadata service (Astra) returned not successful status code There are two possible causes for this: -1. The credentials that the {product-proxy} is using for {astra-db} don't have sufficient permissions. -2. The {astra-db} database is hibernated. +* The credentials that the {product-proxy} is using for {astra-db} don't have sufficient permissions. +* The {astra-db} database is hibernated or otherwise unavailable. === Solution or Workaround -Start by opening the {astra-ui} and checking the `Status` of your database. -If it is `Hibernated`, click the “Resume” button and wait for it to become `Active`. 
-If it is `Active` already, then it is likely an issue with permissions.
+In the {astra-ui}, check the xref:astra-db-serverless:databases:database-statuses.adoc[database status].
+
+If the database is not in *Active* status, you might need to take action or wait for the database to return to active status.
+For example, if the database is hibernated, xref:astra-db-serverless:databases:database-statuses.adoc#hibernated[reactivate the database].
+When the database is active again, retry the connection.
 
-We recommend starting with a token that has the Database Administrator role in {astra-db} to confirm that it is a permissions issue.
-Refer to https://docs.datastax.com/en/astra/astra-db-vector/administration/manage-database-access.html[Manage user permissions].
+If the database is in *Active* status, then the issue is likely related to the permissions of your credentials.
+Try using an xref:astra-db-serverless:administration:manage-application-tokens.adoc[application token scoped to a database], specifically a token with the *Database Administrator* role for your target database.
 
 [[_async_read_timeouts_stream_id_map_exhausted]]
 == Async read timeouts / stream id map exhausted
 
+//Supposedly resolved in 2.1.0 release?
+
 === Symptoms
 
 Dual reads are enabled and the following messages are found in the {product-proxy} logs:
@@ -411,6 +413,7 @@ If you find an issue like this please submit an https://github.com/datastax/zdm-
 
 == Client application closed connection errors every 10 minutes when migrating to {astra-db}
 
+//TODO: Remove - resolved in 2.1.0
 [NOTE]
 ====
 This issue is fixed in {product-proxy} 2.1.0. See the Fix section below.
@@ -418,7 +421,7 @@ This issue is fixed in {product-proxy} 2.1.0. See the Fix section below.
 
 === Symptoms
 
-Every 10 minutes a message is logged in the {product-proxy} logs showing a disconnect that was caused by {astra-db}.
+Every 10 minutes a message is logged in the {product-proxy} logs showing a disconnect that was caused by {astra-db}:
 
 [source,log]
 ----
@@ -428,7 +431,7 @@ Every 10 minutes a message is logged in the {product-proxy} logs showing a disco
 
 === Cause
 
 {astra-db} terminates idle connections after 10 minutes of inactivity.
-If a client application is only sending reads through a connection then the Target (i.e. {astra-db} in this case) connection will not get any traffic because {product-short} forwards all reads to the Origin connection.
+If a client application only sends reads through a connection, then the connection to the target cluster, which is an {astra-db} database in this example, won't get any traffic because {product-short} forwards all reads to the origin connection.
 
 === Solution or Workaround
@@ -445,19 +448,19 @@ Consider a case where a user runs separate benchmarks against:
 
 * {astra-db} directly
 * Origin directly
-* {product-short} (with {astra-db} and Origin)
+* {product-short} (with {astra-db} and the origin cluster)
 
-The results of these tests show latency/throughput values are worse with {product-short} than when connecting to {astra-db} or Origin directly.
+The results of these tests show that latency/throughput values are worse with {product-short} than when connecting to {astra-db} or the origin cluster directly.
 
 === Cause
 
 {product-short} will always add additional latency which, depending on the nature of the test, will also result in a lower throughput.
 Whether this performance hit is expected or not depends on the difference between the {product-short} test results and the test results with the cluster that performed the worst.
-Writes in {product-short} require an `ACK` from both clusters while reads only require the result from the Origin cluster (or target if the proxy is set up to route reads to the target cluster).
-This means that if Origin has better performance than Target then {product-short} will inevitably have a worse performance for writes.
+Writes in {product-short} require an `ACK` from both clusters while reads only require the result from the origin cluster (or target if the proxy is set up to route reads to the target cluster).
+This means that if the origin cluster has better performance than the target cluster, then {product-short} will have worse write performance.
 
-From our testing benchmarks, a performance degradation of up to 2x latency is not unheard of even without external factors adding more latency, but it is still worth checking some things that might add additional latency like whether the proxy is deployed on the same Availability Zone (AZ) as the Origin cluster or application instances.
+In our benchmark testing, latency degradation of up to 2x is not unusual, even without external factors adding more latency.
+However, it is still worth checking for conditions that can add latency, such as whether the proxy is deployed in the same Availability Zone (AZ) as the origin cluster or the application instances.
 
 Simple statements and batch statements are things that will make the proxy add additional latency compared to normal prepared statements.
 Simple statements should be discouraged especially with the {product-proxy} because currently the proxy takes a considerable amount of time just parsing the queries and with prepared statements the proxy only has to parse them once.
diff --git a/modules/ROOT/pages/troubleshooting-tips.adoc b/modules/ROOT/pages/troubleshooting-tips.adoc
index b154180e..8e0dd708 100644
--- a/modules/ROOT/pages/troubleshooting-tips.adoc
+++ b/modules/ROOT/pages/troubleshooting-tips.adoc
@@ -1,47 +1,77 @@
 = Troubleshooting tips
 :page-tag: migration,zdm,zero-downtime,zdm-proxy,troubleshooting
-ifdef::env-github,env-browser,env-vscode[:imagesprefix: ../images/]
-ifndef::env-github,env-browser,env-vscode[:imagesprefix: ]
+:page-aliases: ROOT:troubleshooting.adoc
+:description: Get help with {product}.
 
-Refer to the tips on this page for information that can help you troubleshoot issues with your migration.
+This page provides general troubleshooting advice and describes some common issues you might encounter with {product}.
 
-== How to retrieve the {product-proxy} log files
+For specific error messages, see xref:troubleshooting-scenarios.adoc[].
 
-Depending on how you deployed {product-proxy}, there may be different ways to access
-the logs.
-If you used the {product-automation}, see xref:manage-proxy-instances.adoc#_view_the_logs[View the logs] for a quick way
-to view the logs of a single proxy instance.
-Follow the instructions on xref:manage-proxy-instances.adoc#_collect_the_logs[Collect the logs] for a playbook that systematically retrieves all logs by all instances and packages them in a zip archive for later inspection.
+You can also contact your {company} account representative or {support-url}[{company} Support] if you have a https://www.datastax.com/products/luna[Luna service contract].
 
-If you did not use the {product-automation}, you might have to access the logs differently.
-If Docker is used, enter the following command to export the logs of a container to a file: +[#proxy-logs] +== {product-proxy} logs + +The {product-proxy} logs can help you troubleshoot issues with {product}. + +=== Set the {product-proxy} log level + +Set the {product-proxy} log level to print the messages that you need. + +The default log level is `INFO`, which is adequate for most logging. + +If you need more detail for temporary troubleshooting, you can set the log level to `DEBUG`. +However, this can slightly degrade performance, and {company} recommends that you revert to `INFO` logging as soon as possible. + +How you set the log level depends on how you deployed the {product-proxy}: + +* If you used {product-automation} to deploy the {product-proxy}, set `log_level` in `vars/zdm_proxy_core_config.yml`. ++ +You can change this value in a rolling fashion by editing the variable and running the `rolling_update_zdm_proxy.yml` playbook. +For more information, see xref:manage-proxy-instances.adoc#change-mutable-config-variable[Change a mutable configuration variable]. + +* If you didn't use {product-automation} to deploy the {product-proxy}, set the `ZDM_LOG_LEVEL` environment variable on each proxy instance, and then restart each instance. + +=== Retrieve the {product-proxy} log files + +//TODO: Reconcile with manage-proxy-instance.adoc content. + +If you used the {product-automation} to deploy {product-proxy}, then you can get logs for a single proxy instance, and you can use a playbook to retrieve logs for all instances. +For instructions and more information, see xref:ROOT:manage-proxy-instances.adoc#access-the-proxy-logs[Access the proxy logs]. + +If you did not use the {product-automation} to deploy {product-proxy}, you might have to access the logs another way. +For example, if you used Docker, you can use the following command to export a container's logs to a `log.txt` file: [source,bash] ---- docker logs my-container > log.txt ---- -[TIP] -==== -Keep in mind that docker logs are deleted if the container is recreated. -==== +Keep in mind that Docker logs are deleted if the container is recreated. -== What to look for in the logs +=== Message levels -Make sure that the log level of the {product-proxy} is set to the appropriate value: +Some log messages contain text that sounds like an error, but they are not errors. +The message's `level` typically indicates severity: -* If you deployed the {product-proxy} through the {product-automation}, the log level is determined by the variable `log_level` in `vars/zdm_proxy_core_config.yml`. -This value can be changed in a rolling fashion by editing this variable and running the playbook `rolling_update_zdm_proxy.yml`. -For more information, see xref:manage-proxy-instances.adoc#change-mutable-config-variable[Change a mutable configuration variable]. +* `level=debug` and `level=info`: Expected and normal messages that are typically not errors. +However, if you enable `DEBUG` logging, `debug` messages can help you find the source of a problem. + +* `level=warn`: Reports an event that wasn't fatal to the overall process, but could indicate an issue with an individual request or connection. + +* `level=error`: Indicates an issue with the {product-proxy}, client application, or clusters. +These messages require further examination. -* If you did not use the {product-automation} to deploy the {product-proxy}, change the environment variable `ZDM_LOG_LEVEL` on each proxy instance and restart it. 
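+For example, if you exported a container's logs to a `log.txt` file as shown earlier, you can filter for the more severe messages with a standard `grep` (an illustrative example; adjust the file name to match your environment):
+
+[source,bash]
+----
+grep -E 'level=(warn|error)' log.txt
+----
+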
+If the meaning of a `warn` or `error` message isn't clear, you can submit an issue in the https://github.com/datastax/zdm-proxy/issues[{product-proxy} GitHub repository]. -Here are the most common messages you'll find in the proxy logs: +=== Common log messages -=== {product-proxy} startup message +Here are the most common messages in the {product-proxy} logs. -Assuming the Log Level is not filtering out `INFO` entries, you can look for the following type of log message in order to verify that the {product-proxy} is starting up correctly. -Example: +==== {product-proxy} startup message + +If the log level doesn't filter out `info` entries, you can look for a `Proxy started` log message to verify that the {product-proxy} started correctly. +For example: [source,json] ---- @@ -50,41 +80,30 @@ msg=\"Proxy started. Waiting for SIGINT/SIGTERM to shutdown. \"\n","stream":"stderr","time":"2023-01-13T11:50:48.522097083Z"} ---- -=== {product-proxy} configuration +==== {product-proxy} configuration message + +If the log level doesn't filter out `info` entries, the first few lines of a {product-proxy} log file contain all configuration variables and values in a long JSON string. -The first few lines of the {product-proxy} log file contains all the configuration variables and values. -They are printed in a long JSON string format. -You can copy/paste the string into a JSON formatter/viewer to make it easier to read. -Example log message: +For example, this log message has been truncated for readability: [source,json] ---- {"log":"time=\"2023-01-13T11:50:48Z\" level=info msg=\"Parsed configuration: {\\\"ProxyIndex\\\":1,\\\"ProxyAddresses\\\":"...", -[remaining of json string removed for simplicity] +...TRUNCATED... ","stream":"stderr","time":"2023-01-13T11:50:48.339225051Z"} ---- -Seeing the configuration settings is useful while troubleshooting issues. -However, remember to check the log level variable to ensure you're viewing the intended types of messages. -Setting the log level setting to `DEBUG` might cause a slight performance degradation. +Configuration settings can help with troubleshooting. -=== Be aware of current log level +To make this message easier to read, pass it through a JSON formatter or paste it into a text editor that can reformat JSON. -When you find a log message that looks like an error, the most important thing is to check the **log level** of that message. +==== Protocol log messages -* A log message with `level=debug` or `level=info` is very likely not an error, but something expected and normal. +There are cases where protocol errors are fatal, and they will kill an active connection that was being used to serve requests. +However, it is also possible to get normal protocol log messages that contain wording that sounds like an error. -* Log messages with `level=error` must be examined as they usually indicate an issue with the proxy, the client application, or the clusters. - -* Log messages with `level=warn` are usually related to events that are not fatal to the overall running workload, but may cause issues with individual requests or connections. - -* In general, log messages with `level=error` or `level=warn` should be brought to the attention of {company}, if the meaning is not clear. -In the {product-proxy} GitHub repo, submit a https://github.com/datastax/zdm-proxy/issues[GitHub Issue] to ask questions about log messages of type `error` or `warn` that are unclear. 
- -=== Protocol log messages - -Here's an example of a log message that looks like an error, but it's actually an expected and normal message: +For example, the following `DEBUG` message contains the phrases `force a downgrade` and `unsupported protocol version`, which can sound like errors: [source,json] ---- @@ -94,24 +113,25 @@ to the client to force a downgrade: PROTOCOL (code=Code Protocol [0x0000000A], msg=Invalid or unsupported protocol version (5)).\"\n","stream":"stderr","time":"2023-01-13T12:02:12.379287735Z"} ---- -There are cases where protocol errors are fatal so they will kill an active connection that was being used to serve requests. -However, if you find a log message similar to the example above with log level `debug`, then it's likely not an issue. -Instead, it's more likely an expected part of the handshake process during the connection initialization; that is, the normal protocol version negotiation. - -[[_how_to_identify_the_zdm_proxy_version]] -== How to identify the {product-proxy} version +However, `level=debug` indicates that this is not an error. +Instead, this is a normal part of protocol version negotiation (handshake) during connection initialization. -In the {product-proxy} logs, the first message contains the version string (just before the message that shows the configuration): +[#check-version] +== Check your {product-proxy} version +//TODO: Possibly duplicated on manage-proxy-instances.html#_upgrade_the_proxy_version +In the {product-proxy} logs, the first message contains the version string: [source,console] ---- time="2023-01-13T13:37:28+01:00" level=info msg="Starting ZDM proxy version 2.1.0" -time="2023-01-13T13:37:28+01:00" level=info msg="Parsed configuration: {removed for simplicity}" +time="2023-01-13T13:37:28+01:00" level=info msg="Parsed configuration: ..." ---- -You can also provide a `-version` command line parameter to the {product-proxy} and it will only print the version. -Example: +This message is logged immediately before the long `Parsed configuration` string. + +You can also pass the `-version` flag to the {product-proxy} to print the version. +For example, you can use the following Docker command: [source,bash] ---- @@ -119,43 +139,65 @@ docker run --rm datastax/zdm-proxy:2.x -version ZDM proxy version 2.1.0 ---- -[TIP] +[IMPORTANT] ==== -Do not use `--rm` when actually launching the {product-proxy} otherwise you will not be able to access the logs when it stops (or crashes). +Don't use `--rm` when you launch the {product-proxy} container. +This flag will prevent you from accessing the logs when {product-proxy} stops or crashes. ==== -[#how-to-leverage-metrics] -== How to leverage the metrics provided by {product-proxy} +== Report an issue + +To report an issue or get additional support, submit an issue in the {product-short} component GitHub repositories: -See xref:metrics.adoc[]. +* https://github.com/datastax/zdm-proxy/issues[{product-proxy} repository] +* https://github.com/datastax/zdm-proxy-automation/issues[{product-automation} repository] (includes {product-automation} and the {product-utility}) +* https://github.com/datastax/cassandra-data-migrator/issues[{cass-migrator} repository] +* https://github.com/datastax/dsbulk-migrator/issues[{dsbulk-migrator} repository] -== Reporting an issue +[IMPORTANT] +==== +These repositories are public. + +Don't include any proprietary or private information in issues, pull requests, or comments that you make in these repositories. 
+==== -If you encounter a problem during your migration, please contact us. -In the {product-proxy} GitHub repo, submit a https://github.com/datastax/zdm-proxy/issues[GitHub Issue]. -Only to the extent that the issue's description does not contain **your proprietary or private** information, please include the following: +In the issue description, include as much of the following information as possible, and make sure to remove all proprietary and private information before submitting the issue: -* {product-proxy} version (see xref:_how_to_identify_the_zdm_proxy_version[here]). -* {product-proxy} logs: ideally at `debug` level if you can reproduce the issue easily and can tolerate a restart of the proxy instances to apply the configuration change. -* Version of database software on the Origin and Target clusters (relevant for {dse-short} and {cass} deployments only). -* If {astra-db} is being used, please let us know in the issue description. -* Screenshots of the {product-proxy} metrics dashboards from Grafana or whatever visualization tool you use. -If you can provide a way for us to access those metrics directly that would be even better. -* Application/Driver logs. -* Driver and version that the client application is using. +* Your <>. -=== Reporting a performance issue +* <>, ideally at `DEBUG` level, if you can easily reproduce the issue and tolerate restarting the proxy instances to apply the log level configuration change. -If the issue is related to performance, troubleshooting can be more complicated and dynamic. -Because of this we request additional information to be provided which usually comes down to the answers to a few questions (in addition to the information from the prior section): +* Database deployment type ({dse-short}, {hcd-short}, {cass-short}, or {astra-db}) and version for the origin and target clusters. +The version isn't required for {astra-db}. + +* Screenshots of the xref:ROOT:metrics.adoc[{product-proxy} metrics] dashboards from Grafana or your chosen visualization tool. ++ +Direct read access to your metrics dashboard is preferred, if permitted by your security policy. +This is particularly helpful for performance-related issues. + +* Client application and driver logs. + +* The driver language and version that the client application is using. + +For performance-related issues, provide the following additional information: + +* Which statement types (simple, prepared, batch) do you use? + +* If you use batch statements: ++ +** Which driver API do you use to create these batches? +** Are you passing a `BEGIN BATCH` CQL query string to a simple/prepared statement, or do you use the actual batch statement objects that the drivers allow you to create? -* Which statement types are being used: simple, prepared, batch? -* If batch statements are being used, which driver API is being used to create these batches? -Are you passing a `BEGIN BATCH` cql query string to a simple/prepared statement? -Or are you using the actual batch statement objects that drivers allow you to create? * How many parameters does each statement have? + * Is CQL function replacement enabled? -You can see if this feature is enabled by looking at the value of the Ansible advanced configuration variable `replace_cql_functions` if using the automation, or the environment variable `ZDM_REPLACE_CQL_FUNCTIONS` otherwise. -CQL function replacement is disabled by default. -* If permissible within your security rules, please provide us access to the {product-proxy} metrics dashboard. 
-Screenshots are fine but for performance issues it is more helpful to have access to the actual dashboard so the team can use all the data from these metrics in the troubleshooting process. \ No newline at end of file +This feature is disabled by default. +To determine if this feature is enabled, check the following variables: ++ +** If you use {product-automation}, check the Ansible advanced configuration variable `replace_cql_functions`. +** If you don't use {product-automation}, check the environment variable `ZDM_REPLACE_CQL_FUNCTIONS`. + +== See also + +* xref:ROOT:troubleshooting-scenarios.adoc[] +* xref:ROOT:metrics.adoc[] \ No newline at end of file diff --git a/modules/ROOT/pages/troubleshooting.adoc b/modules/ROOT/pages/troubleshooting.adoc deleted file mode 100644 index 59d5edf4..00000000 --- a/modules/ROOT/pages/troubleshooting.adoc +++ /dev/null @@ -1,22 +0,0 @@ -= Troubleshooting -:page-tag: migration,zdm,zero-downtime,zdm-proxy,troubleshooting -ifdef::env-github,env-browser,env-vscode[:imagesprefix: ../images/] -ifndef::env-github,env-browser,env-vscode[:imagesprefix: ] - -The troubleshooting information for {product} is organized as follows: - -* xref:troubleshooting-tips.adoc[] - general advice and some common issues you may encounter. -* xref:troubleshooting-scenarios.adoc[] - specific descriptions of error conditions, and advice on how to solve the issues. - -[TIP] -==== -If you still have questions, please submit a GitHub Issue in the relevant public repo: - -* https://github.com/datastax/zdm-proxy/issues[{product-proxy}]. -* https://github.com/datastax/zdm-proxy-automation/issues[{product-automation}], which includes the {product-utility}. -* https://github.com/datastax/cassandra-data-migrator/issues[{cass-migrator}]. -* https://github.com/datastax/dsbulk-migrator/issues[{dsbulk-migrator}]. - -You may also contact your {company} account representative or {support-url}[{company} Support] if you have a Luna service contract. -https://www.datastax.com/products/luna[Luna] is a subscription to the {cass} support and expertise at {company}. -==== diff --git a/modules/ROOT/partials/auto-correction-parameters.adoc b/modules/ROOT/partials/auto-correction-parameters.adoc deleted file mode 100644 index 9e76ceff..00000000 --- a/modules/ROOT/partials/auto-correction-parameters.adoc +++ /dev/null @@ -1,37 +0,0 @@ -Auto-correction parameters allow {cass-migrator} to correct data differences found between the origin and target clusters when you run the `DiffData` program. -Typically, these parameters are run-disabled for "what if" migration testing, and generate a list of data discrepancies. -The reasons for these discrepancies can then be investigated, and if necessary the parameters below can be enabled. - -For information about invoking `DiffData` in a {cass-migrator} command, see https://docs.datastax.com/en/data-migration/cdm.html#cdm-validation-steps[{cass-migrator} steps in validation mode]. - -[cols="2,2,3a"] -|=== -|Property | Default | Notes - -| `spark.cdm.autocorrect.missing` -| `false` -| When `true`, data that is missing in the target cluster but is found in the origin cluster is re-migrated to the target cluster. - -| `spark.cdm.autocorrect.mismatch` -| `false` -| When `true`, data that is different between the origin and target clusters is reconciled. -[NOTE] -==== -The `TIMESTAMP` of records may have an effect. 
-If the `WRITETIME` of the origin record that is determined with `.writetime.names` is earlier than the `WRITETIME` of the target record, the change does appear in the target cluster. -This comparative state may be particularly challenging to troubleshoot if individual columns or cells have been modified in the target cluster. -==== - -| `spark.cdm.autocorrect.missing.counter` -| `false` -| Commented out. -By default, counter tables are not copied when missing, unless explicitly set. - -| `spark.tokenrange.partitionFile` -| `./._partitions.csv` -| Commented out. -This CSV file is used as input, as well as output, when applicable. -If the file exists, only the partition ranges in this file are migrated or validated. -Similarly, if exceptions occur while migrating or validating, partition ranges with exceptions are logged to this file. - -|=== \ No newline at end of file diff --git a/modules/ROOT/partials/cassandra-filter-parameters.adoc b/modules/ROOT/partials/cassandra-filter-parameters.adoc deleted file mode 100644 index 2967f853..00000000 --- a/modules/ROOT/partials/cassandra-filter-parameters.adoc +++ /dev/null @@ -1,24 +0,0 @@ -{cass-short} filters are applied on the coordinator node. -Depending on the filter, the coordinator node may need to do a lot more work than is normal, notably because {cass-migrator} specifies `ALLOW FILTERING`. - -By default, these parameters are commented out. - -[cols="3,1,3"] -|=== -|Property | Default | Notes - -| `spark.cdm.filter.cassandra.partition.min` -| `-9223372036854775808` -| Default is `0` when using `RandomPartitioner` and `-9223372036854775808` or -2^63 otherwise. -Lower partition bound of the range is inclusive. - -| `spark.cdm.filter.cassandra.partition.max` -| `9223372036854775807` -| Default is `2^127-1` when using `RandomPartitioner` and `9223372036854775807` or 2^63-1 otherwise. -Upper partition bound of the range is inclusive. - -| `spark.cdm.filter.cassandra.whereCondition` -| -| CQL added to the `WHERE` clause of `SELECT` statements from the origin cluster. - -|=== \ No newline at end of file diff --git a/modules/ROOT/partials/cdm-build-jar-local.adoc b/modules/ROOT/partials/cdm-build-jar-local.adoc deleted file mode 100644 index d205cd50..00000000 --- a/modules/ROOT/partials/cdm-build-jar-local.adoc +++ /dev/null @@ -1,13 +0,0 @@ -Optionally, you can build the {cass-migrator} JAR for local development. You'll need https://maven.apache.org/download.cgi[Maven] 3.9.x. - -Example: - -[source,bash] ----- -cd ~/github -git clone git@github.com:datastax/cassandra-data-migrator.git -cd cassandra-data-migrator -mvn clean package ----- - -The fat jar file, `cassandra-data-migrator-x.y.z.jar`, should be present now in the `target` folder. \ No newline at end of file diff --git a/modules/ROOT/partials/cdm-guardrail-checks.adoc b/modules/ROOT/partials/cdm-guardrail-checks.adoc deleted file mode 100644 index 5802f890..00000000 --- a/modules/ROOT/partials/cdm-guardrail-checks.adoc +++ /dev/null @@ -1,13 +0,0 @@ -Use {cass-migrator} to identify large fields from a table that may break your https://docs.datastax.com/en/astra-db-serverless/cql/cassandra-guardrails.html[cluster guardrails]. -For example, {astra-db} has a 10MB limit for a single large field. -Specify `--class com.datastax.cdm.job.GuardrailCheck` on the command. -Example: - -[source,bash] ----- -./spark-submit --properties-file cdm.properties \ ---conf spark.cdm.schema.origin.keyspaceTable="." 
\ ---conf spark.cdm.feature.guardrail.colSizeInKB=10000 \ ---master "local[*]" --driver-memory 25G --executor-memory 25G \ ---class com.datastax.cdm.job.GuardrailCheck cassandra-data-migrator-x.y.z.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt ----- diff --git a/modules/ROOT/partials/cdm-install-as-container.adoc b/modules/ROOT/partials/cdm-install-as-container.adoc deleted file mode 100644 index 27825330..00000000 --- a/modules/ROOT/partials/cdm-install-as-container.adoc +++ /dev/null @@ -1,3 +0,0 @@ -Get the latest image that includes all dependencies from https://hub.docker.com/r/datastax/cassandra-data-migrator[DockerHub]. - -All migration tools, `cassandra-data-migrator` and `dsbulk` and `cqlsh`, are available in the `/assets/` folder of the container. \ No newline at end of file diff --git a/modules/ROOT/partials/cdm-install-as-jar.adoc b/modules/ROOT/partials/cdm-install-as-jar.adoc deleted file mode 100644 index e94d28e5..00000000 --- a/modules/ROOT/partials/cdm-install-as-jar.adoc +++ /dev/null @@ -1,8 +0,0 @@ -Download the *latest* JAR file from the {cass-migrator} https://github.com/datastax/cassandra-data-migrator/packages/1832128[GitHub repo]. -image:https://img.shields.io/github/v/release/datastax/cassandra-data-migrator?color=green[Latest release] - -[NOTE] -==== -Version 4.x of {cass-migrator} is not backward-compatible with `*.properties` files created in previous versions, and package names have changed. -If you're starting new, use the latest released version if possible. -==== diff --git a/modules/ROOT/partials/cdm-partition-ranges.adoc b/modules/ROOT/partials/cdm-partition-ranges.adoc deleted file mode 100644 index 78a5cbca..00000000 --- a/modules/ROOT/partials/cdm-partition-ranges.adoc +++ /dev/null @@ -1,9 +0,0 @@ -You can also use {cass-migrator} to xref:cdm-steps.adoc#cdm-steps[migrate] or xref:cdm-steps.adoc#cdm-validation-steps[validate] specific partition ranges by passing the below additional parameters. - -[source,bash] ----- ---conf spark.cdm.filter.cassandra.partition.min= ---conf spark.cdm.filter.cassandra.partition.max= ----- - -This mode is specifically useful to process a subset of partition-ranges. \ No newline at end of file diff --git a/modules/ROOT/partials/cdm-prerequisites.adoc b/modules/ROOT/partials/cdm-prerequisites.adoc deleted file mode 100644 index e7ffcb01..00000000 --- a/modules/ROOT/partials/cdm-prerequisites.adoc +++ /dev/null @@ -1,16 +0,0 @@ -Read the prerequisites below before using the {cass-migrator}: - -* Install or switch to Java 11. -The Spark binaries are compiled with this version of Java. -* Select a single VM to run this job and install https://archive.apache.org/dist/spark/spark-3.5.3/[Spark 3.5.3] there. -No cluster is necessary for most one-time migrations. However, Spark cluster mode is also supported for complex migrations. -* Optionally, install https://maven.apache.org/download.cgi[Maven] `3.9.x` if you want to build the JAR for local development. 
- -Run the following commands to install Apache Spark: - -[source,bash] ----- -wget https://archive.apache.org/dist/spark/spark-3.5.3/spark-3.5.3-bin-hadoop3-scala2.13.tgz - -tar -xvzf spark-3.5.3-bin-hadoop3-scala2.13.tgz ----- diff --git a/modules/ROOT/partials/cdm-validation-steps.adoc b/modules/ROOT/partials/cdm-validation-steps.adoc deleted file mode 100644 index 49dffc4f..00000000 --- a/modules/ROOT/partials/cdm-validation-steps.adoc +++ /dev/null @@ -1,46 +0,0 @@ -To run your migration job with {cass-migrator} in **data validation mode**, use class option `--class com.datastax.cdm.job.DiffData`. -Example: - -[source,bash] ----- -./spark-submit --properties-file cdm.properties \ ---conf spark.cdm.schema.origin.keyspaceTable="." \ ---master "local[*]" --driver-memory 25G --executor-memory 25G \ ---class com.datastax.cdm.job.DiffData cassandra-data-migrator-x.y.z.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt ----- - -The {cass-migrator} validation job reports differences as `ERROR` entries in the log file. -Example: - -[source,bash] ----- -23/04/06 08:43:06 ERROR DiffJobSession: Mismatch row found for key: [key3] Mismatch: Target Index: 1 Origin: valueC Target: value999) -23/04/06 08:43:06 ERROR DiffJobSession: Corrected mismatch row in target: [key3] -23/04/06 08:43:06 ERROR DiffJobSession: Missing target row found for key: [key2] -23/04/06 08:43:06 ERROR DiffJobSession: Inserted missing row in target: [key2] ----- - -[TIP] -==== -To get the list of missing or mismatched records, grep for all `ERROR` entries in the log files. -Differences noted in the log file are listed by primary-key values. -==== - -You can also run the {cass-migrator} validation job in an **AutoCorrect** mode, which can: - -* Add any missing records from the origin to target cluster. -* Update any mismatched records between the origin and target clusters; this action makes the target cluster the same as the origin cluster. - -To enable or disable this feature, use one or both of the following settings in your `*.properties` configuration file. - -[source,properties] ----- -spark.cdm.autocorrect.missing false|true -spark.cdm.autocorrect.mismatch false|true ----- - -[IMPORTANT] -==== -The {cass-migrator} validation job never deletes records from the source or target clusters. -The job only adds or updates data on the target cluster. -==== \ No newline at end of file diff --git a/modules/ROOT/partials/common-connection-parameters.adoc b/modules/ROOT/partials/common-connection-parameters.adoc deleted file mode 100644 index e0f00a23..00000000 --- a/modules/ROOT/partials/common-connection-parameters.adoc +++ /dev/null @@ -1,50 +0,0 @@ -[cols="5,2,4"] -|=== -|Property | Default | Notes - -| `spark.cdm.connect.origin.host` -| `localhost` -| Hostname/IP address of the cluster. -May be a comma-separated list, and can follow the `:` convention. - -| `spark.cdm.connect.origin.port` -| `9042` -| Port number to use if not specified on `spark.cdm.connect.origin.host`. - -| `spark.cdm.connect.origin.scb` -| (Not set) -| Secure Connect Bundle, used to connect to an {astra-db} database. -Example: `file:///aaa/bbb/scb-enterprise.zip`. - -| `spark.cdm.connect.origin.username` -| `cassandra` -| Username (or `client_id` value) used to authenticate. - -| `spark.cdm.connect.origin.password` -| `cassandra` -| Password (or `client_secret` value) used to authenticate. - -| `spark.cdm.connect.target.host` -| `localhost` -| Hostname/IP address of the cluster. -May be a comma-separated list, and can follow the `:` convention. 
- -| `spark.cdm.connect.target.port` -| `9042` -| Port number to use if not specified on `spark.cdm.connect.origin.host`. - -| `spark.cdm.connect.target.scb` -| (Not set) -| Secure Connect Bundle, used to connect to an {astra-db} database. -Default is not set. -Example if set: `file:///aaa/bbb/my-scb.zip`. - -| `spark.cdm.connect.target.username` -| `cassandra` -| Username (or `client_id` value) used to authenticate. - -| `spark.cdm.connect.origin.password` -| `cassandra` -| Password (or `client_secret` value) used to authenticate. - -|=== \ No newline at end of file diff --git a/modules/ROOT/partials/interactive-lab.adoc b/modules/ROOT/partials/interactive-lab.adoc deleted file mode 100644 index 88e0b20d..00000000 --- a/modules/ROOT/partials/interactive-lab.adoc +++ /dev/null @@ -1,14 +0,0 @@ -Now that you've seen a conceptual overview of the process, let's put what you learned into practice. - -We've built a complementary learning resource that is a companion to this comprehensive {product-short} documentation. It's the https://www.datastax.com/dev/zdm[{product} Interactive Lab]. - -* All you need is a browser and a GitHub account. -* There's nothing to install for the lab, which opens in a pre-configured GitPod environment. -* You'll learn about a full migration without leaving your browser! - -[NOTE] -==== -To run the lab, all major browsers are supported, except Safari. For more, see the lab's https://www.datastax.com/dev/zdm[start page]. -==== - -We encourage you to explore this free hands-on interactive lab from {company} Academy. It's an excellent, detailed view of the migration process. The lab describes and demonstrates all the steps and automation performed to prepare for, and complete, a migration from any {cass-short}/{dse-short}/{astra-db} database to another {cass-short}/{dse-short}/{astra-db} database across clusters. diff --git a/modules/ROOT/partials/lightbox-tip-all-phases.adoc b/modules/ROOT/partials/lightbox-tip-all-phases.adoc deleted file mode 100644 index 9c839825..00000000 --- a/modules/ROOT/partials/lightbox-tip-all-phases.adoc +++ /dev/null @@ -1,7 +0,0 @@ - -[TIP] -==== -Here's an illustrated view of all the migration phases. -Click the components to open a larger view. -Click again to return to the original view. -==== \ No newline at end of file diff --git a/modules/ROOT/partials/lightbox-tip.adoc b/modules/ROOT/partials/lightbox-tip.adoc deleted file mode 100644 index bcb71d4e..00000000 --- a/modules/ROOT/partials/lightbox-tip.adoc +++ /dev/null @@ -1,7 +0,0 @@ - -[TIP] -==== -Here's an illustrated view of this phase. -Click the components to open a larger view. -Click again to return to the original view. -==== \ No newline at end of file diff --git a/modules/ROOT/partials/note-downtime.adoc b/modules/ROOT/partials/note-downtime.adoc deleted file mode 100644 index 354746c2..00000000 --- a/modules/ROOT/partials/note-downtime.adoc +++ /dev/null @@ -1,4 +0,0 @@ -[NOTE] -==== -This suite of tools allows for zero downtime migration only if your database meets the minimum xref:feasibility-checklists.adoc[requirements]. If your database does not meet these requirements, you can complete the migration from Origin to Target, but downtime might be necessary to finish the migration. 
-==== \ No newline at end of file diff --git a/modules/ROOT/partials/origin-schema-parameters.adoc b/modules/ROOT/partials/origin-schema-parameters.adoc deleted file mode 100644 index 095664c4..00000000 --- a/modules/ROOT/partials/origin-schema-parameters.adoc +++ /dev/null @@ -1,54 +0,0 @@ -[cols="3,1,5a"] -|=== -|Property | Default | Notes - -| `spark.cdm.schema.origin.keyspaceTable` -| -| Required - the `.` of the table to be migrated. -Table must exist in the origin cluster. - -| `spark.cdm.schema.origin.column.ttl.automatic` -| `true` -| Default is `true`, unless `spark.cdm.schema.origin.column.ttl.names` is specified. -When `true`, determine the Time To Live (TTL) of the target record. -Find the maximum TTL of all origin columns that can have TTL set. This excludes partition key, clustering key, collections/UDT/tuple, and frozen columns. -When `false`, and `spark.cdm.schema.origin.column.ttl.names` is not set, the target record has the target table configuration determine the TTL. - -| `spark.cdm.schema.origin.column.ttl.names` -| -| Default is empty, meaning the names are determined automatically if `spark.cdm.schema.origin.column.ttl.automatic` is set. -Specify a subset of eligible columns that are used to calculate the TTL of the target record. - -| `spark.cdm.schema.origin.column.writetime.automatic` -| `true` -| Default is `true`, unless `spark.cdm.schema.origin.column.writetime.names` is specified. -When `true`, determine the `WRITETIME` of the target record. -Find the maximum `WRITETIME` of all origin columns that can have `WRITETIME` set. This excludes partition key, clustering key, collections/UDT/tuple, and frozen columns. -When `false`, and `spark.cdm.schema.origin.column.writetime.names` is not set, the target table configuration determines the target record's `WRITETIME`. - -[NOTE] -==== -The `spark.cdm.transform.custom.writetime` property, if set, overrides `spark.cdm.schema.origin.column.writetime`. -==== - -| `spark.cdm.schema.origin.column.writetime.names` -| -| Default is empty, meaning the names are determined automatically if `spark.cdm.schema.origin.column.writetime.automatic` is set. -Otherwise, specify a subset of eligible columns that are used to calculate the WRITETIME of the target record. -Example: `data_col1,data_col2,...` - -| `spark.cdm.schema.origin.column.names.to.target` -| -| Default is empty. -If column names are changed between the origin and target clusters, then this mapped list provides a mechanism to associate the two. -The format is `:`. -The list is comma separated. -You only need to list renamed columns. - -|=== - -[NOTE] -==== -For optimization reasons, {cass-migrator} does not migrate TTL and writetime at the field level. -Instead, {cass-migrator} finds the field with the highest TTL and the field with the highest writetime within an origin table row, and uses those values on the entire target table row. -==== \ No newline at end of file diff --git a/modules/ROOT/partials/performance-and-operations-parameters.adoc b/modules/ROOT/partials/performance-and-operations-parameters.adoc deleted file mode 100644 index 0fd1b28f..00000000 --- a/modules/ROOT/partials/performance-and-operations-parameters.adoc +++ /dev/null @@ -1,59 +0,0 @@ -Performance and operations parameters that can affect migration throughput, error handling, and similar concerns. 
- -[cols="4,1,3"] -|=== -|Property | Default | Notes - -| `spark.cdm.perfops.numParts` -| `10000` -| In standard operation, the full token range of -2^63 to 2^63-1 is divided into a number of parts, which are parallel processed. -You should aim for each part to comprise a total of ≈1-10GB of data to migrate. -During initial testing, you may want this to be a small number, such as `1`. - -| `spark.cdm.perfops.batchSize` -| `5` -| When writing to the target cluster, this comprises the number of records that are put into an `UNLOGGED` batch. -{cass-migrator} tends to work on the same partition at a time. -If your partition sizes are larger, this number may be increased. -If the `spark.cdm.perfops.batchSize` would mean that more than 1 partition is often contained in a batch, reduce this parameter's value. -Ideally < 1% of batches have more than 1 partition. - -| `spark.cdm.perfops.ratelimit.origin` -| `20000` -| Concurrent number of operations across all parallel threads from the origin cluster. -This value may be adjusted up or down, depending on the amount of data and the processing capacity of the origin cluster. - -| `spark.cdm.perfops.ratelimit.target` -| `40000` -| Concurrent number of operations across all parallel threads from the target cluster. -This may be adjusted up or down, depending on the amount of data and the processing capacity of the target cluster. - -| `spark.cdm.perfops.consistency.read` -| `LOCAL_QUORUM` -| Commented out. -Read consistency from the origin cluster and from the target cluster when records are read for comparison purposes. -The consistency parameters may be one of: `ANY`, `ONE`, `TWO`, `THREE`, `QUORUM`, `LOCAL_ONE`, `EACH_QUORUM`, `LOCAL_QUORUM`, `SERIAL`, `LOCAL_SERIAL`, `ALL`. - -| `spark.cdm.perfops.consistency.write` -| `LOCAL_QUORUM` -| Commented out. -Write consistency to the target cluster. -The consistency parameters may be one of: `ANY`, `ONE`, `TWO`, `THREE`, `QUORUM`, `LOCAL_ONE`, `EACH_QUORUM`, `LOCAL_QUORUM`, `SERIAL`, `LOCAL_SERIAL`, `ALL`. - -| `spark.cdm.perfops.printStatsAfter` -| `100000` -| Commented out. -Number of rows of processing after which a progress log entry is made. - -| `spark.cdm.perfops.fetchSizeInRows` -| `1000` -| Commented out. -This parameter affects the frequency of reads from the origin cluster and the frequency of flushes to the target cluster. - -| `spark.cdm.perfops.errorLimit` -| `0` -| Commented out. -Controls how many errors a thread may encounter during `MigrateData` and `DiffData` operations before failing. -Recommendation: set this parameter to a non-zero value **only when not doing** a mutation-type operation, such as when you're running `DiffData` without `.autocorrect`. - -|=== \ No newline at end of file diff --git a/modules/ROOT/partials/supported-releases.adoc b/modules/ROOT/partials/supported-releases.adoc index 20dad199..28145c37 100644 --- a/modules/ROOT/partials/supported-releases.adoc +++ b/modules/ROOT/partials/supported-releases.adoc @@ -1,4 +1,3 @@ -Overall, you can use {product-proxy} to migrate: +You can use {product-proxy} to migrate from a cluster running https://cassandra.apache.org/_/index.html[{cass-reg}] version 2.1.6 and later or https://www.datastax.com/products/datastax-enterprise[{dse}] version 4.7.1 and later. -* **From:** Any {cass-short} 2.1.6 or higher release, or from any {dse-short} 4.7.1 or higher release. -* **To:** Any equivalent or higher release of {cass-short}, or to any equivalent or higher release of {dse-short}, or to {astra-db}. 
+You can migrate to https://www.datastax.com/products/datastax-astra[{astra-db}] or a cluster running the same or later version of {cass-short} or {dse-short} \ No newline at end of file diff --git a/modules/ROOT/partials/target-schema-parameters.adoc b/modules/ROOT/partials/target-schema-parameters.adoc deleted file mode 100644 index 62a1f610..00000000 --- a/modules/ROOT/partials/target-schema-parameters.adoc +++ /dev/null @@ -1,11 +0,0 @@ -[cols="3,1,2"] -|=== -|Property | Default | Notes - -| `spark.cdm.schema.target.keyspaceTable` -| Equals the value of `spark.cdm.schema.origin.keyspaceTable` -| This parameter is commented out. -It's the `.` of the table to be migrated into the target. -Table must exist in the target cluster. - -|=== \ No newline at end of file diff --git a/modules/ROOT/partials/tip-scb.adoc b/modules/ROOT/partials/tip-scb.adoc deleted file mode 100644 index 4492c0b0..00000000 --- a/modules/ROOT/partials/tip-scb.adoc +++ /dev/null @@ -1,12 +0,0 @@ -[TIP] --- -The SCB can be downloaded from the {astra-ui} as follows: - -. In the {astra-ui}, go to the Dashboard and select your database. -. Make sure it is in `Active` status. -. Click **Connect**. -. Select the **Java driver**, choosing the driver based on the CQL APIs. -. Click **Download bundle**, and select a region if required. - -For more information on the SCB and how to retrieve it, see https://docs.datastax.com/en/astra/astra-db-vector/drivers/secure-connect-bundle.html[the {astra-db} documentation]. --- diff --git a/modules/ROOT/partials/transformation-parameters.adoc b/modules/ROOT/partials/transformation-parameters.adoc deleted file mode 100644 index d8ff2f18..00000000 --- a/modules/ROOT/partials/transformation-parameters.adoc +++ /dev/null @@ -1,58 +0,0 @@ -Parameters to perform schema transformations between the origin and target clusters. - -By default, these parameters are commented out. - -[cols="2,1,4a"] -|=== -|Property | Default | Notes - -| `spark.cdm.transform.missing.key.ts.replace.value` -| `1685577600000` -| Timestamp value in milliseconds. -Partition and clustering columns cannot have null values. -If they are added as part of a schema transformation between the origin and target clusters, it is possible that the origin side is null. -In this case, the `Migrate` data operation fails. -This parameter allows a crude constant value to be used in its place that is separate from the constant values feature. - -| `spark.cdm.transform.custom.writetime` -| `0` -| Default is 0 (disabled). -Timestamp value in microseconds to use as the `WRITETIME` for the target record. -This is useful when the `WRITETIME` of the record in the origin cluster cannot be determined. Such an example is when the only non-key columns are collections. -This parameter allows a crude constant value to be used in its place and overrides `spark.cdm.schema.origin.column.writetime.names`. - -| `spark.cdm.transform.custom.writetime.incrementBy` -| `0` -| Default is `0`. -This is useful when you have a list that is not frozen and you are updating this using the autocorrect feature. -Lists are not idempotent, and subsequent UPSERTs add duplicates to the list. - -| `spark.cdm.transform.codecs` -| -| Default is empty. -A comma-separated list of additional codecs to enable. - - * `INT_STRING` : int stored in a string. - * `DOUBLE_STRING` : double stored in a string. - * `BIGINT_STRING` : bigint stored in a string. - * `DECIMAL_STRING` : decimal stored in a string. 
- * `TIMESTAMP_STRING_MILLIS` : timestamp stored in a string, as Epoch milliseconds. - * `TIMESTAMP_STRING_FORMAT` : timestamp stored in a string with a custom format. - -[NOTE] -==== -Where there are multiple type pair options, such as with `TIMESTAMP_STRING_*`, only one can be configured at a time with the `spark.cdm.transform.codecs` parameter. -==== - -| `spark.cdm.transform.codecs.timestamp.string.format` -| `yyyyMMddHHmmss` -| Configuration for `CQL_TIMESTAMP_TO_STRING_FORMAT` codec. -Default format is `yyyyMMddHHmmss`; `DateTimeFormatter.ofPattern(formatString)` - - -| `spark.cdm.transform.codecs.timestamp.string.zone` -| `UTC` -| Default is `UTC`. -Must be in `ZoneRulesProvider.getAvailableZoneIds()`. - -|=== \ No newline at end of file diff --git a/modules/ROOT/partials/use-cdm-migrator.adoc b/modules/ROOT/partials/use-cdm-migrator.adoc deleted file mode 100644 index ac5ae675..00000000 --- a/modules/ROOT/partials/use-cdm-migrator.adoc +++ /dev/null @@ -1,27 +0,0 @@ -. Configure for your environment the `cdm*.properties` file that's provided in the {cass-migrator} https://github.com/datastax/cassandra-data-migrator/tree/main/src/resources[GitHub repo]. -The file can have any name. -It does not need to be `cdm.properties` or `cdm-detailed.properties`. -In both versions, the `spark-submit` job processes only the parameters that aren't commented out. -Other parameter values use defaults or are ignored. -+ -See the descriptions and defaults in each file. -For more information about the sample properties configuration, see the https://github.com/datastax/cassandra-data-migrator/blob/main/src/resources/cdm-detailed.properties[cdm-detailed.properties]. -This is the full set of configurable settings. - -. Place the properties file that you elected to use and customize where it can be accessed while running the job using `spark-submit`. - -. Run the job using `spark-submit` command: - -[source,bash] ----- -./spark-submit --properties-file cdm.properties \ ---conf spark.cdm.schema.origin.keyspaceTable="." \ ---master "local[*]" --driver-memory 25G --executor-memory 25G \ ---class com.datastax.cdm.job.Migrate cassandra-data-migrator-x.y.z.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt ----- - -[TIP] -==== -* The command generates a log file `logfile_name_*.txt` to prevent log output on the console. -* Update the memory options, driver and executor memory, based on your use case. 
-==== \ No newline at end of file diff --git a/modules/sideloader/images/cql-console-create-identical-schema.png b/modules/sideloader/images/cql-console-create-identical-schema.png new file mode 100644 index 00000000..bb14216e Binary files /dev/null and b/modules/sideloader/images/cql-console-create-identical-schema.png differ diff --git a/modules/sideloader/images/data-importer-workflow.svg b/modules/sideloader/images/data-importer-workflow.svg new file mode 100644 index 00000000..98141dc0 --- /dev/null +++ b/modules/sideloader/images/data-importer-workflow.svg @@ -0,0 +1,72 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/modules/sideloader/images/data-importer-zdm.svg b/modules/sideloader/images/data-importer-zdm.svg new file mode 100644 index 00000000..22147f4a --- /dev/null +++ b/modules/sideloader/images/data-importer-zdm.svg @@ -0,0 +1,98 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/modules/sideloader/pages/cleanup-sideloader.adoc b/modules/sideloader/pages/cleanup-sideloader.adoc new file mode 100644 index 00000000..42dae738 --- /dev/null +++ b/modules/sideloader/pages/cleanup-sideloader.adoc @@ -0,0 +1,92 @@ += Clean up {sstable-sideloader} migrations +:description: {sstable-sideloader} has an automatic cleanup process. + +{description} +You can also manually start or reschedule a cleanup. + +The cleanup process deletes all SSTable snapshots from the migration directory, revokes any unexpired upload credentials, and then closes the migration. + +Each migration ID has its own cleanup schedule, and the cleanup process deletes only the files and credentials associated with the specific migration ID that is being cleaned up. +Cleaning up one migration doesn't affect other migrations associated with the same database. + +== Idle timeout and automatic cleanup + +A migration becomes idle if it is _not_ in `Initializing` or `ImportInProgress` status. +If a migration remains continuously idle for one week, it hits the idle timeout and triggers the automatic migration cleanup process. + +A migration's idle timer starts when you initialize the migration, and it automatically restarts when you xref:sideloader:migrate-sideloader.adoc#import-data[import data]. + +The idle time _doesn't_ restart when you upload snapshots or take any other action besides importing data. + +You can override the idle timer by manually starting or scheduling a cleanup. +However, you cannot permanently prevent the cleanup process. + +[WARNING] +==== +{company} recommends that you <> if your migration could be idle for several days. +This includes time spent completely idle, as well as time required to upload snapshots or import data to the target database. + +For mutli-terabyte and cross-region migrations, it can take several days to upload snapshots or import data. +Make sure you <> to avoid automatic cleanup. +==== + +== Manually start a cleanup + +. Use the {devops-api} to immediately start the cleanup process for a migration: ++ +[source,bash] +---- +curl -X POST \ + -H "Authorization: Bearer ${token}" \ + https://api.astra.datastax.com/v2/databases/${dbID}/migrations/${migrationID}/cleanup \ + | jq . +---- ++ +The cleanup process never runs on migrations in `ImportInProgress` status. 
+If the request fails due to `ImportInProgress`, you must either wait for the import process to end, xref:sideloader:stop-restart-sideloader.adoc#abort-migration[abort the migration], or <>.
+
+. Wait a few minutes, and then check the migration status:
++
+include::sideloader:partial$sideloader-partials.adoc[tags=check-status]
++
+While the cleanup is running, the migration status is `CleaningUpFiles`.
+When complete, the migration status is `Closed`.
+
+[#reschedule-a-cleanup]
+== Reschedule a cleanup
+
+[IMPORTANT]
+====
+If you reschedule a cleanup, the cleanup timer doesn't reset when you import data.
+
+Keep a record of your rescheduled cleanups so you can reschedule them again, if necessary.
+
+For example, you might need to reschedule a cleanup if your migration needs more time.
+Alternatively, if your migration is complete, you might reschedule the cleanup to minimize storage costs for the migration bucket.
+====
+
+You can use the {devops-api} to schedule a migration cleanup for a specific date and time:
+
+[source,bash,subs="+quotes"]
+----
+curl -X POST \
+  -H "Authorization: Bearer ${token}" \
+  "https://api.astra.datastax.com/v2/databases/${dbID}/migrations/${migrationID}/cleanup?option.cleanupTime=**CLEANUP_TIME**" \
+  | jq .
+----
+
+Replace the following:
+
+* Set your `dbID` and `migrationID` environment variables according to the migration that you want to reschedule.
++
+This endpoint overrides the idle timeout for the specified migration ID only.
+
+* Replace `*CLEANUP_TIME*` with the date and time that you want the cleanup process to run.
+You must use https://en.wikipedia.org/wiki/ISO_8601[ISO 8601] format (`YYYY-MM-DDTHH:MM:SSZ`), such as `option.cleanupTime=2025-03-31T14:30Z`.
++
+Setting a cleanup time in the past immediately starts the cleanup process.
+
+At your scheduled time, the cleanup process runs on the specified migration ID.
+The cleanup process never runs on migrations in `ImportInProgress` status.
+If the migration is in `ImportInProgress` at the scheduled cleanup time, the cleanup process will start when the migration's status changes.
\ No newline at end of file
diff --git a/modules/sideloader/pages/migrate-sideloader.adoc b/modules/sideloader/pages/migrate-sideloader.adoc
new file mode 100644
index 00000000..e3bffaa9
--- /dev/null
+++ b/modules/sideloader/pages/migrate-sideloader.adoc
@@ -0,0 +1,817 @@
+= Migrate data with {sstable-sideloader}
+:description: You can use {sstable-sideloader} to migrate data to {astra-db} from {cass-reg}, {dse}, or {hcd}.
+:loop-var: pass:[${i}]
+
+{description}
+
+== Prerequisites
+
+Before you use {sstable-sideloader} for a migration, xref:sideloader:sideloader-overview.adoc[learn about the {sstable-sideloader} process] and xref:sideloader:prepare-sideloader.adoc[prepare your environments for {sstable-sideloader}].
+
+[#create-snapshots]
+== Create snapshots
+
+On _each node_ in your origin cluster, use `nodetool` to create a backup of the data that you want to migrate, including all relevant keyspaces and CQL tables.
+
+. Be aware of the {sstable-sideloader} limitations related to materialized views, secondary indexes, and encrypted data that are described in xref:sideloader:prepare-sideloader.adoc#origin-cluster-requirements[Origin cluster requirements].
+If necessary, modify the data model on your origin cluster to prepare for the migration.
+
+. 
Optional: Before you create snapshots, consider running `xref:dse:managing:tools/nodetool/cleanup.adoc[nodetool cleanup]` to remove data that no longer belongs to your nodes. +This command is particularly useful after adding more nodes to a cluster because it helps ensure that each node only contains the data that it is responsible for, according to the current cluster configuration and partitioning scheme. ++ +If you run `nodetool cleanup` before you take a snapshot, you can ensure that the snapshot only includes relevant data, potentially reducing the size of the snapshot. +Smaller snapshots can lead to lower overall migration times and lower network transfer costs. ++ +However, take adequate precautions before you run this command because the cleanup operations can introduce additional load on your origin cluster. + +. Use `xref:dse:managing:tools/nodetool/snapshot.adoc[nodetool snapshot]` to create snapshots for the tables that you want to migrate. ++ +Don't create snapshots of system tables or tables that you don't want to migrate. +The migration can fail if you attempt to migrate snapshots that don't have a matching schema in the target database. +{sstable-sideloader} ignores system keyspaces. ++ +The structure of the `nodetool snapshot` command depends on the keyspaces and tables that you want to migrate. ++ +[tabs] +====== +All keyspaces:: ++ +-- +Create a snapshot of all tables in all keyspaces: + +[source,bash,subs="+quotes"] +---- +nodetool snapshot -t *SNAPSHOT_NAME* +---- + +Replace *`SNAPSHOT_NAME`* with a descriptive name for the snapshot. +Use the same snapshot name on each node. +This makes it easier to programmatically upload the snapshots to the migration directory. + +.Optional: Use a for loop to simplify snapshot creation +[%collapsible] +==== +If the nodes in your origin cluster are named in a predictable way (for example, `dse0`, `dse1`, `dse2`, etc.), you can use a `for` loop to simplify snapshot creation. +For example: + +[source,bash,subs="+quotes"] +---- +for i in 0 1 2; do ssh dse${i} nodetool snapshot -t *SNAPSHOT_NAME*; done +---- + +You can use the same `for` loop to verify that each snapshot was successfully created: + +[source,bash] +---- +for i in 0 1 2; do ssh dse${i} nodetool listsnapshots; done +---- +==== +-- + +Specific keyspaces:: ++ +-- +Create a snapshot of all tables in one or more keyspaces: + +.Single keyspace +[source,bash,subs="+quotes"] +---- +nodetool snapshot -t *SNAPSHOT_NAME* *KEYSPACE_NAME* +---- + +.Multiple keyspaces +[source,bash,subs="+quotes"] +---- +nodetool snapshot -t *SNAPSHOT_NAME* *KEYSPACE_NAME_1* *KEYSPACE_NAME_2* +---- + +Replace the following: + +* *`KEYSPACE_NAME`*: The name of the keyspace that contains the tables you want to migrate. ++ +To include multiple keyspaces, list each keyspace separated by a space as shown in the example above. +* *`SNAPSHOT_NAME`*: A descriptive name for the snapshot. ++ +Use the same snapshot name on each node. +This makes it easier to programmatically upload the snapshots to the migration directory. + +.Optional: Use a for loop to simplify snapshot creation +[%collapsible] +==== +If the nodes in your origin cluster are named in a predictable way (for example, `dse0`, `dse1`, `dse2`, etc.), you can use a `for` loop to simplify snapshot creation. 
+For example: + +[source,bash,subs="+quotes"] +---- +for i in 0 1 2; do ssh dse${i} nodetool snapshot -t *SNAPSHOT_NAME* *KEYSPACE_NAME*; done +---- + +To include multiple keyspaces in the snapshot, include multiple comma-separated `*KEYSPACE_NAME*` values, such as `keyspace1,keyspace2`. + +You can use the same `for` loop to verify that each snapshot was successfully created: + +[source,bash] +---- +for i in 0 1 2; do ssh dse${i} nodetool listsnapshots; done +---- +==== +-- + +Specific tables:: ++ +-- +Create a snapshot of specific tables within one or more keyspaces: + +.Single table +[source,bash,subs="+quotes"] +---- +nodetool snapshot -kt *KEYSPACE_NAME*.*TABLE_NAME* -t *SNAPSHOT_NAME* +---- + +.Multiple tables from one or more keyspaces +[source,bash,subs="+quotes"] +---- +nodetool snapshot -kt *KEYSPACE_NAME_1*.*TABLE_NAME_A* *KEYSPACE_NAME_1*.*TABLE_NAME_B* *KEYSPACE_NAME_2*.*TABLE_NAME_X* -t *SNAPSHOT_NAME* +---- + +Replace the following: + +* *`KEYSPACE_NAME`*: The name of the keyspace that contains the table you want to migrate. + +* *`TABLE_NAME`*: The name of the table you want to migrate. ++ +To include multiple tables from one or more keyspaces, list each *`KEYSPACE_NAME.TABLE_NAME`* pair separated by a space as shown in the example above. + +* *`SNAPSHOT_NAME`*: A descriptive name for the snapshot. ++ +Use the same snapshot name on each node. +This makes it easier to programmatically upload the snapshots to the migration directory. + +.Optional: Use a for loop to simplify snapshot creation +[%collapsible] +==== +If the nodes in your origin cluster are named in a predictable way (for example, `dse0`, `dse1`, `dse2`, etc.), you can use a `for` loop to simplify snapshot creation. +For example: + +[source,bash,subs="+quotes"] +---- +for i in 0 1 2; do ssh dse${i} nodetool snapshot -kt *KEYSPACE_NAME*.*TABLE_NAME* -t *SNAPSHOT_NAME*; done +---- + +To include multiple tables in the snapshot, include multiple comma-separated `*KEYSPACE_NAME*.*TABLE_NAME*` pairs, such as `keyspace1.table1,keyspace1.table2`. + +You can use the same `for` loop to verify that each snapshot was successfully created: + +[source,bash] +---- +for i in 0 1 2; do ssh dse${i} nodetool listsnapshots; done +---- +==== +-- +====== + +. Use `xref:6.9@dse:managing:tools/nodetool/list-snapshots.adoc[nodetool listsnapshots]` to verify that the snapshots were created: ++ +[source,bash] +---- +nodetool listsnapshots +---- ++ +Snapshots have a specific directory structure, such as `*KEYSPACE_NAME*/*TABLE_NAME*/snapshots/*SNAPSHOT_NAME*/...`. +{sstable-sideloader} relies on this fixed structure to properly interpret the SSTable components. +**With the exception of secondary index directories (as explained in the following step), don't modify the snapshot's directory structure.** + +. If your origin cluster has xref:dse-5.1@cql:develop:indexing/2i/2i-concepts.adoc[secondary indexes (2i)], remove all directories related to those indexes from all snapshots before you xref:sideloader:migrate-sideloader.adoc#upload-snapshots-to-migration-directory[upload the snapshots]. ++ +[WARNING] +==== +Secondary indexes defined in the origin cluster are ignored by {astra-db}, but they will cause the migration to fail. +To avoid errors, you must remove all secondary index directories from your snapshots before you upload them. 
+==== ++ +You can find secondary index directories in the table's snapshot directory: ++ +[source,plaintext,subs="+quotes"] +---- +**NODE_UUID**/**KEYSPACE_NAME**/**TABLE_NAME**-**TABLE_UUID**/snapshots/**SNAPSHOT_NAME**/.**INDEX_NAME** +---- ++ +For example, given the following table schema, the index directory is found at `*NODE_UUID*/smart_home/sensor_readings-*TABLE_UUID*/snapshots/*SNAPSHOT_NAME*/.roomidx`: ++ +[source,cql] +---- +CREATE TABLE IF NOT EXISTS smart_home.sensor_readings ( + device_id UUID, + room_id UUID, + reading_type TEXT, + PRIMARY KEY ((device_id)) +); +CREATE INDEX IF NOT EXISTS roomidx ON smart_home.sensor_readings(room_id); +---- + +[#record-schema] +== Configure the target database + +To prepare your target database for the migration, you must record the schema for each table in your origin cluster that you want to migrate, recreate these schemas in your target database, and then set environment variables required to connect to your database. + +[WARNING] +==== +For the migration to succeed, your target database must meet the schema requirements described in this section. +Additionally, your snapshots must contain compatible data and directories, as described in xref:sideloader:prepare-sideloader.adoc#origin-cluster-requirements[Origin cluster requirements] and xref:sideloader:migrate-sideloader.adoc#create-snapshots[Create snapshots]. +For example, {astra-db} doesn't support materialized views, and {sstable-sideloader} can't migrate encrypted data. + +However, indexes don't need to match. +You can define indexes in your target database independently from the origin cluster because {sstable-sideloader} ignores Storage Attached Indexes (SAI) defined on the origin cluster. +During the migration, {sstable-sideloader} automatically populates any SAI defined in your target database, even if those SAI weren't present in your origin cluster. +//TODO: Difference between "indexes" and "SAI" here? +//You can define {astra-db}-supported indexes independently on the target database and they will populate as part of the data migration process. +==== + +. Get the following schema properties for _each table_ that you want to migrate: ++ +* Exact keyspace name. +* Exact table name. +* Exact column names, data types, and the order in which they appear in the table creation DDL. +* Exact primary key definition as defined in your origin cluster, including the partition key, clustering columns, and ascending/descending ordering clauses. +You must define partition key columns and clustering columns in the exact order that they are defined on your origin cluster. ++ +To retrieve schema properties, you can run the `xref:astra@cql:reference:cqlsh-commands/describe-keyspace.adoc[DESCRIBE KEYSPACE]` command on your origin cluster: ++ +[source,cql,subs="+quotes"] +---- +DESCRIBE *KEYSPACE_NAME*; +---- ++ +Replace *`KEYSPACE_NAME`* with the name of the keyspace that contains the tables you want to migrate, +such as `DESCRIBE smart_home;`. ++ +Then, get the schema properties from the result: ++ +[source,cql] +---- +CREATE TABLE smart_home.sensor_readings ( + device_id UUID, + room_id UUID, + reading_type TEXT, + reading_value DOUBLE, + reading_timestamp TIMESTAMP, + PRIMARY KEY (device_id, room_id, reading_timestamp) +) WITH CLUSTERING ORDER BY (room_id ASC, reading_timestamp DESC); +---- +//However, {sstable-sideloader} can't import data to a xref:astra-db-serverless:databases:collection in a {db-serverless-vector} database. +. Recreate the schemas in your target database: ++ +.. 
In the {astra-ui-link} navigation menu, click *Databases*, and then click the name of your {astra-db} database. +.. xref:astra-db-serverless:databases:manage-keyspaces.adoc#keyspaces[Create a keyspace] with the exact same name as your origin cluster's keyspace. +.. In your database's xref:astra-db-serverless:cql:develop-with-cql.adoc#connect-to-the-cql-shell[CQL console], create tables with the exact same names and schemas as your origin cluster. ++ +image::sideloader:cql-console-create-identical-schema.png[] ++ +{astra-db} rejects or ignores some table properties, such as compaction strategy. +See xref:astra-db-serverless:databases:database-limits.adoc[] for more information. +//TODO: Does this matter? + +. In your terminal, set environment variables for your target database: ++ +[source,bash,subs="+quotes"] +---- +export dbID=*DATABASE_ID* +export token=*TOKEN* +---- ++ +Replace *`DATABASE_ID`* with the xref:astra-db-serverless:databases:create-database.adoc#get-db-id[database ID], and replace *`TOKEN`* with an xref:astra-db-serverless:administration:manage-application-tokens.adoc[application token] with the *Database Administrator* role. ++ +[TIP] +==== +Later, you will add another environment variable for the migration ID. + +The curl commands in this guide assume that you have set environment variables for token, database ID, and migration ID. +Running the commands without these environment variables causes error messages like `Moved Permanently` and `404 page not found`. + +Additionally, the curl command use https://jqlang.github.io/jq/[jq] to format the JSON responses. +If you don't have jq installed, remove `| jq .` from the end of each command. +==== + +[#initialize-migration] +== Initialize the migration + +Use the {devops-api} to initialize the migration and get your migration directory path and credentials. + +.What happens during initialization? +[%collapsible] +==== +include::sideloader:partial$sideloader-partials.adoc[tags=initialize] +==== + +The initialization process can take several minutes to complete, especially if the migration bucket doesn't already exist. + +. In your terminal, use the {devops-api} to initialize the data migration: ++ +[source,bash] +---- +curl -X POST \ + -H "Authorization: Bearer ${token}" \ + https://api.astra.datastax.com/v2/databases/${dbID}/migrations/initialize \ + | jq . +---- + +. Get the `migrationID` from the response: ++ +[source,json] +---- +{ + "migrationID": "272eac1d-df8e-4d1b-a7c6-71d5af232182", + "dbID": "b7e7761f-6f7f-4116-81a5-e8eefcf0cc1d", + "status": "Initializing", + "progressInfo": "", + "uploadBucketDir": "", + "uploadCredentials": { + "name": "", + "keys": null, + "credentialExpiration": null + }, + "expectedCleanupTime": "2025-03-04T15:14:38Z" +} +---- ++ +The `migrationID` is a unique identifier (UUID) for the migration. ++ +The response also includes the migration `status`. +You will refer to this status multiple times throughout the migration process. + +. Assign the migration ID to an environment variable: ++ +[source,bash,subs="+quotes"] +---- +export migrationID=*MIGRATION_ID* +---- ++ +Replace *`MIGRATION_ID`* with the `migrationID` returned by the `initialize` endpoint. + +. Check the migration status: ++ +include::sideloader:partial$sideloader-partials.adoc[tags=check-status] + +. Check the `status` field in the response: ++ +* `"status": "ReceivingFiles"`: Initialization is complete and your upload credentials are available. +Proceed to the next step. 
+* `"status": "Initializing"`: The migration is still initializing. +Wait a few minutes before you check the status again. + +. Get your migration directory path and upload credentials from the response. +You need these values to xref:sideloader:migrate-sideloader.adoc#upload-snapshots-to-migration-directory[upload snapshots to the migration directory]. ++ +[tabs] +====== +AWS:: ++ +-- +.MigrationStatus with AWS credentials +[source,json] +---- +{ + "migrationID": "272eac1d-df8e-4d1b-a7c6-71d5af232182", + "dbID": "b7e7761f-6f7f-4116-81a5-e8eefcf0cc1d", + "status": "ReceivingFiles", + "progressInfo": "", + "uploadBucketDir": "s3://ds-mig-b7e7761f-6f7f-4116-81a5-e8eefcf0cc1d/272eac1d-df8e-4d1b-a7c6-71d5af232182/sstables/", + "uploadCredentials": { + "name": "sessionToken", + "keys": { + "accessKeyID": "ASXXXXXXXXXXXXXXXXXX", + "secretAccessKey": "2XXXXXXXXXXXXXXXWqcdV519ZubYbyfuNxbZg1Rw", + "sessionToken": "XXXXXXXXXX" + }, + "credentialExpiration": "2024-01-18T19:45:09Z", + "hint": "\nexport AWS_ACCESS_KEY_ID=ASXXXXXXXXXXXXXXXXXX\nexport AWS_SECRET_ACCESS_KEY=2XXXXXXXXXXXXXXXWqcdV519ZubYbyfuNxbZg1Rw\nexport AWS_SESSION_TOKEN=XXXXXXXXXXXXXX\n" + }, + "expectedCleanupTime": "2024-01-25T15:14:38Z" +} +---- + +Securely store the `uploadBucketDir`, `accessKeyID`, `secretAccessKey`, and `sessionToken`: + +* `uploadBucketDir` is the migration directory URL. +Note the trailing slash. + +* `uploadCredentials` contains the AWS credentials that authorize uploads to the migration directory, namely `accessKeyID`, `secretAccessKey`, and `sessionToken`. + +[IMPORTANT] +==== +The `sessionToken` expires after one hour. +If your total migration takes longer than one hour, xref:sideloader:troubleshoot-sideloader.adoc#get-new-upload-credentials[generate new credentials], and then xref:sideloader:stop-restart-sideloader.adoc[resume the migration] with the fresh credentials. + +If you use automation to handle {sstable-sideloader} migrations, you might need to script a xref:sideloader:stop-restart-sideloader.adoc[pause] every hour so you can generate new credentials without unexpectedly interrupting the migration. +==== +-- + +Google Cloud:: ++ +-- +.MigrationStatus with Google Cloud credentials +[source,json] +---- +{ + "migrationID": "272eac1d-df8e-4d1b-a7c6-71d5af232182", + "dbID": "b7e7761f-6f7f-4116-81a5-e8eefcf0cc1d", + "status": "ReceivingFiles", + "progressInfo": "", + "uploadBucketDir": "gs://ds-mig-b7e7761f-6f7f-4116-81a5-e8eefcf0cc1d/272eac1d-df8e-4d1b-a7c6-71d5af232182/sstables/", + "uploadCredentials": { + "name": "TYPE_GOOGLE_CREDENTIALS_FILE", + "keys": { + "file": "CREDENTIALS_FILE" + }, + "credentialExpiration": "2024-08-07T18:51:39Z" + }, + "expectedCleanupTime": "2024-08-14T15:14:38Z" +} +---- + +.. Find the `uploadBucketDir` and the `uploadCredentials` in the response: ++ +* `uploadBucketDir` is the migration directory URL. +Note the trailing slash. +* `uploadCredentials` includes a base64-encoded file containing Google Cloud credentials that authorize uploads to the migration directory. + +.. Pipe the Google Cloud credentials `file` to a `creds.json` file: ++ +[source,bash] +---- +curl -X GET \ + -H "Authorization: Bearer ${token}" \ + https://api.astra.datastax.com/v2/databases/${dbID}/migrations/${migrationID} \ + | jq -r '.uploadCredentials.keys.file' \ + | base64 -d > creds.json +---- + +.. Securely store the `uploadBucketDir` and `creds.json`. 
+-- + +Microsoft Azure:: ++ +-- +.MigrationStatus with Azure credentials +[source,json] +---- +{ + "migrationID": "456ca4a9-0551-46c4-b8bb-90fcd136a0c3", + "dbID": "ccefd141-8fda-4e4d-a746-a102a96657bc", + "status": "ReceivingFiles", + "progressInfo": "", + "uploadBucketDir": "https://muztx5cqmp3jhe3j2guebksz.blob.core.windows.net/mig-upload-456ca4a9-0551-46c4-b8bb-90fcd136a0c3/sstables/", + "uploadCredentials": { + "name": "URL signature", + "keys": { + "url": "https://UPLOAD_BUCKET_DIR/?si=AZURE_SAS_TOKEN", + "urlSignature": "si=AZURE_SAS_TOKEN" + }, + "credentialExpiration": "2025-04-02T15:14:31Z" + }, + "expectedCleanupTime": "2025-03-04T15:14:38Z" +} +---- +Securely store the `uploadBucketDir` and `urlSignature`: + +* `uploadBucketDir` is the migration directory URL. +Note the trailing slash. + +* `uploadCredentials` contains `url` and `urlSignature` keys that represent an https://learn.microsoft.com/en-us/azure/ai-services/translator/document-translation/how-to-guides/create-sas-tokens[Azure Shared Access Signature (SAS) token]. +In the preceding example, these strings are truncated for readability. ++ +You need the `urlSignature` to upload snapshots to the migration directory. +-- +====== + +[#upload-snapshots-to-migration-directory] +== Upload snapshots to the migration directory + +//TODO: ENV VARS: A variable for MIGRATION_DIR would simplify these steps slightly. Env vars for all the values except the ones that change each time (Node name, snapshot name) would be most efficient. + +Use your cloud provider's CLI and your upload credentials to upload snapshots for _each origin node_ into the migration directory. + +[IMPORTANT] +==== +Be aware of the following requirements for the upload commands: + +* You must include the asterisk (`*`) character as shown in the commands, otherwise the commands won't work properly. + +* With the exception of the leading `://` in the migration directory path, your paths must _not_ include double slashes (`//`). + +* Use the CLI that corresponds with your target database's cloud provider. +For more information, see xref:sideloader:prepare-sideloader.adoc[]. + +* These commands assume that you installed the cloud provider's CLI on the nodes in your origin cluster. +For more information, see xref:sideloader:prepare-sideloader.adoc[]. + +* You might need to modify these commands depending on your environment, node names, directory structures, and other variables. +==== + +[tabs] +====== +AWS:: ++ +-- +//// +Originals: +[source,bash,subs="+quotes"] +---- +export AWS_ACCESS_KEY_ID=**ACCESS_KEY_ID**; export AWS_SECRET_ACCESS_KEY=**SECRET_ACCESS_KEY**; export AWS_SESSION_TOKEN=**SESSION_TOKEN**; \ +du -sh **CASSANDRA_DATA_DIR**/**KEYSPACE_NAME**/\*/snapshots/***SNAPSHOT_NAME***; \ +aws s3 sync --only-show-errors --exclude '\*' --include '*/snapshots/**SNAPSHOT_NAME***' **CASSANDRA_DATA_DIR**/ **MIGRATION_DIR**/**NODE_NAME** +---- + +[source,bash] +---- +export AWS_ACCESS_KEY_ID=ASXXXXXXXXXXXXXXXXXX; export AWS_SECRET_ACCESS_KEY=2XXXXXXXXXXXXXXXWqcdV519ZubYbyfuNxbZg1Rw; AWS_SESSION_TOKEN=XXXXXXXXXX; \ +du -sh /var/lib/cassandra/data/smart_home/*/snapshots/*sensor_readings*; \ +aws s3 sync --only-show-errors --exclude '*' --include '*/snapshots/sensor_readings*' /var/lib/cassandra/data/ s3://ds-mig-b7e7761f-6f7f-4116-81a5-e8eefcf0cc1d/272eac1d-df8e-4d1b-a7c6-71d5af232182/sstables/dse0 +---- +//// +. 
Set environment variables for the AWS credentials that were generated when you xref:sideloader:migrate-sideloader.adoc#initialize-migration[initialized the migration]: ++ +[source,bash,subs="+quotes"] +---- +export AWS_ACCESS_KEY_ID=**ACCESS_KEY_ID** +export AWS_SECRET_ACCESS_KEY=**SECRET_ACCESS_KEY** +export AWS_SESSION_TOKEN=**SESSION_TOKEN** +---- + +. Use the AWS CLI to upload one snapshot from one node into the migration directory: ++ +[source,bash,subs="+quotes,attributes"] +---- +du -sh **CASSANDRA_DATA_DIR**/**KEYSPACE_NAME**/{asterisk}/snapshots/{asterisk}**SNAPSHOT_NAME**{asterisk}; \ +aws s3 sync --only-show-errors --exclude '{asterisk}' --include '{asterisk}/snapshots/**SNAPSHOT_NAME**{asterisk}' **CASSANDRA_DATA_DIR**/ **MIGRATION_DIR****NODE_NAME** +---- ++ +Replace the following: ++ +include::sideloader:partial$sideloader-partials.adoc[tags=command-placeholders-common] + ++ +.Example: Upload a snapshot with AWS CLI +[%collapsible] +==== +[source,bash] +---- +# Set environment variables +export AWS_ACCESS_KEY_ID=XXXXXXXX +export AWS_SECRET_ACCESS_KEY=XXXXXXXXXX +export AWS_SESSION_TOKEN=XXXXXXXXXX + +# Upload "sensor_readings" snapshot from "dse0" node +du -sh /var/lib/cassandra/data/smart_home/*/snapshots/*sensor_readings*; \ +aws s3 sync --only-show-errors --exclude '*' --include '*/snapshots/sensor_readings*' /var/lib/cassandra/data/ s3://ds-mig-b7e7761f-6f7f-4116-81a5-e8eefcf0cc1d/272eac1d-df8e-4d1b-a7c6-71d5af232182/sstables/dse0 +---- +==== + +. Monitor upload progress: ++ +.. Use the AWS CLI to get a list of cloud storage keys for the files that have been successfully uploaded to the migration directory: ++ +[source,bash,subs="+quotes"] +---- +aws s3 ls --human-readable --summarize --recursive *MIGRATION_DIR* +---- ++ +Replace *`MIGRATION_DIR`* with the `uploadBucketDir` that was generated when you xref:sideloader:migrate-sideloader.adoc#initialize-migration[initialized the migration]. ++ +.. Compare the returned list against the files in your snapshot directory. +When the lists match, the upload is complete. ++ +You can _potentially_ increase upload speeds by adjusting the `max_concurrent_requests`, `multipart_threshold`, and `multipart_chunksize` parameters in your https://docs.aws.amazon.com/cli/latest/topic/s3-config.html[AWS CLI S3 configuration]. +However, upload time primarily depends on the snapshot size, network throughput from your origin cluster to the migration bucket, and whether the origin cluster and migration bucket are in the same region. + +. Repeat the upload process for each snapshot (*`SNAPSHOT_NAME`*) and node (*`NODE_NAME`*) in your origin cluster. ++ +If your credentials expire, see xref:sideloader:troubleshoot-sideloader.adoc#get-new-upload-credentials[Get new upload credentials]. + +.Optional: Use a for loop to simplify snapshot uploads +[%collapsible] +==== +If the nodes in your origin cluster have predictable names (for example, `dse0`, `dse1`, and `dse2`), then you can use a `for` loop to streamline the execution of the upload commands. 
+For example: + +[source,bash,subs="+quotes,attributes"] +---- +# Set environment variables +export AWS_ACCESS_KEY_ID=**ACCESS_KEY_ID** +export AWS_SECRET_ACCESS_KEY=**SECRET_ACCESS_KEY** +export AWS_SESSION_TOKEN=**SESSION_TOKEN** + +# Loop over the sync command for all nodes +for i in 0 1 2; do ssh dse{loop-var} \ +"du -sh **CASSANDRA_DATA_DIR**/**KEYSPACE_NAME**/{asterisk}/snapshots/{asterisk}**SNAPSHOT_NAME**{asterisk}; \ +aws s3 sync --only-show-errors --exclude '{asterisk}' --include '{asterisk}/snapshots/**SNAPSHOT_NAME**{asterisk}' **CASSANDRA_DATA_DIR**/ **MIGRATION_DIR**dse{loop-var}" & done +---- +==== +-- + +Google Cloud:: ++ +-- +. Authenticate to Google Cloud with the `creds.json` file that you created when you xref:sideloader:migrate-sideloader.adoc#initialize-migration[initialized the migration]: ++ +[source,bash,subs="+quotes,attributes"] +---- +gcloud auth activate-service-account --key-file=creds.json +---- ++ +If necessary, modify the `--key-file` path to match the location of your `creds.json` file, such as `--key-file=~/.gcloud_credentials/creds.json`. ++ +You can also use `gcloud auth login --cred-file creds.json`. + +. Use `gsutil` to upload one snapshot from one node into the migration directory: ++ +[source,bash,subs="+quotes,attributes"] +---- +gsutil -m rsync -r -d **CASSANDRA_DATA_DIR**/**KEYSPACE_NAME**/{asterisk}{asterisk}/snapshots/**SNAPSHOT_NAME**/ **MIGRATION_DIR****NODE_NAME**/ +---- ++ +Replace the following: ++ +include::sideloader:partial$sideloader-partials.adoc[tags=command-placeholders-common] + ++ +.Example: Upload a snapshot with gcloud and gsutil +[%collapsible] +==== +[source,bash,subs="attributes"] +---- +# Authenticate +gcloud auth activate-service-account --key-file=creds.json + +# Upload "sensor_readings" snapshot from "dse0" node +gsutil -m rsync -r -d /var/lib/cassandra/data/smart_home/{asterisk}{asterisk}/snapshots/sensor_readings/ gs://ds-mig-b7e7761f-6f7f-4116-81a5-e8eefcf0cc1d/272eac1d-df8e-4d1b-a7c6-71d5af232182/sstables/dse0 +---- +==== + +. Monitor upload progress: ++ +.. Use `gsutil` to get a list of objects that have been successfully uploaded to the migration directory: ++ +[source,bash,subs="+quotes"] +---- +gsutil ls -r *MIGRATION_DIR* +---- ++ +Replace *`MIGRATION_DIR`* with the `uploadBucketDir` that was generated when you xref:sideloader:migrate-sideloader.adoc#initialize-migration[initialized the migration]. ++ +.. Compare the returned list against the files in your snapshot directory. +When the lists match, the upload is complete. ++ +The `https://cloud.google.com/storage/docs/gsutil/commands/rsync#description[-m]` flag in `gsutil -m rsync` enables parallel synchronization, which can improve upload speed. +However, upload time primarily depends on the snapshot size, network throughput from your origin cluster to the migration bucket, and whether the origin cluster and migration bucket are in the same region. + +. Repeat the upload process for each snapshot (*`SNAPSHOT_NAME`*) and node (*`NODE_NAME`*) in your origin cluster. + +.Optional: Use a for loop to simplify snapshot uploads +[%collapsible] +==== +If the nodes in your origin cluster have predictable names (for example, `dse0`, `dse1`, and `dse2`), then you can use a `for` loop to streamline the execution of the `gsutil rsync` commands. 
+For example:
+
+[source,bash,subs="+quotes,attributes"]
+----
+for i in 0 1 2; do ssh dse{loop-var} \
+"du -sh **CASSANDRA_DATA_DIR**/**KEYSPACE_NAME**/{asterisk}/snapshots/{asterisk}**SNAPSHOT_NAME**{asterisk}; \
+gsutil -m rsync -r -d **CASSANDRA_DATA_DIR**/**KEYSPACE_NAME**/{asterisk}{asterisk}/snapshots/**SNAPSHOT_NAME**/ **MIGRATION_DIR**dse{loop-var}" & done
+----
+====
+--
+
+Microsoft Azure::
++
+--
+//----
+//for dir in $(find "$CASSANDRA_DATA_DIR" -type d -path "*/snapshots/${SNAPSHOT_NAME}*"); do
+//  REL_PATH=${dir#"$CASSANDRA_DATA_DIR"} # Remove the base path
+//  azcopy sync "$dir" "${MIGRATION_DIR}${NODE_NAME}/${REL_PATH}/"?${AZURE_SAS_TOKEN} --recursive
+//  done
+// '
+//----
+
+. Set environment variables for the following values:
++
+* *`AZURE_SAS_TOKEN`*: The `urlSignature` key that was generated when you xref:sideloader:migrate-sideloader.adoc#initialize-migration[initialized the migration].
+* *`CASSANDRA_DATA_DIR`*: The absolute file system path to where {cass-short} data is stored on the node, including the trailing slash.
+For example, `/var/lib/cassandra/data/`.
+* *`SNAPSHOT_NAME`*: The name of the xref:sideloader:migrate-sideloader.adoc#create-snapshots[snapshot backup] that you created with `nodetool snapshot`.
+* *`MIGRATION_DIR`*: The entire `uploadBucketDir` value that was generated when you xref:sideloader:migrate-sideloader.adoc#initialize-migration[initialized the migration], including the trailing slash.
+* *`NODE_NAME`*: The host name of the current node you are uploading the snapshot from.
+
++
+[source,bash,subs="+quotes"]
+----
+export AZURE_SAS_TOKEN="**URL_SIGNATURE**"
+export CASSANDRA_DATA_DIR="**CASSANDRA_DATA_DIR**"
+export SNAPSHOT_NAME="**SNAPSHOT_NAME**"
+export MIGRATION_DIR="**MIGRATION_DIR**"
+export NODE_NAME="**NODE_NAME**"
+----
+
+. Use the Azure CLI to upload one snapshot from one node into the migration directory:
++
+[source,bash]
+----
+for dir in $(find "$CASSANDRA_DATA_DIR" -type d -path "*/snapshots/${SNAPSHOT_NAME}*"); do
+  REL_PATH="${dir#"$CASSANDRA_DATA_DIR"}" # Remove the base path
+  DEST_PATH="${MIGRATION_DIR}${NODE_NAME}/${REL_PATH}/?${AZURE_SAS_TOKEN}"
+
+  azcopy sync "$dir" "$DEST_PATH" --recursive
+done
+----
+
+. Monitor upload progress:
++
+.. Use the Azure CLI to get the current contents of the migration directory:
++
+[source,bash]
+----
+azcopy list "${MIGRATION_DIR}?${AZURE_SAS_TOKEN}"
+----
++
+.. Compare the returned list against the files in your snapshot directory.
+When the lists match, the upload is complete.
++
+Upload time primarily depends on the snapshot size, network throughput from your origin cluster to the migration bucket, and whether the origin cluster and migration bucket are in the same region.
+
+. Repeat the upload process for each snapshot and node in your origin cluster.
+Be sure to change the `SNAPSHOT_NAME` and `NODE_NAME` environment variables as needed.
+--
+======
+
+Uploaded snapshots are staged in the migration directory, but the data is not yet written to the target database.
+After uploading snapshots, you must xref:sideloader:migrate-sideloader.adoc#import-data[import the data] to finish the migration.
+
+=== Idle migration directories are evicted
+
+As an added security measure, migrations that remain continuously idle for one week are subject to xref:sideloader:cleanup-sideloader.adoc[automatic cleanup], which deletes all associated snapshots, revokes any unexpired upload credentials, and then closes the migration.
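+
+To see when a migration is scheduled for automatic cleanup, you can check the `expectedCleanupTime` field in the migration status.
+The following is a minimal sketch, assuming the `token`, `dbID`, and `migrationID` environment variables that you set earlier on this page:
+
+[source,bash]
+----
+# Print the scheduled cleanup time for this migration
+curl -X GET \
+  -H "Authorization: Bearer ${token}" \
+  https://api.astra.datastax.com/v2/databases/${dbID}/migrations/${migrationID} \
+  | jq -r '.expectedCleanupTime'
+----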
+ +{company} recommends that you xref:sideloader:cleanup-sideloader.adoc#reschedule-a-cleanup[manually reschedule the cleanup] if you don't plan to launch the migration within one week or if you need several days to upload snapshots or import data. + +[WARNING] +==== +For large migrations, it can take several days to upload snapshots and import data. +Make sure you xref:sideloader:cleanup-sideloader.adoc#reschedule-a-cleanup[manually reschedule the cleanup] to avoid automatic cleanup. +==== + +[#import-data] +== Import data + +After you upload snapshots for each origin node, import the data into your target database. + +Data import is a multi-step operation that requires complete success. +If one step fails, then the entire import operation stops and the migration fails. +//Does all data fail to import or is it possible to have a partial import? + +.What happens during data import? +[%collapsible] +====== +include::sideloader:partial$sideloader-partials.adoc[tags=import] +====== + +[TIP] +==== +If necessary, you can xref:sideloader:stop-restart-sideloader.adoc[pause or abort the migration] during the import process. + +include::sideloader:partial$sideloader-partials.adoc[tags=no-return] +==== + +. Use the {devops-api} to launch the data import: ++ +[source,bash] +---- +curl -X POST \ + -H "Authorization: Bearer ${token}" \ + https://api.astra.datastax.com/v2/databases/${dbID}/migrations/${migrationID}/launch \ + | jq . +---- ++ +Although this call returns immediately, the import process takes time. + +. Check the migration status periodically: ++ +include::sideloader:partial$sideloader-partials.adoc[tags=check-status] + +. Check the `status` field in the response: ++ +* `"status": "ImportInProgress"`: The data is still being imported. +Wait a few minutes before you check the status again. +* `"status": "MigrationDone"`: The import is complete, and you can proceed to <>. + +. If the migration takes more than a few days, xref:sideloader:cleanup-sideloader.adoc#reschedule-a-cleanup[manually reschedule the cleanup] to avoid automatic cleanup. + +. If the migration fails, see xref:sideloader:troubleshoot-sideloader.adoc[]. + +[#validate-the-migrated-data] +== Validate the migrated data + +include::sideloader:partial$sideloader-partials.adoc[tags=validate] + +== See also + +* xref:sideloader:cleanup-sideloader.adoc[] +* xref:sideloader:troubleshoot-sideloader.adoc[] \ No newline at end of file diff --git a/modules/sideloader/pages/prepare-sideloader.adoc b/modules/sideloader/pages/prepare-sideloader.adoc new file mode 100644 index 00000000..75253dac --- /dev/null +++ b/modules/sideloader/pages/prepare-sideloader.adoc @@ -0,0 +1,302 @@ += Prepare to use {sstable-sideloader} +:description: Before you use {sstable-sideloader}, review the requirements and prepare your target database, origin cluster, and administration server. + +{description} + +Due to the nature of the {sstable-sideloader} process and the tools involved, you need to be familiar with using the command line, including the following: + +* Installing and using CLI tools +* Issuing curl commands +* Basic scripting +* Modifying example commands to fit your environment +* Security best practices + +[IMPORTANT] +==== +The {sstable-sideloader} process uses authentication credentials to write to the migration directory and your database. + +Make sure you understand how to securely store and use sensitive credentials when working on the command line. 
+==== + +== Target {astra-db} database requirements + +* Your {astra} organization must be on an *Enterprise* xref:astra-db-serverless:administration:subscription-plans.adoc[subscription plan]. ++ +{sstable-sideloader} is a premium feature that incurs costs based on usage: ++ +** Total amount (GB) of data processed as part of the {sstable-sideloader} workload. +** The amount of data stored in the migration bucket is metered at the standard {astra-db} storage rate. + ++ +-- +For more information and specific rates, see the https://www.datastax.com/pricing/astra-db[{astra} Pricing page]. + +[TIP] +==== +Migration directories are automatically cleaned up after one week of idle time. + +To minimize costs, you can xref:sideloader:cleanup-sideloader.adoc[manually clean up migration directories] when you no longer need them. +==== +-- + +* Your target database must be an {astra-db} Serverless database. ++ +If you don't already have one, xref:astra-db-serverless:databases:create-database.adoc[create a database]. +You can use either a {db-serverless} or {db-serverless-vector} database. ++ +{db-serverless-vector} databases can store both vector and non-vector data. + +* Your target database must be in a xref:astra-db-serverless:administration:provisioned-capacity-units.adoc[Provisioned Capacity Unit (PCU) group]. +You can use either a flexible capacity PCU group or a committed capacity PCU group, depending on your long-term needs and other PCU group usage. ++ +[tabs] +====== +Flexible capacity PCU group:: ++ +-- +Because {sstable-sideloader} operations are typically short-term, resource-intensive events, you can create a flexible capacity PCU group exclusively to support your target database during the migration. + +{company} recommends the following flexible capacity PCU group configuration for {sstable-sideloader} migrations. +For instructions, see xref:astra-db-serverless:administration:create-pcu.adoc#flexible-capacity[Create a flexible capacity PCU group]. + +[tabs] +==== +Target database is a {db-serverless} database:: ++ +* Minimum capacity: One or more, depending on the scale of the migration. +* Maximum capacity: Greater than the minimum by several units to allow autoscaling during resource intensive stages of the migration. ++ +For non-trivial migrations, consider setting the maximum to 10. +For extremely large migrations, contact your {company} account representative or {support-url}[{company} Support] to request more than 10 units to support your migration. + +Target database is a {db-serverless-vector} database:: ++ +By default, {db-serverless-vector} databases can have no more than one unit per PCU group. +For any non-trivial migration, contact your {company} account representative or {support-url}[{company} Support] for assistance configuring a PCU group for your target {db-serverless-vector} database. +==== + +After the migration, you can move your target database out of the flexible capacity PCU group, and then park or delete the group. +Don't park the PCU group during the {sstable-sideloader} process because databases in a parked PCU group are hibernated and unavailable for use. +-- + +Committed capacity PCU group:: ++ +-- +If you plan to keep your target database in a PCU group after the migration, you can create a committed capacity PCU group for your target database. + +[IMPORTANT] +==== +The {sstable-sideloader} process can be extremely resource intensive. 
+If there are any other databases in the same PCU group, the migration process can affect their performance due to resource contention. + +If your PCU groups have multiple databases, consider using a flexible capacity PCU group to temporarily isolate your target database during the migration. +==== + +{company} recommends the following committed capacity PCU group configuration for {sstable-sideloader} migrations. +For instructions, see xref:astra-db-serverless:administration:create-pcu.adoc#committed-capacity[Create a committed capacity PCU group]. + +[tabs] +==== +Target database is a {db-serverless} database:: ++ +* Reserved capacity: One or more, depending on the PCU group's normal, long-term workload requirements. ++ +This is the amount of long-term capacity that you want the group to have after the migration is complete. + +* Minimum capacity: Equal to or greater than the reserved capacity. ++ +If the minimum is greater than the reserved capacity, the surplus capacity is prepared in advance, and there is no autoscaling required to access that capacity. + +* Maximum capacity: Greater than the minimum by several units to allow autoscaling during resource intensive stages of the migration. ++ +For non-trivial migrations, consider setting the maximum to 10. +For extremely large migrations, contact your {company} account representative or {support-url}[{company} Support] to request more than 10 units to support your migration. ++ +After the migration, you can reduce the minimum and maximum capacity down to the levels required for normal database operations. + +Target database is a {db-serverless-vector} database:: ++ +By default, {db-serverless-vector} databases can have no more than one unit per PCU group. +For any non-trivial migration, contact your {company} account representative or {support-url}[{company} Support] for assistance configuring a PCU group for your target {db-serverless-vector} database. +==== +-- +====== ++ +For more information, see xref:astra-db-serverless:administration:provisioned-capacity-units.adoc[]. + +[#origin-cluster-requirements] +== Origin cluster requirements + +The following requirements, recommendations, and limitations apply to origin clusters. +Review all of these to ensure that your cluster is compatible with {sstable-sideloader}. + +=== Cluster infrastructure + +* Your origin cluster can be hosted on premises or on any cloud provider. + +* Your origin cluster must run a supported database version: ++ +** {cass-reg} 3.11 or later +** {dse-short} 5.1 or later +** {hcd-short} 1.1 or later +//Due to a potential occasional issue affecting SSTables generated by C* 3.0 / DSE 5.0. Until the fix is rolled out, we need to restrict the SSTable versions. + +* Your origin cluster must use the default https://cassandra.apache.org/doc/stable/cassandra/configuration/cass_yaml_file.html#partitioner[partitioner], `Murmur3Partitioner`. ++ +Older partitioners, such as `RandomPartitioner`, `ByteOrderedPartitioner`, and `OrderPreservingPartitioner`, are not supported. + +=== Cloud provider CLI + +To upload snapshots directly from the origin cluster, you must install your cloud provider's CLI on each node in the origin cluster. 
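+
+Once the CLI from the list below is installed, you can confirm that it is available on every node before you start the migration.
+The following is a minimal sketch, assuming the AWS CLI and origin nodes named `dse0`, `dse1`, and `dse2` as in the examples elsewhere in this guide; substitute `gcloud --version`, `gsutil version`, or `az --version` as appropriate:
+
+[source,bash]
+----
+# Confirm that the cloud provider CLI is installed on each origin node
+for i in 0 1 2; do ssh dse${i} aws --version; done
+----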
+ +The tool you install depends on the region where your target {astra-db} database is deployed: + +* AWS: https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html[Install AWS CLI] +* Google Cloud: https://cloud.google.com/sdk/docs/install-sdk[Install gcloud] and https://cloud.google.com/storage/docs/gsutil_install[install gsutil] +* Microsoft Azure: https://learn.microsoft.com/en-us/cli/azure/install-azure-cli[Install Azure CLI] + +Alternatively, you can upload copies of the snapshots from a separate staging server that has the CLI installed, and you must coordinate this through the administration server. +However, this process _isn't_ covered in this guide. +The CLI commands in this guide assume you have installed your cloud provider's CLI on the nodes in the origin cluster. +If you choose the alternative option, you must modify the commands accordingly for your environment. + +=== Incompatible data + +* *{astra-db} doesn't support materialized views*: You must replace these with SAI or an alternative data model design. + +* *{sstable-sideloader} doesn't support encrypted data*: If your origin cluster uses xref:6.9@dse:securing:transparent-data-encryption.adoc[{dse-short} Transparent Data Encryption], be aware that {sstable-sideloader} can't migrate these SSTables. ++ +If you have a mix of encrypted and unencrypted data, you can use {sstable-sideloader} to migrate the unencrypted data. +After the initial migration, you can use another strategy to move the encrypted data, such as https://github.com/datastax/cassandra-data-migrator[{cass-short} Data Migrator (CDM)] or a manual export and reupload. + +* *{sstable-sideloader} doesn't support secondary indexes*: If you don't remove or replace these in your origin cluster, then you must manually remove these directories from your snapshots, as explained in xref:sideloader:migrate-sideloader.adoc#create-snapshots[Create snapshots]. + +== Administration server requirements + +You need a server where you can run the {sstable-sideloader} commands. + +Your administration server must have SSH access to each node in your origin cluster. + +{company} recommends that you install the following additional software on your administration server: + +* https://github.com/datastax/cassandra-data-migrator[{cass-short} Data Migrator (CDM)] to validate imported data and, in the context of {product}, reconcile it with the origin cluster. +* https://jqlang.github.io/jq/[jq] to format JSON responses from the {astra} {devops-api}. +The {devops-api} commands in this guide use this tool. + +== Additional preparation for specific migration scenarios + +The following information can help you prepare for specific migration scenarios, including multi-region migrations and multiple migrations to the same database. + +=== Multi-region migrations + +Multi-region migrations can include one or more of the following scenarios: + +* Your origin cluster is deployed to multiple regions. +* Your target database is, or will be, deployed to multiple regions. +* You need to support multiple regions in a live migration scenario. + +It is difficult to provide a one-size-fits-all solution for multi-region migrations due to the potential complexity and variability of these scenarios. +For assistance planning a multi-region migration, contact your {company} account representative or {support-url}[{company} Support]. 
+ +=== Multi-node migrations + +You can migrate data from any number of nodes in your origin cluster to the same target database or multiple target databases. + +When you xref:sideloader:migrate-sideloader.adoc[migrate data with {sstable-sideloader}], there is no difference in the core process when migrating from one node or multiple nodes. +The following steps summarize the process and outline some considerations for migrating multiple nodes. + +[tabs] +====== +Migrate multiple nodes to one database:: ++ +-- +. On your origin cluster, make sure your data is valid and ready to migrate, as explained in <>. + +. From your origin cluster, create snapshots for all of the nodes that you want to migrate. ++ +Run `nodetool snapshot` as many times as necessary to capture all of your nodes. + +. On your target database, replicate the schemas for all tables that you want to migrate. ++ +This is critical for a successful migration. +If the schemas don't match, the migration fails. ++ +You don't need to make any changes based on the number of nodes, as long as the keyspaces and table schemas are replicated in the target database. + +. Initialize the migration to prompt {sstable-sideloader} to create a migration bucket for your target database. + +. Upload all of your node snapshots to the migration bucket. + +. Use {sstable-sideloader} to import the data to your target database. ++ +{sstable-sideloader} imports snapshots from the migration bucket to your target database based on the matching schemas. +The number of node snapshots that you uploaded to the migration bucket doesn't determine the success of the import. +The success of the import depends primarily on the validity of the schemas and the data in the snapshots. + +. After the import, validate the migrated data to ensure that it matches the data in the origin cluster. +For example, you can xref:ROOT:cassandra-data-migrator.adoc#cdm-validation-steps[run {cass-migrator} ({cass-migrator-short}) in validation mode]. +-- + +Migrate multiple nodes to multiple databases:: ++ +-- +Orchestrating concurrent migrations from multiple nodes to multiple target databases can be complex. + +Consider focusing on one target database at a time, or create a migration plan to track origin nodes, target databases, migration bucket credentials, and timelines for each migration. + +. On your origin cluster, make sure your data is valid and ready to migrate, as explained in <>. + +. From your origin cluster, create snapshots for all of the nodes that you want to migrate. ++ +Run `nodetool snapshot` as many times as necessary to capture all of your nodes. + +. On each of your target databases, replicate the schemas for the tables that you want to migrate to each database. ++ +This is critical for a successful migration. +If the schemas don't match, the migration fails. ++ +You don't need to make any changes based on the number of nodes, as long as the keyspaces and table schemas are replicated in the target databases. ++ +If you want to migrate the same data to multiple databases, you must recreate the schemas in each of those databases. +{sstable-sideloader} requires a schema to be present in the target database in order to migrate data. + +. For each target database, initialize a migration to prompt {sstable-sideloader} to create migration buckets for each database. ++ +At minimum, you must initialize one migration for each database. + +. Upload the node snapshots to their corresponding migration buckets. + +. 
Use {sstable-sideloader} to import the data to your target databases. ++ +You can import data to multiple databases at once, but each import event must be triggered separately using the unique migration ID. ++ +{sstable-sideloader} imports snapshots from the migration bucket to your target database based on the matching schemas. +The number of node snapshots that you uploaded to the migration bucket doesn't determine the success of the import. +The success of the import depends primarily on the validity of the schemas and the data in the snapshots.\ + +. After the import, validate the migrated data to ensure that it matches the data in the origin cluster. +For example, you can xref:ROOT:cassandra-data-migrator.adoc#cdm-validation-steps[run {cass-migrator} ({cass-migrator-short}) in validation mode]. +-- +====== + +=== Multiple migrations to the same database + +When you initialize a migration with {sstable-sideloader}, a unique migration ID is generated for that specific migration workflow. +For each migration ID, there is a unique migration directory and migration directory credentials. + +If you initialize multiple migrations for the same database, you generate multiple migration IDs, each with its own migration directory and credentials. + +This can be useful for breaking large migrations into smaller batches. +For example, if you have 100 snapshots, you could initialize 10 migrations, and then upload 10 different snapshots to each migration directory. + +You can upload snapshots to multiple migration directories at once. +However, when you reach the import phase of the migration, {sstable-sideloader} can import from only one migration directory at a time per database. +For example, if you have 10 migration IDs for the same database, you must run 10 separate import actions. +Each import must completely finish before starting the next import. + +After all of the imports are complete, validate the migrated data in your target database to ensure that it matches the data in the origin cluster. +For example, you can xref:ROOT:cassandra-data-migrator.adoc#cdm-validation-steps[run {cass-migrator} ({cass-migrator-short}) in validation mode]. + +== Next steps + +* xref:sideloader:migrate-sideloader.adoc[] \ No newline at end of file diff --git a/modules/sideloader/pages/sideloader-overview.adoc b/modules/sideloader/pages/sideloader-overview.adoc new file mode 100644 index 00000000..bc303098 --- /dev/null +++ b/modules/sideloader/pages/sideloader-overview.adoc @@ -0,0 +1,121 @@ += About {sstable-sideloader} +:page-aliases: data-importer:data-importer-overview.adoc, astra-db-serverless:sideloader:sideloader-overview.adoc +:description: {sstable-sideloader} lets you migrate data from an {cass-reg} or {dse} cluster into {astra-db} without impacting the origin cluster or your {astra-db} Serverless database. + +{sstable-sideloader} is a service running in {astra-db} that directly imports data from snapshot backups that you've uploaded to {astra-db} from an existing {cass-reg}, {dse}, or {hcd} cluster. + +Because it imports data directly, {sstable-sideloader} can offer several advantages over CQL-based tools like xref:dsbulk:overview:dsbulk-about.adoc[{company} Bulk Loader (DSBulk)] and xref:ROOT:cassandra-data-migrator.adoc[{cass-short} Data Migrator (CDM)], including faster, more cost-effective data loading, and minimal performance impacts on your origin cluster and target database. 
+ +== {sstable-sideloader} concepts + +Origin, origin cluster:: +In the context of {sstable-sideloader}, this refers to your existing {cass-short}, {dse-short}, or {hcd-short} cluster. + +Target, target database:: +In the context of {sstable-sideloader}, this refers to the {astra-db} Serverless database where you will migrate your data. + +Administration server:: +A server where you run the migration commands, including CLI commands and {astra} {devops-api} calls. +It must have SSH access to each node in your origin cluster. + +Migration:: +A workflow that you initiate within {sstable-sideloader} that encompasses the lifecycle of uploading and importing snapshot backups of a specific set of keyspaces or CQL tables. ++ +This process produces artifacts and parameters including migration buckets, migration IDs, migration directories, and upload credentials. +You use these components throughout the migration workflow. + +[#sideloader-process] +== The {sstable-sideloader} process + +Transferring data with {sstable-sideloader} is a multi-phase process. +Before you use {sstable-sideloader}, learn about the events, outcomes, warnings, and requirements of each phase: + +=== Prepare your infrastructure + +There are requirements for using {sstable-sideloader} that you must consider before you start a migration. +Additionally, you must take steps to prepare your target database, origin cluster, and administration server before you begin the migration. + +For more information, see xref:sideloader:prepare-sideloader.adoc[]. + +=== Create snapshot backups + +{sstable-sideloader} uses snapshot backup files to import SSTable data from your existing origin cluster. +This is an ideal approach for database migrations because creating a snapshot has negligible performance impact on the origin cluster, and it preserves metadata like write timestamps and expiration times (TTLs). + +Each snapshot for each node in the origin cluster must include all the keyspaces and individual CQL tables that you want to migrate. + +For more information, see xref:sideloader:migrate-sideloader.adoc#create-snapshots[Migrate data with {sstable-sideloader}: Create snapshots]. + +=== Prepare the target database + +Because snapshots don't store schema definitions, you must pre-configure the schema definition in your target {astra-db} database so that it matches the origin cluster's schema. + +For the migration to succeed, the schema in your target database must align with the schema in the origin cluster. +However, you might need to modify your schema or data model to be compatible with {astra-db}. + +For specific requirements and more information, see xref:sideloader:migrate-sideloader.adoc#record-schema[Migrate data with {sstable-sideloader}: Configure the target database]. + +=== Initialize a migration + +include::sideloader:partial$sideloader-partials.adoc[tags=initialize] + +For instructions and more information, see xref:sideloader:migrate-sideloader.adoc#initialize-migration[Migrate data with {sstable-sideloader}: Initialize the migration]. + +=== Upload snapshots + +When initialization is complete, use your cloud provider's CLI to xref:sideloader:migrate-sideloader.adoc#upload-snapshots-to-migration-directory[upload your snapshots to the migration directory]. + +To upload snapshots directly from the origin cluster, you must install your cloud provider's CLI on each node in the origin cluster. 
+While it is possible to orchestrate this process through a staging server, the commands given in this documentation assume you are uploading snapshots directly from the origin cluster.
+
+The time required to upload the snapshots depends on the size of your dataset and the network throughput between the origin cluster and the migration bucket:
+
+[cols="10,30,60"]
+|===
+|Speed |Migration type |Description
+
+|Fastest
+|Inter-datacenter
+|All else equal, snapshots take the least time to upload when the origin cluster is in the same cloud provider and region as the target database.
+
+|Fast
+|Cross-datacenter, co-located
+|Uploads are slower by default when they must exit the local datacenter.
+The delay increases relative to the physical distance between the datacenters.
+
+For example, all else equal, uploading from AWS `us-east-1` (Dulles, VA, USA) to AWS `ca-central-1` (Montréal, QC, Canada) is faster than uploading from `us-east-1` to `us-west-2` (The Dalles, OR, USA) because Oregon is significantly farther from Virginia than Montréal is.
+
+|Variable
+|Cross-provider, co-located
+|If the target database is in a different cloud provider than the origin cluster, the upload can be slower as the data passes from one provider's infrastructure to another.
+
+This is considered a cross-datacenter transfer, and the delay increases relative to the physical distance between the datacenters.
+
+|Slowest
+|Transoceanic
+|The slowest uploads happen when the data must travel over transoceanic cables.
+If the data must also change cloud providers, there can be additional delays.
+
+In this case, consider creating your target database in a co-located datacenter, and then xref:astra-db-serverless:databases:manage-regions.adoc[deploy your database to other regions] after the migration.
+|===
+
+=== Import data
+
+include::sideloader:partial$sideloader-partials.adoc[tags=import]
+
+For instructions and more information, see xref:sideloader:migrate-sideloader.adoc#import-data[Migrate data with {sstable-sideloader}: Import data].
+
+=== Validate imported data
+
+include::sideloader:partial$sideloader-partials.adoc[tags=validate]
+
+== Use {sstable-sideloader} with {product-short}
+
+If you need to migrate a live database, you can use {sstable-sideloader} instead of DSBulk or {cass-short} Data Migrator during xref:ROOT:migrate-and-validate-data.adoc[Phase 2 of {product} ({product-short})].
+
+.Use {sstable-sideloader} in the context of {product}.
+image::sideloader:data-importer-zdm.svg[]
+
+== Next steps
+
+* xref:sideloader:prepare-sideloader.adoc[]
\ No newline at end of file
diff --git a/modules/sideloader/pages/stop-restart-sideloader.adoc b/modules/sideloader/pages/stop-restart-sideloader.adoc
new file mode 100644
index 00000000..53598c22
--- /dev/null
+++ b/modules/sideloader/pages/stop-restart-sideloader.adoc
@@ -0,0 +1,66 @@
+= Stop or restart an {sstable-sideloader} migration
+:description: If necessary, you can pause, cancel, or restart an {sstable-sideloader} migration.
+
+{description}
+
+== Pause a migration
+
+Use the {devops-api} to pause a migration:
+
+[source,bash]
+----
+curl -X POST \
+  -H "Authorization: Bearer ${token}" \
+  https://api.astra.datastax.com/v2/databases/${dbID}/migrations/${migrationID}/pause \
+  | jq .
+----
+
+A paused migration retains its current state and progress.
+
+Any in-progress jobs will complete, but no new jobs will start.
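+
+To confirm that the pause took effect, you can poll the migration status and inspect the `status` and `progressInfo` fields.
+The following is a minimal sketch, assuming the same `token`, `dbID`, and `migrationID` environment variables used elsewhere in this guide:
+
+[source,bash]
+----
+# Check the migration status after requesting a pause
+curl -X GET \
+  -H "Authorization: Bearer ${token}" \
+  https://api.astra.datastax.com/v2/databases/${dbID}/migrations/${migrationID} \
+  | jq '{status, progressInfo}'
+----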
+
+=== Resume a migration
+
+Resume a previously paused migration from the point at which it was paused:
+
+[source,bash]
+----
+curl -X POST \
+  -H "Authorization: Bearer ${token}" \
+  https://api.astra.datastax.com/v2/databases/${dbID}/migrations/${migrationID}/resume \
+  | jq .
+----
+
+You can only resume an active migration that has been paused.
+Running this command against migrations in other statuses, such as idle migrations that were automatically cleaned up, has no effect.
+
+[#abort-migration]
+== Abort a migration
+
+Abort a migration only if you want to abandon it completely.
+
+. Abort a migration and remove all migration progress:
++
+[source,bash]
+----
+curl -X POST \
+  -H "Authorization: Bearer ${token}" \
+  https://api.astra.datastax.com/v2/databases/${dbID}/migrations/${migrationID}/abort \
+  | jq .
+----
++
+include::sideloader:partial$sideloader-partials.adoc[tags=no-return]
+For more information about what happens during each phase of a migration and the point of no return, see xref:sideloader:sideloader-overview.adoc[].
+
+. Wait a few minutes, and then check the migration status to confirm that the migration stopped:
++
+include::sideloader:partial$sideloader-partials.adoc[tags=check-status]
+
+== Retry a failed migration
+
+For information about reattempting a failed migration, see xref:sideloader:troubleshoot-sideloader.adoc[].
+
+== See also
+
+* xref:sideloader:cleanup-sideloader.adoc[]
+* xref:sideloader:troubleshoot-sideloader.adoc[]
\ No newline at end of file
diff --git a/modules/sideloader/pages/troubleshoot-sideloader.adoc b/modules/sideloader/pages/troubleshoot-sideloader.adoc
new file mode 100644
index 00000000..14e0790f
--- /dev/null
+++ b/modules/sideloader/pages/troubleshoot-sideloader.adoc
@@ -0,0 +1,101 @@
+= Troubleshoot {sstable-sideloader}
+:description: Get help with {sstable-sideloader}
+
+Learn how to troubleshoot common {sstable-sideloader} issues.
+
+== Migration status is outdated
+
+You can use the {devops-api} to check the migration status at any time.
+However, it can take a few minutes for the {devops-api} to reflect status changes during a migration.
+Immediately calling the {devops-api} after starting a new phase of the migration might not return the actual current status.
+
+[#get-new-upload-credentials]
+== Get new upload credentials
+
+//TODO: Does checking the migration status always generate new creds or only if they are expired?
+
+If your credentials expire, do the following:
+
+. Use the `MigrationStatus` endpoint to generate new credentials:
++
+include::sideloader:partial$sideloader-partials.adoc[tags=check-status]
+
+. Continue the migration with the fresh credentials.
++
+If you set environment variables for your credentials, be sure to update those values.
+
+== Retry a failed migration
+
+If a migration fails, there are two ways to reattempt the migration.
+The option you use depends on the type of error that occurred.
+
+If you are able to resolve the cause of the failure without modifying the migration directory contents, you can relaunch the migration using the data already present in the migration directory.
+Otherwise, you must abandon the failed migration and restart the entire migration process from the beginning.
+
+The two most common errors are as follows:
+
+* *Schema discrepancies*: There is a mismatch between the origin and target schemas.
+To resolve this error, you can <<relaunch,relaunch the migration>>.
+
+* *Invalid data in migration directory*: The data uploaded to the migration directory is invalid or improperly formatted.
+Common causes include data corruption, incomplete upload due to a timeout, malformed file paths, and the presence of invalid data such as secondary index directories.
++
+When this type of failure occurs, you must abandon the failed migration and restart the entire migration process.
+For more information, see <<restart>>.
+
+[#relaunch]
+=== Relaunch a failed migration
+
+. Check the migration status for an error message related to the failure:
++
+include::sideloader:partial$sideloader-partials.adoc[tags=check-status]
+
+. If possible, resolve the issue described in the error message.
++
+For example, if there is a problem with the schema in the target database, make sure that your schemas align, as described in xref:sideloader:migrate-sideloader.adoc#record-schema[Configure the target database].
+
+. Repeat the `launch` command that you used to xref:sideloader:migrate-sideloader.adoc#import-data[import the data], and continue the migration process from there.
+
+If the migration fails again, see <<restart>>.
+
+////
+Future:
+=== Reset failed migration endpoint
+https://datastax.slack.com/archives/C044Q060210/p1741772318884679?thread_ts=1741691860.400749&cid=C044Q060210
+TODO: Add to this page and stop-restart page.
+
+"resetting" a failed/aborted migration:
+- Call a new endpoint called reset, which removes all the metadata and restores their write and read access to the migration directory.
+- By doing this they accept that the progress will be wiped and that they will be charged for the failed attempt in accordance with our pricing rules.
+- Amend the data in the migration directory as needed.
+- Call relaunch to re-execute the migration from scratch.
+////
+
+[#restart]
+=== Restart a failed migration
+
+When a migration fails due to a problem with the data uploaded to the migration directory, you must completely restart the migration.
+
+This is because you can't change the data in the migration directory after you upload it.
+For example, if your snapshots contain corrupt data, you have to restart the migration with new snapshots and a new migration directory.
+
+. Review the xref:sideloader:prepare-sideloader.adoc#origin-cluster-requirements[origin cluster requirements] to ensure that your snapshot doesn't contain invalid data, including materialized views, encrypted data, and secondary indexes.
+
+. If necessary, xref:sideloader:migrate-sideloader.adoc#create-snapshots[create new snapshots] to replace any invalid snapshots.
++
+If your snapshots don't appear to contain invalid data, continue to the next step.
+
+. If necessary, xref:sideloader:migrate-sideloader.adoc#record-schema[reprepare the target database].
+There are two reasons you might need to do this:
++
+** The origin and target schemas don't match.
+** The migration reached a point where some data was loaded into the target database.
+This is unlikely, but if this happens, you must xref:astra-db-serverless:databases:manage-collections.adoc#delete-a-table-in-the-astra-portal[drop the table] from your target database, and then recreate the table.
++
+In this case, if the migration _didn't_ fail due to a problem with the snapshot data, you can potentially reuse the existing snapshots for the new migration.
+
+. Repeat the remainder of the migration process from xref:sideloader:migrate-sideloader.adoc#initialize-migration[Initialize the migration].
++
+This starts a fresh migration with a new migration directory, migration ID, and upload credentials.
+
+. 
If the migration fails again and you are unable to determine the cause of the failure, contact {support-url}[{company} Support]. \ No newline at end of file diff --git a/modules/sideloader/partials/sideloader-partials.adoc b/modules/sideloader/partials/sideloader-partials.adoc new file mode 100644 index 00000000..00faa3f5 --- /dev/null +++ b/modules/sideloader/partials/sideloader-partials.adoc @@ -0,0 +1,98 @@ +// tag::check-status[] +[source,bash] +---- +curl -X GET \ + -H "Authorization: Bearer ${token}" \ + https://api.astra.datastax.com/v2/databases/${dbID}/migrations/${migrationID} \ + | jq . +---- ++ +A successful response contains a `MigrationStatus` object. +It can take a few minutes for the {devops-api} to reflect status changes during a migration. +Immediately calling this endpoint after starting a new phase of the migration might not return the actual current status. +// end::check-status[] + +// tag::command-placeholders-common[] +* *`CASSANDRA_DATA_DIR`*: The absolute file system path to where {cass-short} data is stored on the node. +For example, `/var/lib/cassandra/data`. +* *`KEYSPACE_NAME`*: The name of the keyspace that contains the tables you want to migrate. +* *`SNAPSHOT_NAME`*: The name of the xref:sideloader:migrate-sideloader.adoc#create-snapshots[snapshot backup] that you created with `nodetool snapshot`. +* *`MIGRATION_DIR`*: The entire `uploadBucketDir` value that was generated when you xref:sideloader:migrate-sideloader.adoc#initialize-migration[initialized the migration], including the trailing slash. +* *`NODE_NAME`*: The host name of the current node you are uploading the snapshot from. +// end::command-placeholders-common[] + +// tag::validate[] +After the migration is complete, you can query the migrated data using the xref:astra-db-serverless:cql:develop-with-cql.adoc#connect-to-the-cql-shell[CQL shell] or xref:astra-db-serverless:api-reference:row-methods/find-many.adoc[{data-api}]. + +You can xref:ROOT:cassandra-data-migrator.adoc#cdm-validation-steps[run {cass-migrator} ({cass-migrator-short}) in validation mode] for more thorough validation. +{cass-migrator-short} also offers an AutoCorrect mode to reconcile any differences that it detects. +// end::validate[] + +// tag::initialize[] +After you create snapshots on the origin cluster and pre-configure the schema on the target database, use the {astra} {devops-api} to initialize the migration. + +image::sideloader:data-importer-workflow.svg[] + +When you initialize a migration, {sstable-sideloader} does the following: + +. Creates a secure migration bucket. ++ +The migration bucket is only created during the first initialization. +All subsequent migrations use different directories in the same migration bucket. ++ +{company} owns the migration bucket, and it is located within the {astra} perimeter. + +. Generates a migration ID that is unique to the new migration. + +. Creates a migration directory within the migration bucket that is unique to the new migration. ++ +The migration directory is also referred to as the `uploadBucketDir`. +In the next phase of the migration process, you will upload your snapshots to this migration directory. + +. Generates upload credentials that grant read/write access to the migration directory. ++ +The credentials are formatted according to the cloud provider where your target database is deployed. +// end::initialize[] + +// tag::import[] +After uploading the snapshots to the migration directory, use the {devops-api} to start the data import process. 
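+
+As a non-authoritative sketch, the request mirrors the other migration endpoints shown in this guide; the `launch` action name comes from the relaunch step in xref:sideloader:troubleshoot-sideloader.adoc[], but the exact path shown here is an assumption, so follow xref:sideloader:migrate-sideloader.adoc#import-data[Migrate data with {sstable-sideloader}: Import data] for the documented request.
+
+[source,bash]
+----
+# Sketch only: the endpoint path is an assumption that mirrors the pause,
+# resume, and abort endpoints; see the migration guide for the documented request.
+curl -X POST \
+  -H "Authorization: Bearer ${token}" \
+  https://api.astra.datastax.com/v2/databases/${dbID}/migrations/${migrationID}/launch \
+  | jq .
+----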
+
+During the import process, {sstable-sideloader} does the following:
+
+. Revokes access to the migration directory.
++
+You cannot read from or write to the migration directory after starting the data import process.
+
+. Discovers all uploaded SSTables in the migration directory, and then groups them into subsets of approximately equal size.
+
+. Runs validation checks on each subset.
+
+. Converts all SSTables in each subset.
+
+. Disables new compactions on the target database.
++
+[WARNING]
+====
+This is the last point at which you can xref:sideloader:stop-restart-sideloader.adoc#abort-migration[abort the migration].
+
+Once {sstable-sideloader} begins to import SSTable metadata (the next step), you cannot stop the migration.
+====
+
+. Imports metadata from each SSTable.
++
+If the dataset contains tombstones, any read operations on the target database can return inconsistent results during this step.
+Because compaction is disabled, there is no risk of permanent inconsistencies.
+However, in the context of xref:ROOT:introduction.adoc[{product}], it's important that the {product-short} proxy continues to read from the origin cluster.
+
+. Re-enables compactions on the target database.
+
+Each step must finish successfully.
+If one step fails, the import operation stops and no data is imported into your target database.
+
+If all steps finish successfully, the migration is complete and you can access the imported data in your target database.
+// end::import[]
+
+// tag::no-return[]
+You can abort a migration up until the point at which {sstable-sideloader} starts importing SSTable metadata.
+After this point, you must wait for the migration to finish, and then you can use the CQL shell to drop the keyspace or table in your target database before repeating the entire migration procedure.
+// end::no-return[]
\ No newline at end of file