
Commit a0c4b8a

Merge branch 'stage' of https://github.com/datastax/streaming-learning-docs into stage

2 parents f22047c + 15537cf

File tree: 12 files changed, +290 -0 lines

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
Name,Required,Default,Description
accessKeyId,true,null,The Cloud Storage access key ID. It requires permission to write objects.
bucket,true,null,The Cloud Storage bucket.
endpoint,true,null,The Cloud Storage endpoint.
provider,true,null,"The Cloud Storage type, such as aws-s3 or s3v2 (s3v2 uses the AWS client rather than the JClouds client)."
secretAccessKey,true,null,The Cloud Storage secret access key.
avroCodec,false,snappy,"Compression codec used when formatType=avro. Available compression types are: null (no compression), deflate, bzip2, xz, zstandard, snappy."
batchSize,false,10,The number of records submitted in a batch.
batchTimeMs,false,1000,The interval for batch submission.
bytesFormatTypeSeparator,false,0x10,"The separator inserted between records when formatType=bytes. By default, it is set to '0x10'. An input record that contains the separator looks like multiple records in the output object."
formatType,false,json,"The data format type. Available options are JSON, Avro, Bytes, or Parquet. By default, it is set to JSON."
jsonAllowNaN,false,false,"Recognize 'NaN', 'INF', '-INF' as legal floating-point values when formatType=json. Because the JSON specification does not allow such values, this is a non-standard feature and is disabled by default."
maxBatchBytes,false,10000000,The maximum number of bytes in a batch.
parquetCodec,false,gzip,"Compression codec used when formatType=parquet. Available compression types are: null (no compression), snappy, gzip, lzo, brotli, lz4, zstd."
partitionerType,false,partition,"The partitioning type. It can be configured by topic partitions or by time. By default, the partition type is configured by topic partitions."
partitionerUseIndexAsOffset,false,false,"Whether to use Pulsar's message index as the offset instead of the record sequence. Recommended if the incoming messages may be batched. The brokers may or may not expose the index metadata; if it is not present on the record, the sequence is used. See PIP-70 for more details."
pathPrefix,false,false,"If set, the output files are stored in a folder under the given bucket path. The pathPrefix must be in the format of xx/xxx/."
pendingQueueSize,false,10,"The number of records buffered in the queue. By default, it is equal to batchSize. You can set it manually."
role,false,null,The Cloud Storage role.
roleSessionName,false,null,The Cloud Storage role session name.
skipFailedMessages,false,false,"Configure whether to skip a message that fails to be processed. If set to true, the connector skips the failed message by acknowledging it. Otherwise, the connector fails the message."
sliceTopicPartitionPath,false,false,"When set to true, split the partitioned topic name into separate folders in the bucket path."
timePartitionDuration,false,86400000,"The time interval for time-based partitioning. Supports formatted interval strings, such as 30d, 24h, 30m, 10s, and numbers in millisecond precision; for example, 86400000 refers to 24h or 1d."
timePartitionPattern,false,yyyy-MM-dd,"The format pattern of the time-based partitioning. For details, refer to the Java date and time format."
useHumanReadableMessageId,false,false,"Use a human-readable format string for the messageId in message metadata. The messageId is in a format like ledgerId:entryId:partitionIndex:batchIndex. Otherwise, the messageId is a hex-encoded string."
useHumanReadableSchemaVersion,false,false,"Use a human-readable format string for the schema version in the message metadata. If set to true, the schema version is in plain string format. Otherwise, the schema version is in hex-encoded string format."
withMetadata,false,false,Save message attributes to metadata.
withTopicPartitionNumber,false,true,"When set to true, include the topic partition number in the object path."
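
As an illustration, a complete configs object for an aws-s3 bucket might look like the following sketch. Every bucket, endpoint, and credential value is a placeholder, and only a subset of the optional properties above is shown.

# Hypothetical example: write the aws-s3 sink configs to a local JSON file.
# All values are placeholders; substitute your own bucket, endpoint, and credentials.
cat > s3-sink-configs.json << 'EOF'
{
  "provider": "aws-s3",
  "bucket": "example-sink-bucket",
  "endpoint": "https://s3.us-east-1.amazonaws.com",
  "accessKeyId": "EXAMPLE_ACCESS_KEY_ID",
  "secretAccessKey": "EXAMPLE_SECRET_ACCESS_KEY",
  "formatType": "json",
  "partitionerType": "partition",
  "batchSize": 10,
  "batchTimeMs": 1000
}
EOF

The resulting JSON can be supplied as the configs value in the curl call shown later, or passed to the --sink-config option of pulsar-admin.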
Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
Name,Required,Default,Description
azureStorageAccountConnectionString,true,,The Azure Blob Storage connection string. Required when authenticating via connection string.
azureStorageAccountKey,true,,The Azure Blob Storage account key. Required when authenticating via account name and account key.
azureStorageAccountName,true,,The Azure Blob Storage account name. Required when authenticating via account name and account key.
azureStorageAccountSASToken,true,,The Azure Blob Storage account SAS token. Required when authenticating via SAS token.
bucket,true,null,The Cloud Storage bucket.
endpoint,true,null,The Azure Blob Storage endpoint.
provider,true,null,The Cloud Storage type. Azure Blob Storage only supports the azure-blob-storage provider.
avroCodec,false,snappy,"Compression codec used when formatType=avro. Available compression types are: null (no compression), deflate, bzip2, xz, zstandard, snappy."
batchSize,false,10,The number of records submitted in a batch.
batchTimeMs,false,1000,The interval for batch submission.
bytesFormatTypeSeparator,false,0x10,"The separator inserted between records when formatType=bytes. By default, it is set to '0x10'. An input record that contains the separator looks like multiple records in the output object."
formatType,false,json,"The data format type. Available options are JSON, Avro, Bytes, or Parquet. By default, it is set to JSON."
jsonAllowNaN,false,false,"Recognize 'NaN', 'INF', '-INF' as legal floating-point values when formatType=json. Because the JSON specification does not allow such values, this is a non-standard feature and is disabled by default."
maxBatchBytes,false,10000000,The maximum number of bytes in a batch.
parquetCodec,false,gzip,"Compression codec used when formatType=parquet. Available compression types are: null (no compression), snappy, gzip, lzo, brotli, lz4, zstd."
partitionerType,false,partition,"The partitioning type. It can be configured by topic partitions or by time. By default, the partition type is configured by topic partitions."
partitionerUseIndexAsOffset,false,false,"Whether to use Pulsar's message index as the offset instead of the record sequence. Recommended if the incoming messages may be batched. The brokers may or may not expose the index metadata; if it is not present on the record, the sequence is used. See PIP-70 for more details."
pathPrefix,false,false,"If set, the output files are stored in a folder under the given bucket path. The pathPrefix must be in the format of xx/xxx/."
pendingQueueSize,false,10,"The number of records buffered in the queue. By default, it is equal to batchSize. You can set it manually."
skipFailedMessages,false,false,"Configure whether to skip a message that fails to be processed. If set to true, the connector skips the failed message by acknowledging it. Otherwise, the connector fails the message."
sliceTopicPartitionPath,false,false,"When set to true, split the partitioned topic name into separate folders in the bucket path."
timePartitionDuration,false,86400000,"The time interval for time-based partitioning. Supports formatted interval strings, such as 30d, 24h, 30m, 10s, and numbers in millisecond precision; for example, 86400000 refers to 24h or 1d."
timePartitionPattern,false,yyyy-MM-dd,"The format pattern of the time-based partitioning. For details, refer to the Java date and time format."
useHumanReadableMessageId,false,false,"Use a human-readable format string for the messageId in message metadata. The messageId is in a format like ledgerId:entryId:partitionIndex:batchIndex. Otherwise, the messageId is a hex-encoded string."
useHumanReadableSchemaVersion,false,false,"Use a human-readable format string for the schema version in the message metadata. If set to true, the schema version is in plain string format. Otherwise, the schema version is in hex-encoded string format."
withMetadata,false,false,Save message attributes to metadata.
withTopicPartitionNumber,false,true,"When set to true, include the topic partition number in the object path."
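
Along the same lines, a minimal configs sketch for Azure Blob Storage using account name and key authentication might look like this. The account, key, container, and endpoint values are placeholders; the other authentication options (connection string or SAS token) would replace the two account properties.

# Hypothetical example: Azure Blob Storage sink configs (placeholder values only).
cat > azure-blob-sink-configs.json << 'EOF'
{
  "provider": "azure-blob-storage",
  "bucket": "example-container",
  "endpoint": "https://exampleaccount.blob.core.windows.net",
  "azureStorageAccountName": "exampleaccount",
  "azureStorageAccountKey": "EXAMPLE_ACCOUNT_KEY",
  "formatType": "json",
  "batchSize": 10
}
EOF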
Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
curl -sS --fail --request POST ''$WEB_SERVICE_URL'/admin/v3/sinks/'$TENANT'/'$NAMESPACE'/'$SINK_NAME'?opt=poweruser' \
  --header "Authorization: Bearer $PULSAR_TOKEN" \
  --form 'sinkConfig="{
    \"archive\":\"builtin:\/\/cloud-storage\",
    \"tenant\":\"'$TENANT'\",
    \"namespace\":\"'$NAMESPACE'\",
    \"name\":\"'$SINK_NAME'\",
    \"parallelism\": 1,
    \"inputs\":[\"'$TENANT'\/'$NAMESPACE'\/'$INPUT_TOPIC'\"],
    \"configs\":{ <see below reference for storage specifics> }
  }"'
Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
curl -sS --fail --request PUT ''$WEB_SERVICE_URL'/admin/v3/sinks/'$TENANT'/'$NAMESPACE'/'$SINK_NAME'?opt=poweruser' \
  --header "Authorization: Bearer $ASTRA_STREAMING_TOKEN" \
  --form 'sinkConfig="{
    \"archive\":\"builtin:\/\/cloud-storage\",
    \"tenant\":\"'$TENANT'\",
    \"namespace\":\"'$NAMESPACE'\",
    \"name\":\"'$SINK_NAME'\",
    \"parallelism\": 2,
    \"inputs\":[\"'$TENANT'\/'$NAMESPACE'\/'$INPUT_TOPIC'\"]
  }"'
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
Pulsar Schema,Writer: Avro,Writer: JSON,Writer: Parquet,Writer: Bytes
Primitive,❌,✅ *,❌,✅
Avro,✅,✅,✅,✅
Json,✅,✅,✅,✅
Protobuf **,✅,✅,✅,✅
ProtobufNative,✅ ***,❌,✅,✅
Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
Name,Required,Default,Description
bucket,true,null,The Cloud Storage bucket.
provider,true,null,The Cloud Storage type. Google Cloud Storage only supports the google-cloud-storage provider.
avroCodec,false,snappy,"Compression codec used when formatType=avro. Available compression types are: null (no compression), deflate, bzip2, xz, zstandard, snappy."
batchSize,false,10,The number of records submitted in a batch.
batchTimeMs,false,1000,The interval for batch submission.
bytesFormatTypeSeparator,false,0x10,"The separator inserted between records when formatType=bytes. By default, it is set to '0x10'. An input record that contains the separator looks like multiple records in the output object."
formatType,false,json,"The data format type. Available options are JSON, Avro, Bytes, or Parquet. By default, it is set to JSON."
gcsServiceAccountKeyFileContent,false,,"The contents of the JSON service key file. If empty, credentials are read from the file specified by gcsServiceAccountKeyFilePath."
gcsServiceAccountKeyFilePath,false,,"Path to the GCS credentials file. If empty, the credentials file is read from the GOOGLE_APPLICATION_CREDENTIALS environment variable."
jsonAllowNaN,false,false,"Recognize 'NaN', 'INF', '-INF' as legal floating-point values when formatType=json. Because the JSON specification does not allow such values, this is a non-standard feature and is disabled by default."
maxBatchBytes,false,10000000,The maximum number of bytes in a batch.
parquetCodec,false,gzip,"Compression codec used when formatType=parquet. Available compression types are: null (no compression), snappy, gzip, lzo, brotli, lz4, zstd."
partitionerType,false,partition,"The partitioning type. It can be configured by topic partitions or by time. By default, the partition type is configured by topic partitions."
partitionerUseIndexAsOffset,false,false,"Whether to use Pulsar's message index as the offset instead of the record sequence. Recommended if the incoming messages may be batched. The brokers may or may not expose the index metadata; if it is not present on the record, the sequence is used. See PIP-70 for more details."
pathPrefix,false,false,"If set, the output files are stored in a folder under the given bucket path. The pathPrefix must be in the format of xx/xxx/."
pendingQueueSize,false,10,"The number of records buffered in the queue. By default, it is equal to batchSize. You can set it manually."
skipFailedMessages,false,false,"Configure whether to skip a message that fails to be processed. If set to true, the connector skips the failed message by acknowledging it. Otherwise, the connector fails the message."
sliceTopicPartitionPath,false,false,"When set to true, split the partitioned topic name into separate folders in the bucket path."
timePartitionDuration,false,86400000,"The time interval for time-based partitioning. Supports formatted interval strings, such as 30d, 24h, 30m, 10s, and numbers in millisecond precision; for example, 86400000 refers to 24h or 1d."
timePartitionPattern,false,yyyy-MM-dd,"The format pattern of the time-based partitioning. For details, refer to the Java date and time format."
useHumanReadableMessageId,false,false,"Use a human-readable format string for the messageId in message metadata. The messageId is in a format like ledgerId:entryId:partitionIndex:batchIndex. Otherwise, the messageId is a hex-encoded string."
useHumanReadableSchemaVersion,false,false,"Use a human-readable format string for the schema version in the message metadata. If set to true, the schema version is in plain string format. Otherwise, the schema version is in hex-encoded string format."
withMetadata,false,false,Save message attributes to metadata.
withTopicPartitionNumber,false,true,"When set to true, include the topic partition number in the object path."
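
Similarly, a Google Cloud Storage configs sketch might look like the following. The bucket name and key file path are placeholders, and the key could instead be supplied inline via gcsServiceAccountKeyFileContent or resolved from GOOGLE_APPLICATION_CREDENTIALS.

# Hypothetical example: Google Cloud Storage sink configs (placeholder values only).
cat > gcs-sink-configs.json << 'EOF'
{
  "provider": "google-cloud-storage",
  "bucket": "example-sink-bucket",
  "gcsServiceAccountKeyFilePath": "/path/to/service-account-key.json",
  "formatType": "json",
  "partitionerType": "time",
  "timePartitionDuration": "1d",
  "timePartitionPattern": "yyyy-MM-dd"
}
EOF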
Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
./bin/pulsar-admin sinks create \
  --sink-type cloud-storage \
  --name "$SINK_NAME" \
  --inputs "$TENANT/$NAMESPACE/$INPUT_TOPIC" \
  --tenant "$TENANT" \
  --processing-guarantees EFFECTIVELY_ONCE \
  --sink-config '{ <see below reference for storage specifics> }'
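
As a concrete sketch, the same create call with --sink-config filled in for a hypothetical aws-s3 bucket might look like the following; every bucket, endpoint, and credential value is a placeholder drawn from the S3 reference table above.

# Hypothetical example only -- replace the placeholder bucket, endpoint, and credentials.
./bin/pulsar-admin sinks create \
  --sink-type cloud-storage \
  --name "$SINK_NAME" \
  --inputs "$TENANT/$NAMESPACE/$INPUT_TOPIC" \
  --tenant "$TENANT" \
  --processing-guarantees EFFECTIVELY_ONCE \
  --sink-config '{"provider":"aws-s3","bucket":"example-sink-bucket","endpoint":"https://s3.us-east-1.amazonaws.com","accessKeyId":"EXAMPLE_ACCESS_KEY_ID","secretAccessKey":"EXAMPLE_SECRET_ACCESS_KEY","formatType":"json","batchSize":10}'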
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
./bin/pulsar-admin sinks update \
  --sink-type cloud-storage \
  --name "$SINK_NAME" \
  --inputs "$TENANT/$NAMESPACE/$INPUT_TOPIC" \
  --tenant "$TENANT" \
  --parallelism 2
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
xxxx
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
Writer Format,withMetadata
Avro,✅
JSON,✅
Parquet,✅ *
Bytes,❌
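
For example, to keep message attributes alongside the payload you could pair withMetadata with one of the supported writer formats; a minimal sketch (file name and values are illustrative) follows. Bytes output ignores withMetadata, and Parquet support is qualified above.

# Sketch only: enable metadata output with the Avro writer.
cat > avro-with-metadata-configs.json << 'EOF'
{
  "formatType": "avro",
  "withMetadata": true
}
EOF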
