
Commit 4df8f8e

Merge pull request #1248 from KimDoKy/s3-chunksize
Refs/heads/s3 chunksize
2 parents 09bd903 + 0a8d75f commit 4df8f8e

File tree

4 files changed: +34 -15 lines changed


ReadMe.md

Lines changed: 1 addition & 0 deletions
@@ -265,6 +265,7 @@ s3:
  use_custom_storage_class: false # S3_USE_CUSTOM_STORAGE_CLASS
  storage_class: STANDARD # S3_STORAGE_CLASS, by default allow only from list https://github.com/aws/aws-sdk-go-v2/blob/main/service/s3/types/enums.go#L787-L799
  concurrency: 1 # S3_CONCURRENCY
+ chunk_size: 0 # S3_CHUNK_SIZE, default 0: remoteSize / max_parts_count
  max_parts_count: 4000 # S3_MAX_PARTS_COUNT, number of parts for S3 multipart uploads and downloads
  allow_multipart_download: false # S3_ALLOW_MULTIPART_DOWNLOAD, allow faster multipart download speed, but will require additional disk space, download_concurrency * part size in worst case
  checksum_algorithm: "" # S3_CHECKSUM_ALGORITHM, use it when you use object lock which allow to avoid delete keys from bucket until some timeout after creation, use CRC32 as fastest
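
A short worked example of the documented default (an illustrative Go sketch, not repository code): with chunk_size: 0 the part size is derived from the object size and max_parts_count, rounded up and then clamped to the 5 MiB minimum that S3 multipart transfers require.

package main

import "fmt"

func main() {
	remoteSize := int64(100) * 1024 * 1024 * 1024 // hypothetical 100 GiB object
	maxPartsCount := int64(4000)                  // default max_parts_count

	partSize := remoteSize / maxPartsCount
	if remoteSize%maxPartsCount > 0 {
		partSize++ // bump by one byte so the object fits within maxPartsCount parts
	}
	if partSize < 5*1024*1024 {
		partSize = 5 * 1024 * 1024 // S3 multipart lower bound
	}
	fmt.Printf("derived part size: %d bytes (%.2f MiB)\n", partSize, float64(partSize)/(1024*1024))
}

Setting chunk_size (or the S3_CHUNK_SIZE environment variable) to a non-zero value overrides this derivation, as implemented in pkg/storage/s3.go below.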

pkg/config/config.go

Lines changed: 2 additions & 0 deletions
@@ -154,6 +154,7 @@ type S3Config struct {
  RequestPayer string `yaml:"request_payer" envconfig:"S3_REQUEST_PAYER"`
  CheckSumAlgorithm string `yaml:"check_sum_algorithm" envconfig:"S3_CHECKSUM_ALGORITHM"`
  RetryMode string `yaml:"retry_mode" envconfig:"S3_RETRY_MODE"`
+ ChunkSize int64 `yaml:"chunk_size" envconfig:"S3_CHUNK_SIZE"`
  Debug bool `yaml:"debug" envconfig:"S3_DEBUG"`
  }

@@ -624,6 +625,7 @@ func DefaultConfig() *Config {
      Concurrency: int(downloadConcurrency + 1),
      MaxPartsCount: 4000,
      RetryMode: string(aws.RetryModeStandard),
+     ChunkSize: 5 * 1024 * 1024,
  },
  GCS: GCSConfig{
      CompressionLevel: 1,
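
For illustration of how the new field is picked up from the environment, here is a minimal sketch assuming the kelseyhightower/envconfig package implied by the struct tags; the s3Settings type is hypothetical and mirrors only the two fields relevant to this change.

package main

import (
	"fmt"
	"log"

	"github.com/kelseyhightower/envconfig"
)

// s3Settings is a hypothetical stand-in for the project's S3Config,
// reduced to the fields touched by or related to this commit.
type s3Settings struct {
	ChunkSize     int64 `yaml:"chunk_size" envconfig:"S3_CHUNK_SIZE"`
	MaxPartsCount int64 `yaml:"max_parts_count" envconfig:"S3_MAX_PARTS_COUNT"`
}

func main() {
	// Start from the defaults set in DefaultConfig() above.
	cfg := s3Settings{ChunkSize: 5 * 1024 * 1024, MaxPartsCount: 4000}
	// envconfig.Process overwrites fields from the environment; with an empty
	// prefix, the tag names S3_CHUNK_SIZE and S3_MAX_PARTS_COUNT are looked up.
	if err := envconfig.Process("", &cfg); err != nil {
		log.Fatal(err)
	}
	fmt.Printf("chunk_size=%d, max_parts_count=%d\n", cfg.ChunkSize, cfg.MaxPartsCount)
}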

pkg/storage/s3.go

Lines changed: 30 additions & 10 deletions
@@ -257,11 +257,18 @@ func (s *S3) GetFileReaderWithLocalPath(ctx context.Context, key, localPath stri
  downloader := s3manager.NewDownloader(s.client)
  downloader.Concurrency = s.Concurrency
  downloader.BufferProvider = s3manager.NewPooledBufferedWriterReadFromProvider(s.BufferSize)
- partSize := remoteSize / s.Config.MaxPartsCount
- if remoteSize%s.Config.MaxPartsCount > 0 {
-     partSize += max(1, (remoteSize%s.Config.MaxPartsCount)/s.Config.MaxPartsCount)
+ var partSize int64
+ if s.Config.ChunkSize > 0 && (remoteSize+s.Config.ChunkSize-1)/s.Config.ChunkSize < 10000 {
+     // Use configured chunk size
+     partSize = s.Config.ChunkSize
+ } else {
+     partSize = remoteSize / s.Config.MaxPartsCount
+     if remoteSize%s.Config.MaxPartsCount > 0 {
+         partSize += max(1, (remoteSize%s.Config.MaxPartsCount)/s.Config.MaxPartsCount)
+     }
  }
  downloader.PartSize = AdjustValueByRange(partSize, 5*1024*1024, 5*1024*1024*1024)
+ log.Debug().Msgf("S3 Download PartSize: %d bytes (%.2f MB)", downloader.PartSize, float64(downloader.PartSize)/(1024*1024))

  _, err = downloader.Download(ctx, writer, &s3.GetObjectInput{
      Bucket: aws.String(s.Config.Bucket),
@@ -327,11 +334,18 @@ func (s *S3) PutFileAbsolute(ctx context.Context, key string, r io.ReadCloser, l
  uploader := s3manager.NewUploader(s.client)
  uploader.Concurrency = s.Concurrency
  uploader.BufferProvider = s3manager.NewBufferedReadSeekerWriteToPool(s.BufferSize)
- partSize := localSize / s.Config.MaxPartsCount
- if localSize%s.Config.MaxPartsCount > 0 {
-     partSize += max(1, (localSize%s.Config.MaxPartsCount)/s.Config.MaxPartsCount)
+ var partSize int64
+ if s.Config.ChunkSize > 0 && (localSize+s.Config.ChunkSize-1)/s.Config.ChunkSize < 10000 {
+     partSize = s.Config.ChunkSize
+ } else {
+     partSize = localSize / s.Config.MaxPartsCount
+     if localSize%s.Config.MaxPartsCount > 0 {
+         partSize += max(1, (localSize%s.Config.MaxPartsCount)/s.Config.MaxPartsCount)
+     }
  }
  uploader.PartSize = AdjustValueByRange(partSize, 5*1024*1024, 5*1024*1024*1024)
+ log.Debug().Msgf("S3 Upload PartSize: %d bytes (%.2f MB)", uploader.PartSize, float64(uploader.PartSize)/(1024*1024))
+
  _, err := uploader.Upload(ctx, &params)
  return err
  }
@@ -532,12 +546,18 @@ func (s *S3) CopyObject(ctx context.Context, srcSize int64, srcBucket, srcKey, d
  // Get the upload ID
  uploadID := initResp.UploadId

- // Set the part size (128 MB minimum)
- partSize := srcSize / s.Config.MaxPartsCount
- if srcSize%s.Config.MaxPartsCount > 0 {
-     partSize += max(1, (srcSize%s.Config.MaxPartsCount)/s.Config.MaxPartsCount)
+ // Set the part size (128 MB minimum for CopyObject, or use configured chunk size)
+ var partSize int64
+ if s.Config.ChunkSize > 0 && (srcSize+s.Config.ChunkSize-1)/s.Config.ChunkSize < 10000 {
+     partSize = s.Config.ChunkSize
+ } else {
+     partSize = srcSize / s.Config.MaxPartsCount
+     if srcSize%s.Config.MaxPartsCount > 0 {
+         partSize += max(1, (srcSize%s.Config.MaxPartsCount)/s.Config.MaxPartsCount)
+     }
  }
  partSize = AdjustValueByRange(partSize, 128*1024*1024, 5*1024*1024*1024)
+ log.Debug().Msgf("S3 CopyObject PartSize: %d bytes (%.2f MB)", partSize, float64(partSize)/(1024*1024))

  // Calculate the number of parts
  numParts := (srcSize + partSize - 1) / partSize
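
The three hunks above share the same selection pattern. Below is a minimal standalone sketch of it (selectPartSize and its parameters are hypothetical names, not repository code): a configured chunk size is honored only while the object still fits into fewer than 10000 parts, the S3 multipart limit; otherwise the part size is derived from max_parts_count, and in every case the result is clamped the way AdjustValueByRange does (5 MiB to 5 GiB for uploads and downloads, 128 MiB minimum for server-side copies).

package main

import "fmt"

// selectPartSize is an illustrative helper, not repository code. A chunk size
// of zero, or one that would require 10000 or more parts, falls back to
// deriving the part size from maxPartsCount; the result is then clamped to
// [minPart, maxPart], mirroring AdjustValueByRange in pkg/storage/s3.go.
func selectPartSize(objectSize, chunkSize, maxPartsCount, minPart, maxPart int64) int64 {
	var partSize int64
	if chunkSize > 0 && (objectSize+chunkSize-1)/chunkSize < 10000 {
		partSize = chunkSize
	} else {
		partSize = objectSize / maxPartsCount
		if objectSize%maxPartsCount > 0 {
			partSize++ // bump so the object fits within maxPartsCount parts
		}
	}
	if partSize < minPart {
		partSize = minPart
	}
	if partSize > maxPart {
		partSize = maxPart
	}
	return partSize
}

func main() {
	const MiB = int64(1024 * 1024)
	// 200 GiB object with chunk_size 64 MiB -> 3200 parts, so the chunk size is used.
	fmt.Println(selectPartSize(200*1024*MiB, 64*MiB, 4000, 5*MiB, 5*1024*MiB))
	// The same object with chunk_size 16 MiB would need 12800 parts, so the
	// max_parts_count fallback applies instead.
	fmt.Println(selectPartSize(200*1024*MiB, 16*MiB, 4000, 5*MiB, 5*1024*MiB))
}
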
Lines changed: 1 addition & 5 deletions
@@ -1,6 +1,2 @@
- default_config = r"""'[\'general:\', \' remote_storage: none\', \' backups_to_keep_local: 0\', \' backups_to_keep_remote: 0\', \' log_level: info\', \' allow_empty_backups: false\', \' allow_object_disk_streaming: false\', \' use_resumable_state: true\', \' restore_schema_on_cluster: ""\', \' upload_by_part: true\', \' download_by_part: true\', \' restore_database_mapping: {}\', \' restore_table_mapping: {}\', \' retries_on_failure: 3\', \' retries_pause: 5s\', \' retries_jitter: 0\', \' watch_interval: 1h\', \' full_interval: 24h\', \' watch_backup_name_template: shard{shard}-{type}-{time:20060102150405}\', \' sharded_operation_mode: ""\', \' cpu_nice_priority: 15\', \' io_nice_priority: idle\', \' rbac_backup_always: true\', \' rbac_conflict_resolution: recreate\', \' config_backup_always: false\', \' named_collections_backup_always: false\', \' retriesduration: 5s\', \' watchduration: 1h0m0s\', \' fullduration: 24h0m0s\', \'clickhouse:\', \' username: default\', \' password: ""\', \' host: localhost\', \' port: 9000\', \' disk_mapping: {}\', \' skip_tables:\', \' - system.*\', \' - INFORMATION_SCHEMA.*\', \' - information_schema.*\', \' - _temporary_and_external_tables.*\', \' skip_table_engines: []\', \' skip_disks: []\', \' skip_disk_types: []\', \' timeout: 30m\', \' freeze_by_part: false\', \' freeze_by_part_where: ""\', \' use_embedded_backup_restore: false\', \' embedded_backup_disk: ""\', \' backup_mutations: true\', \' restore_as_attach: false\', \' check_parts_columns: true\', \' secure: false\', \' skip_verify: false\', \' sync_replicated_tables: false\', \' log_sql_queries: true\', \' config_dir: /etc/clickhouse-server/\', \' restart_command: exec:systemctl restart clickhouse-server\', \' ignore_not_exists_error_during_freeze: true\', \' check_replicas_before_attach: true\', \' default_replica_path: /clickhouse/tables/{cluster}/{shard}/{database}/{table}\', " default_replica_name: \'{replica}\'", \' tls_key: ""\', \' tls_cert: ""\', \' tls_ca: ""\', \' debug: false\', \'s3:\', \' access_key: ""\', \' secret_key: ""\', \' bucket: ""\', \' endpoint: ""\', \' region: us-east-1\', \' acl: private\', \' assume_role_arn: ""\', \' force_path_style: false\', \' path: ""\', \' object_disk_path: ""\', \' disable_ssl: false\', \' compression_level: 1\', \' compression_format: tar\', \' sse: ""\', \' sse_kms_key_id: ""\', \' sse_customer_algorithm: ""\', \' sse_customer_key: ""\', \' sse_customer_key_md5: ""\', \' sse_kms_encryption_context: ""\', \' disable_cert_verification: false\', \' use_custom_storage_class: false\', \' storage_class: STANDARD\', \' custom_storage_class_map: {}\', \' allow_multipart_download: false\', \' object_labels: {}\', \' request_payer: ""\', \' check_sum_algorithm: ""\', \' retry_mode: standard\', \' debug: false\', \'gcs:\', \' credentials_file: ""\', \' credentials_json: ""\', \' credentials_json_encoded: ""\', \' sa_email: ""\', \' embedded_access_key: ""\', \' embedded_secret_key: ""\', \' skip_credentials: false\', \' bucket: ""\', \' path: ""\', \' object_disk_path: ""\', \' compression_level: 1\', \' compression_format: tar\', \' debug: false\', \' force_http: false\', \' endpoint: ""\', \' storage_class: STANDARD\', \' object_labels: {}\', \' custom_storage_class_map: {}\', \' chunk_size: 0\', \'cos:\', \' url: ""\', \' timeout: 2m\', \' secret_id: ""\', \' secret_key: ""\', \' path: ""\', \' object_disk_path: ""\', \' compression_format: tar\', \' compression_level: 1\', \' allow_multipart_download: false\', \' debug: false\', \'api:\', \' listen: localhost:7171\', \' enable_metrics: true\', \' enable_pprof: false\', \' username: ""\', \' password: ""\', \' secure: false\', \' certificate_file: ""\', \' private_key_file: ""\', \' ca_cert_file: ""\', \' ca_key_file: ""\', \' create_integration_tables: false\', \' integration_tables_host: ""\', \' allow_parallel: false\', \' complete_resumable_after_restart: true\', \' watch_is_main_process: false\', \'ftp:\', \' address: ""\', \' timeout: 2m\', \' username: ""\', \' password: ""\', \' tls: false\', \' skip_tls_verify: false\', \' path: ""\', \' object_disk_path: ""\', \' compression_format: tar\', \' compression_level: 1\', \' debug: false\', \'sftp:\', \' address: ""\', \' port: 22\', \' username: ""\', \' password: ""\', \' key: ""\', \' path: ""\', \' object_disk_path: ""\', \' compression_format: tar\', \' compression_level: 1\', \' debug: false\', \'azblob:\', \' endpoint_schema: https\', \' endpoint_suffix: core.windows.net\', \' account_name: ""\', \' account_key: ""\', \' sas: ""\', \' use_managed_identity: false\', \' container: ""\', \' assume_container_exists: false\', \' path: ""\', \' object_disk_path: ""\', \' compression_level: 1\', \' compression_format: tar\', \' sse_key: ""\', \' buffer_count: 3\', \' timeout: 4h\', \' debug: false\', \'custom:\', \' upload_command: ""\', \' download_command: ""\', \' list_command: ""\', \' delete_command: ""\', \' command_timeout: 4h\', \' commandtimeoutduration: 4h0m0s\']'"""
-
- help_flag = r"""'NAME:\n clickhouse-backup - Tool for easy backup of ClickHouse with cloud supportUSAGE:\n clickhouse-backup <command> [-t, --tables=<db>.<table>] <backup_name>DESCRIPTION:\n Run as \'root\' or \'clickhouse\' userCOMMANDS:\n tables List of tables, exclude skip_tables\n create Create new backup\n create_remote Create and upload new backup\n upload Upload backup to remote storage\n list List of backups\n download Download backup from remote storage\n restore Create schema and restore data from backup\n restore_remote Download and restore\n delete Delete specific backup\n default-config Print default config\n print-config Print current config merged with environment variables\n clean Remove data in \'shadow\' folder from all \'path\' folders available from \'system.disks\'\n clean_remote_broken Remove all broken remote backups\n clean_local_broken Remove all broken local backups\n watch Run infinite loop which create full + incremental backup sequence to allow efficient backup sequences\n server Run API server\n help, h Shows a list of commands or help for one commandGLOBAL OPTIONS:\n --config value, -c value Config \'FILE\' name. (default: "/etc/clickhouse-backup/config.yml") [$CLICKHOUSE_BACKUP_CONFIG]\n --environment-override value, --env value override any environment variable via CLI parameter\n --help, -h show help\n --version, -v print the version'"""
-
- cli_usage = r"""'NAME:\n clickhouse-backup - Tool for easy backup of ClickHouse with cloud supportUSAGE:\n clickhouse-backup <command> [-t, --tables=<db>.<table>] <backup_name>DESCRIPTION:\n Run as \'root\' or \'clickhouse\' userCOMMANDS:\n tables List of tables, exclude skip_tables\n create Create new backup\n create_remote Create and upload new backup\n upload Upload backup to remote storage\n list List of backups\n download Download backup from remote storage\n restore Create schema and restore data from backup\n restore_remote Download and restore\n delete Delete specific backup\n default-config Print default config\n print-config Print current config merged with environment variables\n clean Remove data in \'shadow\' folder from all \'path\' folders available from \'system.disks\'\n clean_remote_broken Remove all broken remote backups\n clean_local_broken Remove all broken local backups\n watch Run infinite loop which create full + incremental backup sequence to allow efficient backup sequences\n server Run API server\n help, h Shows a list of commands or help for one commandGLOBAL OPTIONS:\n --config value, -c value Config \'FILE\' name. (default: "/etc/clickhouse-backup/config.yml") [$CLICKHOUSE_BACKUP_CONFIG]\n --environment-override value, --env value override any environment variable via CLI parameter\n --help, -h show help\n --version, -v print the version'"""
+ default_config = r"""'[\'general:\', \' remote_storage: none\', \' backups_to_keep_local: 0\', \' backups_to_keep_remote: 0\', \' log_level: info\', \' allow_empty_backups: false\', \' allow_object_disk_streaming: false\', \' use_resumable_state: true\', \' restore_schema_on_cluster: ""\', \' upload_by_part: true\', \' download_by_part: true\', \' restore_database_mapping: {}\', \' restore_table_mapping: {}\', \' retries_on_failure: 3\', \' retries_pause: 5s\', \' retries_jitter: 0\', \' watch_interval: 1h\', \' full_interval: 24h\', \' watch_backup_name_template: shard{shard}-{type}-{time:20060102150405}\', \' sharded_operation_mode: ""\', \' cpu_nice_priority: 15\', \' io_nice_priority: idle\', \' rbac_backup_always: true\', \' rbac_conflict_resolution: recreate\', \' config_backup_always: false\', \' named_collections_backup_always: false\', \' retriesduration: 5s\', \' watchduration: 1h0m0s\', \' fullduration: 24h0m0s\', \'clickhouse:\', \' username: default\', \' password: ""\', \' host: localhost\', \' port: 9000\', \' disk_mapping: {}\', \' skip_tables:\', \' - system.*\', \' - INFORMATION_SCHEMA.*\', \' - information_schema.*\', \' - _temporary_and_external_tables.*\', \' skip_table_engines: []\', \' skip_disks: []\', \' skip_disk_types: []\', \' timeout: 30m\', \' freeze_by_part: false\', \' freeze_by_part_where: ""\', \' use_embedded_backup_restore: false\', \' embedded_backup_disk: ""\', \' backup_mutations: true\', \' restore_as_attach: false\', \' check_parts_columns: true\', \' secure: false\', \' skip_verify: false\', \' sync_replicated_tables: false\', \' log_sql_queries: true\', \' config_dir: /etc/clickhouse-server/\', \' restart_command: exec:systemctl restart clickhouse-server\', \' ignore_not_exists_error_during_freeze: true\', \' check_replicas_before_attach: true\', \' default_replica_path: /clickhouse/tables/{cluster}/{shard}/{database}/{table}\', " default_replica_name: \'{replica}\'", \' tls_key: ""\', \' tls_cert: ""\', \' tls_ca: ""\', \' debug: false\', \'s3:\', \' access_key: ""\', \' secret_key: ""\', \' bucket: ""\', \' endpoint: ""\', \' region: us-east-1\', \' acl: private\', \' assume_role_arn: ""\', \' force_path_style: false\', \' path: ""\', \' object_disk_path: ""\', \' disable_ssl: false\', \' compression_level: 1\', \' compression_format: tar\', \' sse: ""\', \' sse_kms_key_id: ""\', \' sse_customer_algorithm: ""\', \' sse_customer_key: ""\', \' sse_customer_key_md5: ""\', \' sse_kms_encryption_context: ""\', \' disable_cert_verification: false\', \' use_custom_storage_class: false\', \' storage_class: STANDARD\', \' custom_storage_class_map: {}\', \' allow_multipart_download: false\', \' object_labels: {}\', \' request_payer: ""\', \' check_sum_algorithm: ""\', \' retry_mode: standard\', \' chunk_size: 5242880\', \' debug: false\', \'gcs:\', \' credentials_file: ""\', \' credentials_json: ""\', \' credentials_json_encoded: ""\', \' sa_email: ""\', \' embedded_access_key: ""\', \' embedded_secret_key: ""\', \' skip_credentials: false\', \' bucket: ""\', \' path: ""\', \' object_disk_path: ""\', \' compression_level: 1\', \' compression_format: tar\', \' debug: false\', \' force_http: false\', \' endpoint: ""\', \' storage_class: STANDARD\', \' object_labels: {}\', \' custom_storage_class_map: {}\', \' chunk_size: 0\', \'cos:\', \' url: ""\', \' timeout: 2m\', \' secret_id: ""\', \' secret_key: ""\', \' path: ""\', \' object_disk_path: ""\', \' compression_format: tar\', \' compression_level: 1\', \' allow_multipart_download: false\', \' debug: false\', \'api:\', \' listen: localhost:7171\', \' enable_metrics: true\', \' enable_pprof: false\', \' username: ""\', \' password: ""\', \' secure: false\', \' certificate_file: ""\', \' private_key_file: ""\', \' ca_cert_file: ""\', \' ca_key_file: ""\', \' create_integration_tables: false\', \' integration_tables_host: ""\', \' allow_parallel: false\', \' complete_resumable_after_restart: true\', \' watch_is_main_process: false\', \'ftp:\', \' address: ""\', \' timeout: 2m\', \' username: ""\', \' password: ""\', \' tls: false\', \' skip_tls_verify: false\', \' path: ""\', \' object_disk_path: ""\', \' compression_format: tar\', \' compression_level: 1\', \' debug: false\', \'sftp:\', \' address: ""\', \' port: 22\', \' username: ""\', \' password: ""\', \' key: ""\', \' path: ""\', \' object_disk_path: ""\', \' compression_format: tar\', \' compression_level: 1\', \' debug: false\', \'azblob:\', \' endpoint_schema: https\', \' endpoint_suffix: core.windows.net\', \' account_name: ""\', \' account_key: ""\', \' sas: ""\', \' use_managed_identity: false\', \' container: ""\', \' assume_container_exists: false\', \' path: ""\', \' object_disk_path: ""\', \' compression_level: 1\', \' compression_format: tar\', \' sse_key: ""\', \' buffer_count: 3\', \' timeout: 4h\', \' debug: false\', \'custom:\', \' upload_command: ""\', \' download_command: ""\', \' list_command: ""\', \' delete_command: ""\', \' command_timeout: 4h\', \' commandtimeoutduration: 4h0m0s\']'"""
