#
# You must set either .host or .scb.
# -----------------------------------------------------------------------------------------------------------
- spark.cdm.origin.connect.host localhost
+ spark.cdm.origin.connect.host cass-origin
spark.cdm.origin.connect.port 9042
# spark.cdm.origin.connect.scb file:///aaa/bbb/secure-connect-enterprise.zip
spark.cdm.origin.connect.username cassandra
spark.cdm.origin.connect.password cassandra

- spark.cdm.target.connect.host localhost
+ spark.cdm.target.connect.host cass-target
spark.cdm.target.connect.port 9042
# spark.cdm.target.connect.scb file:///aaa/bbb/secure-connect-enterprise.zip
spark.cdm.target.connect.username cassandra
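Host/port and a Secure Connect Bundle are mutually exclusive ways to reach a cluster, as noted above. A minimal sketch of the bundle variant for the target side (the bundle path and credentials are placeholders, assuming an Astra-style bundle where a token client ID and secret serve as username and password; leave .host/.port unset in that case):

    # spark.cdm.target.connect.scb        file:///path/to/secure-connect-target.zip
    # spark.cdm.target.connect.username   <client_id>
    # spark.cdm.target.connect.password   <client_secret>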
@@ -53,14 +53,29 @@ spark.cdm.target.connect.password cassandra
# Recommended Parameters:
# spark.cdm.schema.origin
#   .column
- #     .ttl.names      : Default is empty. Names from .column.names to be combined using the MAX
- #                       function to determine the TTL of the entire migrated record. Will use target
- #                       table default when not set. The names cannot include any columns listed in
- #                       partition-key,clustering-key.
- #     .writetime.names: Default is empty. Names from .column.names to be combined using the MAX
- #                       function to determine the TIMESTAMP of the entire migrated record. Will use
- #                       target table default when not set. The names cannot include any columns
- #                       listed in the primary key e.g. partition-key,clustering-key
+ #     .ttl
+ #       .automatic    : Default is true, unless .ttl.names is specified. When true, the TTL of the
+ #                       target record will be determined by finding the maximum TTL of
+ #                       all origin columns that can have TTL set (which excludes partition key,
+ #                       clustering key, collections/UDT/tuple, and frozen columns). When false, and
+ #                       .names is not set, the target record will have the TTL determined by the target
+ #                       table configuration.
+ #       .names        : Default is empty, meaning the eligible columns are determined automatically
+ #                       when .automatic is true (see above). Specify a subset of eligible columns
+ #                       to be used to calculate the TTL of the target record.
+ #     .writetime
+ #       .automatic    : Default is true, unless .writetime.names is specified. When true, the WRITETIME
+ #                       of the target record will be determined by finding the maximum WRITETIME of
+ #                       all origin columns that can have WRITETIME set (which excludes partition key,
+ #                       clustering key, collections/UDT/tuple, and frozen columns). When false, and
+ #                       .names is not set, the target record will have the WRITETIME determined by the
+ #                       target table configuration.
+ #
+ #                       *** Note: spark.cdm.transform.custom.writetime overrides this setting ***
+ #
+ #       .names        : Default is empty, meaning the eligible columns are determined automatically
+ #                       when .automatic is true (see above). Specify a subset of eligible columns
+ #                       to be used to calculate the WRITETIME of the target record.
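To make the MAX behaviour concrete, a small sketch with hypothetical column names; only the two named columns feed the calculation:

    # spark.cdm.schema.origin.column.ttl.names         reading_value,reading_status
    # spark.cdm.schema.origin.column.writetime.names   reading_value,reading_status
    # If writetime(reading_value) = 1700000000000000 and writetime(reading_status) = 1700000000500000,
    # the migrated record is written with the larger value, 1700000000500000 (microseconds since epoch).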
#
# Other Parameters:
# spark.cdm.schema.origin
@@ -70,10 +85,12 @@ spark.cdm.target.connect.password cassandra
#                       origin_column_name:target_column_name. The list is comma-separated. Only renamed
#                       columns need to be listed.
# -----------------------------------------------------------------------------------------------------------
- spark.cdm.schema.origin.keyspaceTable keyspace_name.table_name
- spark.cdm.schema.origin.column.ttl.names data_col1,data_col2,...
- spark.cdm.schema.origin.column.writetime.names data_col1,data_col2,...
- # spark.cdm.schema.origin.column.names.to.target partition_col1:partition_col_1,partition_col2:partition_col_2,...
+ spark.cdm.schema.origin.keyspaceTable devices.sensor_data
+ # spark.cdm.schema.origin.column.ttl.automatic true
+ # spark.cdm.schema.origin.column.ttl.names data_col1,data_col2,...
+ # spark.cdm.schema.origin.column.writetime.automatic true
+ # spark.cdm.schema.origin.column.writetime.names data_col1,data_col2,...
+ # spark.cdm.schema.origin.column.names.to.target partition_col1:partition_col_1,partition_col2:partition_col_2,...
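A sketch of the rename mapping with hypothetical column names; columns that keep the same name in the target are simply omitted from the list:

    # Origin column sensor_uuid was renamed to sensor_id, and reading_ts to reading_time:
    # spark.cdm.schema.origin.column.names.to.target   sensor_uuid:sensor_id,reading_ts:reading_time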

# ===========================================================================================================
# Details about the Target Schema
@@ -82,7 +99,7 @@ spark.cdm.schema.origin.column.writetime.names data_col1,data_col2,...
# spark.cdm.schema.target
#   .keyspaceTable  : <keyspace>.<table_name> of the table to be migrated. Table must exist in Target.
# -----------------------------------------------------------------------------------------------------------
- spark.cdm.schema.target.keyspaceTable keyspace_name.table_name
+ spark.cdm.schema.target.keyspaceTable devices.sensor_data

# ===========================================================================================================
# Autocorrection parameters allow CDM to correct data differences found between Origin and Target when
@@ -126,7 +143,7 @@ spark.cdm.autocorrect.mismatch false
# spark.cdm.perfops
#   .numParts       : Default is 10000. In standard operation, the full token range (-2^63..2^63-1)
#                     is divided into a number of parts which will be parallel-processed. You should
- #                     aim for each part to comprise a total of ≈1 -10GB of data to migrate. During
+ #                     aim for each part to comprise a total of ≈1-10GB of data to migrate. During
#                     initial testing, you may want this to be a small number (even 1).
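A rough sizing example for the guidance above, using a hypothetical table size:

    # For a ~2 TB origin table, the default of 10000 parts works out to roughly 200 MB per part.
    # Reducing numParts to about 1000 brings each part to ~2 GB, inside the suggested 1-10 GB range.
    # spark.cdm.perfops.numParts 1000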
#   .batchSize      : Default is 5. When writing to Target, this comprises the number of records that
#                     will be put into an UNLOGGED batch. CDM will tend to work on the same partition
@@ -158,8 +175,8 @@ spark.cdm.autocorrect.mismatch false
# -----------------------------------------------------------------------------------------------------------
spark.cdm.perfops.numParts 10000
spark.cdm.perfops.batchSize 5
- spark.cdm.perfops.readRateLimit 20000
- spark.cdm.perfops.writeRateLimit 40000
+ spark.cdm.perfops.readRateLimit 5000
+ spark.cdm.perfops.writeRateLimit 5000
# spark.cdm.perfops.consistency.read LOCAL_QUORUM
# spark.cdm.perfops.consistency.write LOCAL_QUORUM
# spark.cdm.perfops.printStatsAfter 100000
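For a sense of scale (assuming, as the parameter names suggest, that these limits are records per second across the job; the full parameter descriptions earlier in this file are authoritative):

    # readRateLimit 5000 and writeRateLimit 5000 cap the migration at roughly 5000 records read and
    # 5000 records written per second; with batchSize 5 that is at most about 1000 UNLOGGED batches
    # per second against the target.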
@@ -177,12 +194,18 @@ spark.cdm.perfops.writeRateLimit 40000
#                           MigrateData operation would fail. This parameter allows a crude
#                           constant value to be used in its place, separate from the Constant
#                           Values feature.
- #   .custom.writetime       Default is 0 (diabled). Timestamp value in microseconds to use as the
+ #   .custom
+ #     .writetime            Default is 0 (disabled). Timestamp value in microseconds to use as the
#                           WRITETIME for the target record. This is useful when the WRITETIME of
#                           the record in Origin cannot be determined (such as the only non-key
#                           columns are collections). This parameter allows a crude constant value
#                           to be used in its place, and overrides
#                           .schema.origin.column.writetime.indexes.
+ #     .writetime.incrementBy  Default is 0. This is useful when you have a List that is not frozen,
+ #                           and are updating this via the autocorrect feature. Lists are not idempotent,
+ #                           and subsequent UPSERTs would add duplicates to the list. Future versions
+ #                           of CDM may tombstone the previous list, but for now this solution is
+ #                           viable and, crucially, more performant.
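Since .custom.writetime is a microseconds-since-epoch value, a worked example (the timestamp itself is arbitrary, chosen only for illustration):

    # 1700000000000000 microseconds = 1,700,000,000 seconds = 2023-11-14T22:13:20Z
    # spark.cdm.transform.custom.writetime 1700000000000000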
#   .codecs                 Default is empty. A comma-separated list of additional codecs to
#                           enable. Current codecs are:
#                               INT_STRING        : int stored in a String
@@ -202,7 +225,8 @@ spark.cdm.perfops.writeRateLimit 40000
#       .string.zone        Default is UTC; Must be in ZoneRulesProvider.getAvailableZoneIds()
# -----------------------------------------------------------------------------------------------------------
# spark.cdm.transform.missing.key.ts.replace.value
- # spark.cdm.transform.custom.writetime 0
+ # spark.cdm.transform.custom.writetime 0
+ # spark.cdm.transform.custom.writetime.incrementBy 0
# spark.cdm.transform.codecs
# spark.cdm.transform.codecs.timestamp.string.format yyyyMMddHHmmss
# spark.cdm.transform.codecs.timestamp.string.zone UTC
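A worked reading of the two codec settings above (the sample value is hypothetical): with format yyyyMMddHHmmss and zone UTC, a string-encoded timestamp maps as follows.

    # The string "20231114221320", parsed with format yyyyMMddHHmmss in zone UTC,
    # corresponds to the timestamp 2023-11-14T22:13:20Z.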
@@ -287,6 +311,16 @@ spark.cdm.perfops.writeRateLimit 40000
# spark.cdm.feature.explodeMap.target.name.key my_map_key
# spark.cdm.feature.explodeMap.target.name.value my_map_value

+ # ===========================================================================================================
+ # Guardrail feature manages records that exceed guardrail checks. The Guardrail job will generate
+ # a report; other jobs will skip records that exceed the guardrail.
+ #
+ # spark.cdm.feature.guardrail
+ #   .colSizeInKB    Default 0, meaning the check is not done. Records with one or more fields that
+ #                   exceed this size will be flagged. Note this is kB (base 10), not kiB (base 2).
+ #
+ # ===========================================================================================================
+ # spark.cdm.feature.guardrail.colSizeInKB 1000
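A concrete reading of the example value above:

    # With colSizeInKB set to 1000, any record containing a field larger than 1000 kB
    # (1,000,000 bytes, about 1 MB) is flagged in the Guardrail report and skipped by other jobs.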

# ===========================================================================================================
# TLS (SSL) connection parameters, if so configured. Note that Secure Bundles embed these details.
@@ -320,3 +354,4 @@ spark.cdm.perfops.writeRateLimit 40000
# spark.cdm.target.connect.tls.keyStore.password
# spark.cdm.target.connect.tls.enabledAlgorithms TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA

+