Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
0ac4af6
Support ZSTD dictionary compression
yifan-c Aug 29, 2025
b9d3c6c
Add configs to cassandra.yaml and refactoring
yifan-c Sep 27, 2025
03a3fce
Handle CFS invalidate event and close the CompressionDictionaryManager
yifan-c Oct 3, 2025
eb52e6a
Added docs for zstd dict
rustyrazorblade Oct 7, 2025
3a5d532
review
smiklosovic Oct 8, 2025
032ac7d
Yifan's refactoring on top of Stefan's
yifan-c Oct 8, 2025
79fdd80
Revert the long to DictId change from Stefan
yifan-c Oct 8, 2025
1f8a0cc
Revert "Revert the long to DictId change from Stefan"
yifan-c Oct 8, 2025
5c3a535
Address Stefan's review comments
yifan-c Oct 8, 2025
302dec4
Fix currentDictionary being closed prematurely
yifan-c Oct 9, 2025
8baa36a
Display statistics when running traincompressiondictionary
yifan-c Oct 9, 2025
1e3eef9
Add --use-existing-sstables option in traincompressiondictionary
yifan-c Oct 10, 2025
77c928d
resolve remaining checkstyle issues
smiklosovic Oct 9, 2025
d2c8ea9
reuse constant
smiklosovic Oct 9, 2025
4763c03
rework nodetool command to use table builder
smiklosovic Oct 9, 2025
b1adf2b
log just message instead of whole stacktrace
smiklosovic Oct 9, 2025
17b2f08
minor change
yifan-c Oct 10, 2025
bb8fe74
Define DEFAULT_SAMPLING_DURATION_SECONDS
yifan-c Oct 10, 2025
5421ee8
simplify the close method in CompressionDictionaryCache
yifan-c Oct 11, 2025
798ebae
Fix issues in SSTableChunkSampler
yifan-c Oct 14, 2025
d9c7dea
Remove write-based sampling from nodetool traincompressiondictionary
yifan-c Oct 14, 2025
a802a61
Update traincompressiondictionary command
yifan-c Oct 15, 2025
3886427
Print error when table not found
yifan-c Oct 15, 2025
3ca20dc
Fix mbean not found issue
yifan-c Oct 15, 2025
430003a
Add ZstdDictionaryCompressor for auto-completion
yifan-c Oct 15, 2025
6f737df
Only register mbean when dictionary compression is enabled
yifan-c Oct 16, 2025
748ef86
Fix tests
yifan-c Oct 16, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
trunk?? (The current trunk is on 5.1)
* Support ZSTD dictionary compression (CASSANDRA-17021)

5.1
* Add cqlsh autocompletion for the identity mapping feature (CASSANDRA-20021)
* Add DDL Guardrail enabling administrators to disallow creation/modification of keyspaces with durable_writes = false (CASSANDRA-20913)
Expand Down
46 changes: 46 additions & 0 deletions conf/cassandra.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2849,3 +2849,49 @@ storage_compatibility_mode: NONE
# # especially in keyspaces with many tables. The splitter avoids batching tables together if they
# # exceed other configuration parameters like bytes_per_assignment or partitions_per_assignment.
# max_tables_per_assignment: 64

# Dictionary compression settings for ZSTD dictionary-based compression
# These settings control the automatic training and caching of compression dictionaries
# for tables that use ZSTD dictionary compression.

# How often to refresh compression dictionaries across the cluster.
# During refresh, nodes will check for newer dictionary versions and update their caches.
# Min unit: s
compression_dictionary_refresh_interval: 3600s

# Initial delay before starting the first dictionary refresh cycle after node startup.
# This prevents all nodes from refreshing simultaneously when the cluster starts.
# Min unit: s
compression_dictionary_refresh_initial_delay: 10s

# Maximum number of compression dictionaries to cache per table.
# Each table using dictionary compression can have multiple dictionaries cached
# (current version plus recently used versions for reading older SSTables).
compression_dictionary_cache_size: 10

# How long to keep compression dictionaries in the cache before they expire.
# Expired dictionaries will be removed from memory but can be reloaded if needed.
# Min unit: s
compression_dictionary_cache_expire: 24h

# Dictionary training configuration (advanced settings)
# These settings control how compression dictionaries are trained from sample data.

# Maximum size of a trained compression dictionary.
# Larger dictionaries may provide better compression but use more memory.
compression_dictionary_training_max_dictionary_size: 64KiB

# Maximum total size of sample data to collect for dictionary training.
# More sample data generally produces better dictionaries but takes longer to train.
# The recommended sample size is 100x the dictionary size.
compression_dictionary_training_max_total_sample_size: 10MiB

# Enable automatic dictionary training based on sampling of write operations.
# When enabled, the system will automatically collect samples and train new dictionaries.
# Manual training via nodetool is always available regardless of this setting.
compression_dictionary_training_auto_train_enabled: false

# Sampling rate for automatic dictionary training, expressed as a fraction (0.0-1.0).
# Value of 0.01 means 1% of writes are sampled. Lower values reduce overhead but may
# result in less representative sample data for dictionary training.
compression_dictionary_training_sampling_rate: 0.01
46 changes: 46 additions & 0 deletions conf/cassandra_latest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2529,3 +2529,49 @@ storage_compatibility_mode: NONE
# # especially in keyspaces with many tables. The splitter avoids batching tables together if they
# # exceed other configuration parameters like bytes_per_assignment or partitions_per_assignment.
# max_tables_per_assignment: 64

# Dictionary compression settings for ZSTD dictionary-based compression
# These settings control the automatic training and caching of compression dictionaries
# for tables that use ZSTD dictionary compression.

# How often to refresh compression dictionaries across the cluster.
# During refresh, nodes will check for newer dictionary versions and update their caches.
# Min unit: s
compression_dictionary_refresh_interval: 3600s

# Initial delay before starting the first dictionary refresh cycle after node startup.
# This prevents all nodes from refreshing simultaneously when the cluster starts.
# Min unit: s
compression_dictionary_refresh_initial_delay: 10s

# Maximum number of compression dictionaries to cache per table.
# Each table using dictionary compression can have multiple dictionaries cached
# (current version plus recently used versions for reading older SSTables).
compression_dictionary_cache_size: 10

# How long to keep compression dictionaries in the cache before they expire.
# Expired dictionaries will be removed from memory but can be reloaded if needed.
# Min unit: s
compression_dictionary_cache_expire: 24h

# Dictionary training configuration (advanced settings)
# These settings control how compression dictionaries are trained from sample data.

# Maximum size of a trained compression dictionary.
# Larger dictionaries may provide better compression but use more memory.
compression_dictionary_training_max_dictionary_size: 64KiB

# Maximum total size of sample data to collect for dictionary training.
# More sample data generally produces better dictionaries but takes longer to train.
# The recommended sample size is 100x the dictionary size.
compression_dictionary_training_max_total_sample_size: 10MiB

# Enable automatic dictionary training based on sampling of write operations.
# When enabled, the system will automatically collect samples and train new dictionaries.
# Manual training via nodetool is always available regardless of this setting.
compression_dictionary_training_auto_train_enabled: false

# Sampling rate for automatic dictionary training, expressed as a fraction (0.0-1.0).
# Value of 0.01 means 1% of writes are sampled. Lower values reduce overhead but may
# result in less representative sample data for dictionary training.
compression_dictionary_training_sampling_rate: 0.01
210 changes: 210 additions & 0 deletions doc/modules/cassandra/pages/managing/operating/compression.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ these areas (A is relatively good, F is relatively bad):

|https://facebook.github.io/zstd/[Zstd] |`ZstdCompressor` | A- | A- | A+ | `>= 4.0`

|https://facebook.github.io/zstd/[Zstd with Dictionary] |`ZstdDictionaryCompressor` | A- | A- | A++ | `>= 6.0`

|http://google.github.io/snappy/[Snappy] |`SnappyCompressor` | A- | A | C | `>= 1.0`

|https://zlib.net[Deflate (zlib)] |`DeflateCompressor` | C | C | A | `>= 1.0`
Expand All @@ -60,13 +62,101 @@ cycle spent. This is why it is the default choice in Cassandra.

For storage critical applications (disk footprint), however, `Zstd` may
be a better choice as it can get significant additional ratio to `LZ4`.
For workloads with highly repetitive or similar data patterns,
`ZstdDictionaryCompressor` can achieve even better compression ratios by
training a compression dictionary on representative data samples.

`Snappy` is kept for backwards compatibility and `LZ4` will typically be
preferable.

`Deflate` is kept for backwards compatibility and `Zstd` will typically
be preferable.

== ZSTD Dictionary Compression

The `ZstdDictionaryCompressor` extends standard ZSTD compression by using
trained compression dictionaries to achieve superior compression ratios,
particularly for workloads with repetitive or similar data patterns.

=== How Dictionary Compression Works

Dictionary compression improves upon standard compression by training a
compression dictionary on representative samples of your data. This
dictionary captures common patterns, repeated strings, and data structures,
allowing the compressor to reference these patterns more efficiently than
discovering them independently in each compression chunk.

=== When to Use Dictionary Compression

Dictionary compression is most effective for:

* *Tables with similar row structures*: JSON documents, XML data, or
repeated data schemas benefit significantly from dictionary compression.
* *Storage-critical workloads*: When disk space savings justify the
additional operational overhead of dictionary training and management.
* *Large datasets with repetitive patterns*: The more similar your data,
the better the compression ratio improvement.

Dictionary compression may not be ideal for:

* *Highly random or unique data*: Already-compressed data or cryptographic
data will see minimal benefit.
* *Small tables*: The overhead of dictionary management may outweigh the
storage savings.
* *Frequently changing schemas*: Schema changes may require retraining
dictionaries to maintain optimal compression ratios.

=== Dictionary Training

Before dictionary compression can provide optimal results, a compression
dictionary must be trained on representative data samples. Cassandra
supports both manual and automatic training approaches.

==== Manual Dictionary Training

Use the `nodetool traincompressiondictionary` command to manually train
a compression dictionary:

[source,bash]
----
nodetool traincompressiondictionary <keyspace> <table>
----

The command trains a dictionary by sampling from existing SSTables. If no
SSTables are available on disk (e.g., all data is in memtables), the command
will automatically flush the memtable before sampling.

The training process completes synchronously and displays progress information
including sample count, sample size, and elapsed time. Training typically
completes within minutes for most workloads.

==== Automatic Dictionary Training

Enable automatic training in `cassandra.yaml`:

[source,yaml]
----
compression_dictionary_training_auto_train_enabled: true
compression_dictionary_training_sampling_rate: 0.01 # 1% of writes
----

When enabled, Cassandra automatically samples write operations and
trains dictionaries in the background based on the configured sampling
rate (a fraction in the range 0.0-1.0, where 0.01 = 1% of writes).

=== Dictionary Storage and Distribution

Compression dictionaries are stored cluster-wide in the
`system_distributed.compression_dictionaries` table. Each table can
maintain multiple dictionary versions: the current dictionary for
compressing new SSTables, plus historical dictionaries needed for
reading older SSTables.

Dictionaries are identified by `dict_id`, with higher IDs representing
newer dictionaries. Cassandra automatically refreshes dictionaries
across the cluster based on configured intervals, and caches them
locally to minimize lookup overhead.

== Configuring Compression

Compression is configured on a per-table basis as an optional argument
Expand Down Expand Up @@ -105,6 +195,17 @@ should be used with caution, as they require more memory. The default of
`3` is a good choice for competing with `Deflate` ratios and `1` is a
good choice for competing with `LZ4`.

The `ZstdDictionaryCompressor` supports the same options as
`ZstdCompressor`:

* `compression_level` (default `3`): Same range and behavior as
`ZstdCompressor`. Dictionary compression provides improved ratios at
any compression level compared to standard ZSTD.

NOTE: `ZstdDictionaryCompressor` requires a trained compression
dictionary to achieve optimal results. See the ZSTD Dictionary
Compression section above for training instructions.

Users can set compression using the following syntax:

[source,cql]
Expand All @@ -121,6 +222,25 @@ ALTER TABLE keyspace.table
WITH compression = {'class': 'LZ4Compressor', 'chunk_length_in_kb': 64};
----

For dictionary compression:

[source,cql]
----
CREATE TABLE keyspace.table (id int PRIMARY KEY)
WITH compression = {'class': 'ZstdDictionaryCompressor'};
----

Or with a specific compression level:

[source,cql]
----
ALTER TABLE keyspace.table
WITH compression = {
'class': 'ZstdDictionaryCompressor',
'compression_level': '3'
};
----

Once enabled, compression can be disabled with `ALTER TABLE` setting
`enabled` to `false`:

Expand All @@ -140,6 +260,63 @@ immediately, the operator can trigger an SSTable rewrite using
`nodetool scrub` or `nodetool upgradesstables -a`, both of which will
rebuild the SSTables on disk, re-compressing the data in the process.

== Dictionary Compression Configuration

When using `ZstdDictionaryCompressor`, several additional configuration
options are available in `cassandra.yaml` to control dictionary
management, caching, and training behavior.

=== Dictionary Refresh Settings

* `compression_dictionary_refresh_interval` (default: `3600s`): How often
to check for and refresh compression dictionaries
cluster-wide. Newly trained dictionaries will be picked up by all nodes
within this interval.
* `compression_dictionary_refresh_initial_delay` (default: `10s`): Initial
delay before the first dictionary refresh check after node
startup.

=== Dictionary Caching

* `compression_dictionary_cache_size` (default: `10`): Maximum number of
compression dictionaries to cache per table. Higher values reduce lookup
overhead but increase memory usage.
* `compression_dictionary_cache_expire` (default: `24h`): Dictionary
cache entry TTL. Expired entries are evicted and reloaded on
next access.

=== Training Configuration

* `compression_dictionary_training_max_dictionary_size` (default: `64KiB`):
Maximum size of trained dictionaries. Larger dictionaries can
capture more patterns but increase memory overhead.
* `compression_dictionary_training_max_total_sample_size` (default:
`10MiB`): Maximum total size of sample data to collect for training.
* `compression_dictionary_training_auto_train_enabled` (default: `false`):
Enable automatic background dictionary training. When enabled, Cassandra
samples writes and trains dictionaries automatically.
* `compression_dictionary_training_sampling_rate` (default: `0.01`):
Sampling rate for automatic training, a fraction in the range 0.0-1.0
where 0.01 = 1% of writes. Lower values reduce training overhead but
may miss data patterns.

Example configuration:

[source,yaml]
----
# Dictionary refresh and caching
compression_dictionary_refresh_interval: 3600s
compression_dictionary_refresh_initial_delay: 10s
compression_dictionary_cache_size: 10
compression_dictionary_cache_expire: 24h

# Automatic training
compression_dictionary_training_auto_train_enabled: false
compression_dictionary_training_sampling_rate: 0.01
compression_dictionary_training_max_dictionary_size: 64KiB
compression_dictionary_training_max_total_sample_size: 10MiB
----

== Other options

* `crc_check_chance` (default: `1.0`): determines how likely Cassandra
Expand Down Expand Up @@ -186,6 +363,39 @@ correctness of data on disk, compressed tables allow the user to set
probabilistically validate chunks on read to verify bits on disk are not
corrupt.

=== Dictionary Compression Operational Considerations

When using `ZstdDictionaryCompressor`, additional operational factors
apply:

* *Dictionary Storage*: Compression dictionaries are stored in the
`system_distributed.compression_dictionaries` table and replicated
cluster-wide. Each table maintains current and historical dictionary
versions.
* *Dictionary Cache Memory*: Dictionaries are cached locally on each node
according to `compression_dictionary_cache_size`. Memory overhead is
typically minimal (default 64KB per dictionary × cache size).
* *Dictionary Training Overhead*: Manual training via
`nodetool traincompressiondictionary` samples SSTable chunk data and
performs CPU-intensive dictionary training. Consider running training
during off-peak hours.
* *Automatic Training Impact*: When
`compression_dictionary_training_auto_train_enabled` is true, write
operations are sampled based on `compression_dictionary_training_sampling_rate`.
This adds minimal overhead but should be monitored in write-intensive
workloads.
* *Dictionary Refresh*: The dictionary refresh process
(`compression_dictionary_refresh_interval`) checks for new dictionaries
cluster-wide. The default 1-hour interval balances freshness with
overhead.
* *SSTable Compatibility*: Each SSTable is compressed with a specific
dictionary version. Historical dictionaries must be retained to read
older SSTables until they are compacted with new dictionaries.
* *Schema Changes*: Significant schema changes or data pattern shifts may
require retraining dictionaries to maintain optimal compression ratios.
Monitor the `SSTable Compression Ratio` via `nodetool tablestats` to
detect degradation.

== Advanced Use

Advanced users can provide their own compression class by implementing
Expand Down
1 change: 1 addition & 0 deletions pylib/cqlshlib/cqlhandling.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ class CqlParsingRuleSet(pylexotron.ParsingRuleSet):
'SnappyCompressor',
'LZ4Compressor',
'ZstdCompressor',
'ZstdDictionaryCompressor'
)

available_compaction_classes = (
Expand Down
11 changes: 11 additions & 0 deletions src/java/org/apache/cassandra/config/Config.java
Original file line number Diff line number Diff line change
Expand Up @@ -514,6 +514,17 @@ public static class SSTableConfig
public volatile DurationSpec.IntSecondsBound counter_cache_save_period = new DurationSpec.IntSecondsBound("7200s");
public volatile int counter_cache_keys_to_save = Integer.MAX_VALUE;

public volatile DurationSpec.IntSecondsBound compression_dictionary_refresh_interval = new DurationSpec.IntSecondsBound("3600s"); // 1 hour - TODO: re-assess whether daily (86400s) is more appropriate
public volatile DurationSpec.IntSecondsBound compression_dictionary_refresh_initial_delay = new DurationSpec.IntSecondsBound("10s"); // 10 seconds default
public volatile int compression_dictionary_cache_size = 10; // max dictionaries per table
public volatile DurationSpec.IntSecondsBound compression_dictionary_cache_expire = new DurationSpec.IntSecondsBound("24h");

// Dictionary training settings
public volatile DataStorageSpec.IntKibibytesBound compression_dictionary_training_max_dictionary_size = new DataStorageSpec.IntKibibytesBound("64KiB");
public volatile DataStorageSpec.IntKibibytesBound compression_dictionary_training_max_total_sample_size = new DataStorageSpec.IntKibibytesBound("10MiB");
public volatile boolean compression_dictionary_training_auto_train_enabled = false;
public volatile float compression_dictionary_training_sampling_rate = 0.01f; // samples 1%

public DataStorageSpec.LongMebibytesBound paxos_cache_size = null;

public DataStorageSpec.LongMebibytesBound consensus_migration_cache_size = null;
Expand Down
Loading