dsq can be configured via TOML or YAML configuration files to customize default behavior.
Configuration files are searched in the following order:
- Current directory:
  `dsq.toml`, `.dsq.toml`, `dsq.yaml`, `.dsq.yaml`
- Home directory:
  `~/.config/dsq/config.toml`, `~/.config/dsq/config.yaml`, `~/.dsq.toml`, `~/.dsq.yaml`
- System directory:
  `/etc/dsq/config.toml`, `/etc/dsq/config.yaml`
The first found file is used. You can also specify a custom config file with --config.
# dsq.toml
[filter]
lazy_evaluation = true
dataframe_optimizations = true
optimization_level = "advanced" # none, basic, advanced
max_recursion_depth = 100
strict_mode = false
[performance]
batch_size = 10000
threads = 0 # 0 = auto-detect
parallel = true
memory_limit = "4GB" # Optional: "1GB", "500MB", etc.
[formats.csv]
separator = ","
has_header = true
quote_char = "\""
comment_char = "#"
null_values = ["NULL", ""]
infer_schema_length = 1000
trim_whitespace = true
encoding = "utf8"
[formats.parquet]
compression = "snappy" # snappy, gzip, lzo, brotli, lz4, zstd
statistics = true
row_group_size = 50000
use_dictionary = true
parallel_read = true
[formats.json]
pretty = true
indent = 2
date_format = "%Y-%m-%d"
[formats.jsonl]
buffer_size = 8192
[display]
color.enabled = true
color.auto_detect = true
compact = false
raw_output = false
sort_keys = false
indent = 2
max_display_rows = 100
[output]
default_format = "json" # json, csv, parquet, etc.
overwrite = false
include_header = true

# dsq.yaml
filter:
lazy_evaluation: true
dataframe_optimizations: true
optimization_level: advanced
max_recursion_depth: 100
strict_mode: false
performance:
batch_size: 10000
threads: 0
parallel: true
memory_limit: 4GB
formats:
csv:
separator: ","
has_header: true
quote_char: "\""
comment_char: "#"
null_values: ["NULL", ""]
infer_schema_length: 1000
trim_whitespace: true
encoding: utf8
parquet:
compression: snappy
statistics: true
row_group_size: 50000
use_dictionary: true
parallel_read: true
json:
pretty: true
indent: 2
date_format: "%Y-%m-%d"
display:
color:
enabled: true
auto_detect: true
compact: false
raw_output: false
sort_keys: false
indent: 2
max_display_rows: 100
output:
default_format: json
overwrite: false
include_header: true

Controls filter compilation and execution behavior.
[filter]
lazy_evaluation = true # Enable lazy evaluation
dataframe_optimizations = true # Enable DataFrame-specific optimizations
optimization_level = "advanced" # Optimization level: none, basic, advanced
max_recursion_depth = 100 # Maximum recursion depth
strict_mode = false # Strict error handling
collect_stats = false # Collect execution statistics

Options:
- `lazy_evaluation` - Defer execution until needed (default: `false`)
- `dataframe_optimizations` - Apply DataFrame optimizations (default: `true`)
- `optimization_level` - `"none"`, `"basic"`, `"advanced"` (default: `"basic"`)
- `max_recursion_depth` - Recursion limit (default: `100`)
- `strict_mode` - Fail on type errors vs. coerce (default: `false`)
- `collect_stats` - Gather execution statistics (default: `false`)
Controls execution performance and resource usage.
[performance]
batch_size = 10000 # Rows per batch
threads = 0 # Thread count (0 = auto)
parallel = true # Enable parallel processing
memory_limit = "4GB" # Optional memory limit
streaming_threshold = 1000000 # Switch to streaming above this

Options:
- `batch_size` - Rows to process in each batch (default: `10000`)
- `threads` - Number of threads, 0 for auto-detection (default: `0`)
- `parallel` - Enable parallel processing (default: `true`)
- `memory_limit` - Maximum memory usage (optional)
- `streaming_threshold` - Row count to trigger streaming (default: `1000000`)
[formats.csv]
separator = ","
has_header = true
quote_char = "\""
comment_char = "#"
null_values = ["NULL", "", "N/A"]
infer_schema_length = 1000
trim_whitespace = true
encoding = "utf8"
skip_rows = 0
skip_rows_after_header = 0

Options:
- `separator` - Field delimiter (default: `","`)
- `has_header` - First row is header (default: `true`)
- `quote_char` - Quote character (default: `"\""`)
- `comment_char` - Comment line prefix (default: `"#"`)
- `null_values` - Strings treated as null (default: `["NULL", ""]`)
- `infer_schema_length` - Rows to scan for schema (default: `1000`)
- `trim_whitespace` - Trim field whitespace (default: `false`)
- `encoding` - Character encoding (default: `"utf8"`)
- `skip_rows` - Skip N rows at start (default: `0`)
- `skip_rows_after_header` - Skip N rows after header (default: `0`)
[formats.parquet]
compression = "snappy"
statistics = true
row_group_size = 50000
use_dictionary = true
parallel_read = true
memory_map = false

Options:
- `compression` - Compression algorithm: `"snappy"`, `"gzip"`, `"lzo"`, `"brotli"`, `"lz4"`, `"zstd"`, `"none"` (default: `"snappy"`)
- `statistics` - Generate column statistics (default: `true`)
- `row_group_size` - Rows per row group (default: `50000`)
- `use_dictionary` - Use dictionary encoding (default: `true`)
- `parallel_read` - Parallel reading (default: `true`)
- `memory_map` - Use memory mapping (default: `false`)
[formats.json]
pretty = true
indent = 2
date_format = "%Y-%m-%d"

Options:
- `pretty` - Pretty-print output (default: `true`)
- `indent` - Indentation spaces (default: `2`)
- `date_format` - Date formatting string (default: ISO 8601)
Controls output appearance in terminal.
[display]
color.enabled = true
color.auto_detect = true
compact = false
raw_output = false
sort_keys = false
indent = 2
max_display_rows = 100

Options:
- `color.enabled` - Enable colored output (default: `true`)
- `color.auto_detect` - Auto-detect terminal color support (default: `true`)
- `compact` - Compact output without whitespace (default: `false`)
- `raw_output` - Output raw strings without quotes (default: `false`)
- `sort_keys` - Sort object keys (default: `false`)
- `indent` - Indentation spaces (default: `2`)
- `max_display_rows` - Maximum rows to display (default: `100`)
Default output behavior.
[output]
default_format = "json"
overwrite = false
include_header = true

Options:
- `default_format` - Default output format (default: `"json"`)
- `overwrite` - Overwrite existing files (default: `false`)
- `include_header` - Include headers in output (default: `true`)
# Show current configuration
dsq config show
# Get specific value
dsq config get filter.lazy_evaluation
# Set value
dsq config set filter.lazy_evaluation true
dsq config set formats.csv.separator ";"
dsq config set performance.threads 4
# Create default config file
dsq config init
# Create in specific location
dsq config init --path ~/.config/dsq/config.toml
# Force overwrite existing config
dsq config init --force

# Use specific config file
dsq --config my-config.toml '.' data.csv
# Override config settings
dsq --threads 8 --lazy '.' data.csv

Some settings can be overridden with environment variables:
# Override thread count
DSQ_THREADS=8 dsq '.' data.csv
# Disable color
DSQ_COLOR=false dsq '.' data.csv
# Set memory limit
DSQ_MEMORY_LIMIT=2GB dsq '.' large.parquet

For processing large datasets quickly:
[filter]
lazy_evaluation = true
dataframe_optimizations = true
optimization_level = "advanced"
[performance]
batch_size = 50000
threads = 0 # Use all cores
parallel = true
streaming_threshold = 500000
[formats.parquet]
compression = "lz4" # Fast compression
parallel_read = true

For minimizing output file sizes:
[formats.parquet]
compression = "zstd"
statistics = true
use_dictionary = true
[formats.csv]
# No compression for CSV, use gzip externally

For interactive development:
[filter]
strict_mode = true
collect_stats = true
[display]
color.enabled = true
compact = false
indent = 2
max_display_rows = 50
[performance]
batch_size = 1000 # Smaller batches for responsiveness

For production data pipelines:
[filter]
lazy_evaluation = true
dataframe_optimizations = true
optimization_level = "advanced"
strict_mode = true # Fail fast on errors
[performance]
batch_size = 100000
threads = 0
parallel = true
memory_limit = "8GB"
[output]
overwrite = false # Prevent accidental overwrites
[formats.parquet]
compression = "snappy" # Good balance
statistics = true
row_group_size = 100000

Settings are applied in this order (later overrides earlier):
- Default values
- System config (`/etc/dsq/`)
- Home directory config (`~/.config/dsq/`)
- Current directory config (`./dsq.toml`)
- Custom config (`--config`)
- Environment variables
- Command-line flags
Example:
# Config file sets threads=4
# Environment variable overrides to 8
DSQ_THREADS=8 dsq '.' data.csv
# Command-line flag overrides everything
dsq --threads 16 '.' data.csv

Check configuration validity:
# Show resolved configuration
dsq config show
# Validate config file
dsq config validate dsq.toml
# Test with specific config
dsq --config test.toml --explain '.' data.csv

- Use project-specific configs - Keep `dsq.toml` in project directories
- Global defaults in home - Set user preferences in `~/.config/dsq/config.toml`
- Version control configs - Commit project configs to git
- Document custom settings - Add comments to explain non-default values
- Test before deploying - Validate configs with sample data first
When upgrading dsq versions, check for configuration changes:
# Backup current config
cp ~/.config/dsq/config.toml ~/.config/dsq/config.toml.bak
# Generate new default config
dsq config init --force
# Merge custom settings back