-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconfig.yaml
More file actions
51 lines (40 loc) · 1.41 KB
/
config.yaml
File metadata and controls
51 lines (40 loc) · 1.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
---
# ABP Pipeline Configuration
# All paths are relative to this config file's directory unless absolute
paths:
  # Base working directory for all data
  work_dir: ./data
  # Downloaded zip files from OS
  downloads_dir: ./data/downloads
  # Extracted CSV files from zip
  extracted_dir: ./data/extracted
  # Intermediate parquet files (one per record type)
  parquet_dir: ./data/parquet
  # Final output parquet files
  output_dir: ./data/output

# OS Data Hub download settings
# Full supply
# os_downloads:
#   # Data package ID from OS Data Hub
#   package_id: "0040204651"
#   # Version ID (update this when new data is released)
#   version_id: "6758807"

# Small subset for testing
os_downloads:
  # Data package ID from OS Data Hub
  package_id: "0040206240"
  # Version ID (update this when new data is released)
  version_id: "6777574"

# Processing options
processing:
  # Parquet compression settings
  parquet_compression: zstd
  parquet_compression_level: 9
  # DuckDB memory limit (optional)
  # If set, limits how much RAM DuckDB can use (e.g., '4GB', '500MB')
  # Useful for testing on low-memory machines
  # If not set, DuckDB uses its default (80% of physical RAM)
  # duckdb_memory_limit: "8GB"
  # Number of chunks to split flatfile processing into (default: 1)
  # Use higher values (e.g., 10) for lower memory usage on laptops
  # Output will be split into multiple parquet files, one per chunk
  num_chunks: 20