-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconfig.yaml
More file actions
51 lines (40 loc) · 1.41 KB
/
config.yaml
File metadata and controls
51 lines (40 loc) · 1.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
---
# ABP Pipeline Configuration
# All paths are relative to this config file's directory unless absolute
paths:
  # Base working directory for all data
  work_dir: ./data
  # Downloaded zip files from OS
  downloads_dir: ./data/downloads
  # Extracted CSV files from zip
  extracted_dir: ./data/extracted
  # Intermediate parquet files (one per record type)
  parquet_dir: ./data/parquet
  # Final output parquet files
  output_dir: ./data/output

# OS Data Hub download settings
# Full supply
# os_downloads:
#   # Data package ID from OS Data Hub
#   package_id: "0040204651"
#   # Version ID (update this when new data is released)
#   version_id: "6758807"

# Small subset for testing
os_downloads:
  # Data package ID from OS Data Hub
  package_id: "0040206240"
  # Version ID (update this when new data is released)
  version_id: "6777574"

# Processing options
processing:
  # Parquet compression settings
  parquet_compression: zstd
  parquet_compression_level: 9
  # DuckDB memory limit (optional)
  # If set, limits how much RAM DuckDB can use (e.g., '4GB', '500MB')
  # Useful for testing on low-memory machines
  # If not set, DuckDB uses its default (80% of physical RAM)
  # duckdb_memory_limit: "8GB"
  # Number of chunks to split flatfile processing into (default: 1)
  # Use higher values (e.g., 10) for lower memory usage on laptops
  # Output will be split into multiple parquet files, one per chunk
  num_chunks: 20