ONSdigital
diff --git a/‎config/test_configs/test_dev_config.yaml‎
Lines changed: 241 additions & 0 deletions b/‎config/test_configs/test_dev_config.yaml‎
Lines changed: 241 additions & 0 deletions
diff --git a/‎config/test_configs/test_user_config.yaml‎
Lines changed: 140 additions & 0 deletions b/‎config/test_configs/test_user_config.yaml‎
Lines changed: 140 additions & 0 deletions
diff --git a/‎main.py‎
Lines changed: 2 additions & 2 deletions b/‎main.py‎
Lines changed: 2 additions & 2 deletions
@@ -0,0 +1,241 @@
+config_validation:
+  validate: True
+  path: src/dev_config_schema.yaml
+dev_global:
+  # Logging settings
+  logging_level: "DEBUG"
+  # Environment settings
+  dev_test: False
+  platform: network  # where to load data from: hdfs, network (Windows) or s3 (CDP)
+  load_from_feather: True
+runlog_writer:
+  write_csv: True # Write the runlog to a CSV file
+  write_hdf5: False # Write the runlog to an HDF5 file
+  write_sql: False # Write the runlog to a SQL database
+  display: False # Display the runlog in the terminal
+  log_path: "/bat/res_dev/project_data/logs"
+s3_paths:
+  logs_foldername: "/bat/res_dev/project_data/logs/run_logs"
+staging_paths:
+  folder: "01_staging"
+  feather_output: "feather"
+  staging_output_path: "staging_qa/full_responses_qa"
+  pcode_val_path: "staging_qa/postcode_validation"
+freezing_paths:
+  folder: "02_freezing"
+  frozen_data_staged_output_path: "frozen_data_staged"
+  frozen_data_staged_path: "frozen_data_staged"
+  freezing_changes_to_review_path: "changes_to_review"
+  freezing_amendments_path: "freezing_updates"
+  freezing_additions_path: "freezing_updates"
+ni_paths:
+  folder: "03_northern_ireland"
+  ni_staging_output_path: "ni_staging_qa"
+construction_paths:
+  folder: "04_construction"
+  qa_path: "construction_qa"
+mapping_paths:
+  folder: "05_mapping"
+  qa_path: "mapping_qa"
+imputation_paths:
+  folder: "06_imputation"
+  qa_path: "imputation_qa"
+  manual_trimming_path: "manual_trimming"
+  backdata_out_path: "backdata_output"
+outliers_paths:
+  folder: "07_outliers"
+  qa_path: "outliers_qa"
+  auto_outliers_path: "auto_outliers"
+estimation_paths:
+  folder: "08_estimation"
+  qa_path: "estimation_qa"
+apportionment_paths:
+  folder: "09_apportionment"
+  qa_path: "apportionment_qa"
+outputs_paths:
+  folder: "10_outputs"
+  # TODO: add all the output subpaths
+  outputs_master: ""
+pnp_paths:
+  staging_qa_path: "01_staging/pnp_staging_qa"
+export_paths:
+  export_folder: "outgoing_export"
+network_paths:
+  root: "R:/BERD Results System Development 2023/DAP_emulation/"
+  logs_foldername: "logs/run_logs"
+  # snapshot_path: "R:/BERD Results System Development 2023/DAP_emulation/spp_snapshots/2023_snapshots/snapshot-202312-002-b9b6048a-51c9-4669-919a-e92fc6e9c433.json"
+  snapshot_path: "R:/BERD Results System Development 2023/DAP_emulation/spp_snapshots/2023_snapshots/snapshot-202312-002-85ae5659-7147-42c3-a5dd-d69beccc9e09.json"
+  updated_snapshot_path: "R:/BERD Results System Development 2023/DAP_emulation/spp_snapshots/2023_snapshots/snapshot-202312-002-b9b6048a-51c9-4669-919a-e92fc6e9c433.json"
+  ni_full_responses_path:  "R:/BERD Results System Development 2023/DAP_emulation/2023_surveys/BERD/03_northern_ireland/2023/ONS_Data_RD2022_Revised_Dataset_Weighted_Unrounded_NISRA.csv"
+  # 2022 paths
+  # snapshot_path: "R:/BERD Results System Development 2023/DAP_emulation/spp_snapshots/2022_snapshots/snapshot-202212-002-83b5bacd-7c99-45cf-b989-d43d762dd054.json"
+  # updated_snapshot_path: "R:/BERD Results System Development 2023/DAP_emulation/spp_snapshots/2022_snapshots/snapshot-202212-002-83b5bacd-7c99-45cf-b989-d43d762dd054.json"
+  # Freezing data paths
+  frozen_data_staged_output_path: "02_freezing/frozen_data_staged/"
+  berd_frozen_data_staged_path: "02_freezing/frozen_data_staged/2023_FROZEN_staged_full_responses_25-04-29_v104.csv"
+  pnp_frozen_data_staged_path: "02_freezing/frozen_data_staged/PNP_2023_FROZEN_staged_full_responses_25-01-29_v598.csv"
+  freezing_changes_to_review_path: "02_freezing/changes_to_review/"
+  freezing_additions_path: "02_freezing/freezing_updates/2023_freezing_additions_to_review_25-04-29_v108_all_true.csv"
+  freezing_deletions_path: "02_freezing/freezing_updates/2023_freezing_deletions_to_review_25-04-29_v108_all_true.csv"
+  freezing_amendments_path: "02_freezing/freezing_updates/2023_freezing_amendments_to_review_25-04-29_v108_all_true.csv"
+  # Imputation and outliers input paths
+  # backdata_path: "R:/BERD Results System Development 2023/DAP_emulation/2021_surveys/BERD/06_imputation/backdata_output/2021_backdata_oct_24.csv"
+  backdata_path: "R:/BERD Results System Development 2023/DAP_emulation/2022_surveys/BERD/06_imputation/backdata_output/2022_backdata_published_v347.csv"
+  pnp_backdata_path: "R:/BERD Results System Development 2023/DAP_emulation/2021_surveys/PNP/06_imputation/backdata_output/PNP_2021_backdata_with_pg.csv"
+  manual_imp_trim_path: "06_imputation/manual_trimming/2023_manual_trimming_v1.csv"
+  manual_outliers_path: "07_outliers/manual_outliers/2023_manual_outliers_v1.csv"
+  # Construction paths
+  all_data_construction_file_path: "04_construction/manual_construction/2023_test_construction_file_v3.csv"
+  postcode_construction_file_path: "04_construction/manual_construction/2023_test_postcode_construction_file.csv"
+  construction_file_path_ni: "04_construction/manual_construction/test_construction_ni_file.csv"
+  # postcode paths
+  postcode_masterlist: "R:/BERD Results System Development 2023/DAP_emulation/ONS_Postcode_Reference/postcodes_pcd2_itl.csv"
+  pcode_val_path: "01_staging/staging_qa/postcode_validation"
+# schema paths
+schema_paths:
+  manual_trimming_schema: "config/output_schemas/manual_trimming_qa_schema.toml"
+  short_form_schema: "config/output_schemas/short_form_schema.toml"
+  long_form_schema: "config/output_schemas/long_form_schema.toml"
+  tau_schema: "config/output_schemas/tau_schema.toml"
+  gb_sas_schema: "config/output_schemas/gb_sas_schema.toml"
+  ni_sas_schema: "config/output_schemas/ni_sas_schema.toml"
+  intram_by_pg_gb_schema: "config/output_schemas/intram_by_pg_gb_schema.toml"
+  intram_by_pg_uk_schema: "config/output_schemas/intram_by_pg_uk_schema.toml"
+  intram_gb_itl1_schema: "config/output_schemas/intram_gb_itl1_schema.toml"
+  intram_gb_itl2_schema: "config/output_schemas/intram_gb_itl2_schema.toml"
+  intram_uk_itl1_schema: "config/output_schemas/intram_uk_itl1_schema.toml"
+  intram_uk_itl2_schema: "config/output_schemas/intram_uk_itl2_schema.toml"
+  intram_by_sic_schema: "config/output_schemas/intram_by_sic_schema.toml"
+  status_filtered_qa_schema: "config/output_schemas/status_filtered_qa_schema.toml"
+  fte_total_qa_schema: "config/output_schemas/fte_total_qa_schema.toml"
+  frozen_group_schema: "config/output_schemas/frozen_group_schema.toml"
+  full_estimation_qa_schema: "config/output_schemas/full_estimation_qa_schema.toml"
+  full_responses_imputed_schema: "config/output_schemas/full_responses_imputed_schema.toml"
+  staged_full_responses_schema: "config/output_schemas/staged_full_responses_schema.toml"
+  invalid_unrecognised_postcodes_schema: "config/output_schemas/invalid_unrecognised_postcodes_schema.toml"
+  full_responses_mapped_schema: "config/output_schemas/full_responses_mapped_schema.toml"
+  pnp_national_accounts_schema: "config/output_schemas/pnp_national_accounts_schema.toml"
+
+# Export config for users
+mappers:
+  geo_cols: ["ITL221CD", "ITL221NM", "ITL121CD", "ITL121NM"]
+  gb_itl: "LAU121CD"
+  ni_itl: "N92000002"
+outliers:
+  flag_cols: ["701", "702", "703", "704", "705", "706", "707"] # NOT for user config. Columns to flag for outliers.
+devtest:
+  seltype_list: [1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15, 17, 18, 19, 21, 22, 23, 25, 26, 27, 29, 30, 31, 33, 34, 35, 37, 38, 39]
+log_filenames:
+  main: "main_runlog.csv"
+  configs: "configs_runlog.csv"
+  logs: "logs_runlog.csv"
+run_log_sql:
+  log_db: "test_runlog"
+  log_mode: "append"
+estimation:
+  numeric_cols: ["701", "702", "703", "704", "705", "706", "707", "709", "710", "711"]
+imputation:
+  lf_target_vars:
+    - "211"
+    - "305"
+    - "emp_researcher"
+    - "emp_technician"
+    - "emp_other"
+    - "headcount_res_m"
+    - "headcount_res_f"
+    - "headcount_tec_m"
+    - "headcount_tec_f"
+    - "headcount_oth_m"
+    - "headcount_oth_f"
+  sum_cols:
+    - "emp_total"
+    - "headcount_tot_m"
+    - "headcount_tot_f"
+    - "headcount_total"
+breakdowns:
+  "211":
+    - "202"
+    - "203"
+    - "204"
+    - "205"
+    - "206"
+    - "207"
+    - "209"
+    - "210"
+    - "212"
+    - "214"
+    - "216"
+    - "218"
+    - "219"
+    - "220"
+    - "221"
+    - "222"
+    - "223"
+    - "225"
+    - "226"
+    - "227"
+    - "228"
+    - "229"
+    - "237"
+    - "242"
+    - "243"
+    - "244"
+    - "245"
+    - "246"
+    - "247"
+    - "248"
+    - "249"
+    - "250"
+  "305":
+    - "302"
+    - "303"
+    - "304"
+  emp_total:
+    - "emp_researcher"
+    - "emp_technician"
+    - "emp_other"
+  headcount_total:
+    - "headcount_res_m"
+    - "headcount_res_f"
+    - "headcount_tec_m"
+    - "headcount_tec_f"
+    - "headcount_oth_m"
+    - "headcount_oth_f"
+consistency_checks:
+  2xx_totals:
+    purchases_split: ["222", "223", "203"]
+    sal_oth_expend: ["202", "203", "204"]
+    research_expend: ["205", "206", "207", "204"]
+    capex: ["219", "220", "209", "210"]
+    intram: ["204", "210", "211"]
+    funding: ["212", "214", "216", "242", "250", "243", "244", "245", "246", "247", "248", "249", "218"]
+    ownership: ["225", "226", "227", "228", "229", "237", "218"]
+    equality: ["211", "218"]
+    inequality: ["221"]
+  3xx_totals:
+    purchases: ["302", "303", "304", "305"]
+  4xx_totals:
+    emp_civil: ["405", "407", "409", "411"]
+    emp_defence: ["406", "408", "410", "412"]
+  5xx_totals:
+    headcount_tot_m: ["501", "503", "505", "507"]
+    headcount_tot_f: ["502", "504", "506", "508"]
+  emp_xx_totals:
+    employment: ["emp_researcher", "emp_technician", "emp_other", "emp_total"]
+  hc_xx_totals:
+    headcount_tot_m: ["headcount_res_m", "headcount_tec_m", "headcount_oth_m", "headcount_tot_m"]
+    headcount_tot_f: ["headcount_res_f", "headcount_tec_f", "headcount_oth_f", "headcount_tot_f"]
+    headcount_total: ["headcount_tot_m", "headcount_tot_f", "headcount_total"]
+  6xx_totals:
+    site_percentage: ["602"]
+  7xx_a_totals:
+    sf_expend: ["701", "702", "709"]
+    sf_purchases: ["703", "704", "710"]
+  7xx_b_totals:
+    sf_fte: ["706", "707", "711"]
+    sf_headcount: ["705"]
+
+s3:
+  ssl_file: "/etc/pki/tls/certs/ca-bundle.crt"
+  s3_bucket: "onscdp-dev-data01-5320d6ca"
+  #s3_bucket: "onscdp-mig-data01-0221a8af"
@@ -0,0 +1,140 @@
+config_validation:
+  validate: True
+  path: src/user_config_schema.yaml
+survey:
+  survey_type: "BERD"
+  survey_year: 2023
+global:
+  # Staging and validation settings
+  postcode_csv_check: True
+  use_backdata: True
+  load_ni_data: True
+  # Pipeline run type
+  run_with_snapshot: True  # Run until point of freezing
+  run_with_frozen_data: False  # Run with specified frozen csv
+  # Freezing settings
+  run_with_snapshot_and_freeze: False  # Last run with frozen snapshot - produces frozen csv
+  load_updated_snapshot_for_comparison: False  # Run with new json snapshot, and compare to specified frozen csv
+  run_updates_and_freeze: False  # Add specified changes to frozen csv, and refreeze
+  # Construction Settings
+  run_all_data_construction: True
+  run_postcode_construction: True
+  run_ni_construction: False
+  load_manual_outliers: True
+  load_manual_imputation: True
+  # Backdata output settings
+  output_backdata: False
+  # QA output settings
+  output_full_responses: False
+  output_ni_full_responses: False
+  output_mapping_qa: False
+  output_mapping_ni_qa: False
+  output_imputation_qa: False
+  output_auto_outliers: False
+  output_outlier_qa: False
+  output_estimation_qa: False
+  output_apportionment_qa: False
+  # Final output settings
+  output_long_form: False
+  output_short_form: False
+  output_gb_sas: False
+  output_ni_sas: False
+  output_tau: True
+  output_intram_by_pg_gb: False
+  output_intram_by_pg_uk: False
+  output_intram_gb_itl: False
+  output_intram_uk_itl: False
+  output_intram_by_civil_defence: False
+  output_intram_by_sic: False
+  output_fte_total_qa: False
+  output_status_filtered: False
+  output_frozen_group: False
+  output_intram_totals: False
+  output_pnp_na: False
+s3_paths:
+  root: "/bat/res_dev/project_data/"
+  # staging input paths
+  snapshot_path: "/bat/res_dev/anonymised/snapshot_202312_a_copy_of_202012_anon.json" # Run until point of freezing - last run is first freeze
+  updated_snapshot_path: "/bat/res_dev/anonymised/snapshot_202312_a_copy_of_202012_anon.json"
+  ni_full_responses_path: "03_northern_ireland/2021/TEST_ni.csv"
+  feather_path: "staging/feather"
+  # Freezing data paths
+  frozen_data_staged_output_path: "02_freezing/frozen_data_staged" # path for folder
+  berd_frozen_data_staged_path: "02_freezing/frozen_data_staged/2023_FROZEN_staged_full_responses_24-08-07_v906.csv" # pragma: allowlist secret
+  pnp_frozen_data_staged_path: "02_freezing/frozen_data_staged/2023_FROZEN_staged_full_responses_24-08-07_v906.csv" # pragma: allowlist secret
+  freezing_changes_to_review_path: "02_freezing/changes_to_review"
+  freezing_additions_path: "02_freezing/freezing_updates"
+  freezing_amendments_path: "02_freezing/freezing_updates"
+  # Imputation and outliers input paths
+  backdata_path: "/bat/res_dev/project_data/2021_surveys/BERD/06_imputation/backdata_output/2021_backdata_published_v347_anon.csv"
+  manual_imp_trim_path: "06_imputation/manual_trimming/trimming_qa_2023-11-27_v359.csv"
+  manual_outliers_path: "07_outliers/manual_outliers/manual_outlier_2023-08-29_v67.csv"
+  # Construction paths
+  all_data_construction_file_path:  "04_construction/manual_construction/test_construction_file - Copy.csv"
+  postcode_construction_file_path: "04_construction/manual_construction/test_postcode_construction_file.csv"
+  construction_file_path_ni:  "04_construction/manual_construction/test_construction_file.csv"
+  # postcode paths
+  postcode_masterlist: "/bat/res_dev/mappers/ONSPD_NOV_2022_UK_first_100.csv"
+  pcode_val_path: "01_staging/staging_qa/postcode_validation"
+
+# mapper paths
+# (one section of mapper file names per survey year)
+2022_mappers:
+  mappers_version: "v1"
+  postcode_mapper: "postcodes_2022.csv"
+  itl_mapper_path: "itl_2022.csv"
+  ultfoc_mapper_path: "BERD_2022_ultfoc.csv"
+  ref_list_817_mapper_path: "BERD_2022_ref_list.csv"
+  cellno_path: "berd_2022_cellno_coverage.csv"
+  pg_num_alpha_mapper_path: "pg_num_alpha_2022.csv"
+  sic_pg_alpha_mapper_path: "sic_pg_alpha_2022.csv"
+  sic_pg_num_mapper_path: "sic_pg_num_2022.csv"
+  pg_detailed_mapper_path: "pg_detailed_2022.csv"
+  sic_division_detailed_mapper_path: "sic_div_detailed_2022.csv"
+2023_mappers:
+  mappers_version: "v3"
+  postcode_mapper: "postcodes_2023.csv"
+  itl_mapper_path: "itl_2023.csv"
+  ultfoc_mapper_path: "BERD_2023_ultfoc.csv"
+  cellno_path: "BERD_2023_cellno_coverage.csv"
+  pg_num_alpha_mapper_path: "pg_num_alpha_2023.csv"
+  sic_pg_alpha_mapper_path: "sic_pg_alpha_2023.csv"
+  sic_pg_num_mapper_path: "sic_pg_num_2023.csv"
+  pg_detailed_mapper_path: "pg_detailed_2023.csv"
+  sic_division_detailed_mapper_path: "sic_div_detailed_2023.csv"
+
+# outliers and imputation settings
+outliers:
+  upper_clip: 0.05  # enter percentage as a decimal (float) - default is 0.05
+  lower_clip: 0.0  # enter percentage as a decimal (float) - default is 0.0
+imputation:
+  lower_trim_perc: 15
+  upper_trim_perc: 15
+  trim_threshold: 10 # trimming will only occur on classes strictly larger than this value
+  sf_expansion_threshold: 3 # default is 3: the minimum viable imputation class size for short form imputation
+  mor_threshold: 3 # default is 3: the minimum viable imputation class size for MoR imputation
+# export settings. NOTE(review): YAML parses a bare None as the string "None", not null - confirm intended
+export_choices:
+  copy_or_move_files: "copy"
+  export_short_form: None
+  export_long_form: None
+  export_tau: None
+  export_gb_sas: "PNP_2023_output_gb_sas_24-11-19_v914.csv"
+  export_ni_sas: None
+  export_intram_by_pg_gb: None
+  export_intram_by_pg_uk: None
+  export_intram_gb_itl1: None
+  export_intram_uk_itl1: None
+  export_intram_gb_itl2: None
+  export_intram_uk_itl2: None
+  export_intram_by_sic: None
+  export_fte_total_qa: None
+  export_status_filtered: None
+  export_frozen_group: None
+  export_staged_BERD_full_responses: None #"2023_staged_BERD_full_responses_24-11-26_v922.csv"
+  export_staged_full_responses: None # "PNP_2023_staged_full_responses_24-11-26_v922.csv"
+  export_staged_NI_full_responses: None
+  export_full_responses_imputed: "PNP_2023_full_responses_imputed_24-12-18_v971.csv"
+  export_full_estimation_qa: None  # "2022_full_estimation_qa_24-07-15_v555.csv"
+  export_invalid_unrecognised_postcodes: None # "2022_invalid_unrecognised_postcodes_24-07-04_v503.csv"
+  export_full_responses_mapped: "PNP_2023_full_responses_mapped_24-12-11_v212.csv"
@@ -15,8 +15,8 @@ def setup_environment():
     return src
 
 
-user_path = os.path.join("src", "user_config.yaml")
-dev_path = os.path.join("src", "dev_config.yaml")
+user_path = os.path.join("config", "test_configs/test_user_config.yaml")
+dev_path = os.path.join("config", "test_configs/test_dev_config.yaml")
 
 src = setup_environment()
 run_time = src.run_pipeline(user_path, dev_path)