Skip to content

Commit 259b5bf

Browse files
authored
Merge pull request #484 from ONSdigital/regression_configs
new test configs set up
2 parents 2ff9b86 + 58abc4f commit 259b5bf

File tree

10 files changed

+395
-14
lines changed

10 files changed

+395
-14
lines changed
Lines changed: 241 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,241 @@
1+
config_validation:
2+
validate: True
3+
path: src/dev_config_schema.yaml
4+
dev_global:
5+
# Logging settings
6+
logging_level: "DEBUG"
7+
# Environment settings
8+
dev_test : False
9+
platform: network # where to load data from: hdfs, network (Windows) or s3 (CDP)
10+
load_from_feather: True
11+
runlog_writer:
12+
write_csv: True # Write the runlog to a CSV file
13+
write_hdf5: False # Write the runlog to an HDF5 file
14+
write_sql: False # Write the runlog to a SQL database
15+
display: False # Display the runlog in the terminal
16+
log_path: "/bat/res_dev/project_data/logs"
17+
s3_paths:
18+
logs_foldername: "/bat/res_dev/project_data/logs/run_logs"
19+
staging_paths:
20+
folder: "01_staging"
21+
feather_output: "feather"
22+
staging_output_path: "staging_qa/full_responses_qa"
23+
pcode_val_path: "staging_qa/postcode_validation"
24+
freezing_paths:
25+
folder: "02_freezing"
26+
frozen_data_staged_output_path: "frozen_data_staged"
27+
frozen_data_staged_path: "frozen_data_staged"
28+
freezing_changes_to_review_path: "changes_to_review"
29+
freezing_amendments_path: "freezing_updates"
30+
freezing_additions_path: "freezing_updates"
31+
ni_paths:
32+
folder: "03_northern_ireland"
33+
ni_staging_output_path: "ni_staging_qa"
34+
construction_paths:
35+
folder: "04_construction"
36+
qa_path: "construction_qa"
37+
mapping_paths:
38+
folder: "05_mapping"
39+
qa_path: "mapping_qa"
40+
imputation_paths:
41+
folder: "06_imputation"
42+
qa_path: "imputation_qa"
43+
manual_trimming_path: "manual_trimming"
44+
backdata_out_path: "backdata_output"
45+
outliers_paths:
46+
folder: "07_outliers"
47+
qa_path: "outliers_qa"
48+
auto_outliers_path: "auto_outliers"
49+
estimation_paths:
50+
folder: "08_estimation"
51+
qa_path: "estimation_qa"
52+
apportionment_paths:
53+
folder: "09_apportionment"
54+
qa_path: "apportionment_qa"
55+
outputs_paths:
56+
folder: "10_outputs"
57+
# TODO: add all the output subpaths
58+
outputs_master: ""
59+
pnp_paths:
60+
staging_qa_path: "01_staging/pnp_staging_qa"
61+
export_paths:
62+
export_folder: "outgoing_export"
63+
network_paths:
64+
root: "R:/BERD Results System Development 2023/DAP_emulation/"
65+
logs_foldername: "logs/run_logs"
66+
# snapshot_path: "R:/BERD Results System Development 2023/DAP_emulation/spp_snapshots/2023_snapshots/snapshot-202312-002-b9b6048a-51c9-4669-919a-e92fc6e9c433.json"
67+
snapshot_path: "R:/BERD Results System Development 2023/DAP_emulation/spp_snapshots/2023_snapshots/snapshot-202312-002-85ae5659-7147-42c3-a5dd-d69beccc9e09.json"
68+
updated_snapshot_path: "R:/BERD Results System Development 2023/DAP_emulation/spp_snapshots/2023_snapshots/snapshot-202312-002-b9b6048a-51c9-4669-919a-e92fc6e9c433.json"
69+
ni_full_responses_path: "R:/BERD Results System Development 2023/DAP_emulation/2023_surveys/BERD/03_northern_ireland/2023/ONS_Data_RD2022_Revised_Dataset_Weighted_Unrounded_NISRA.csv"
70+
# 2022 paths
71+
# snapshot_path: "R:/BERD Results System Development 2023/DAP_emulation/spp_snapshots/2022_snapshots/snapshot-202212-002-83b5bacd-7c99-45cf-b989-d43d762dd054.json"
72+
# updated_snapshot_path: "R:/BERD Results System Development 2023/DAP_emulation/spp_snapshots/2022_snapshots/snapshot-202212-002-83b5bacd-7c99-45cf-b989-d43d762dd054.json"
73+
# Freezing data paths
74+
frozen_data_staged_output_path: "02_freezing/frozen_data_staged/"
75+
berd_frozen_data_staged_path: "02_freezing/frozen_data_staged/2023_FROZEN_staged_full_responses_25-04-29_v104.csv"
76+
pnp_frozen_data_staged_path: "02_freezing/frozen_data_staged/PNP_2023_FROZEN_staged_full_responses_25-01-29_v598.csv"
77+
freezing_changes_to_review_path: "02_freezing/changes_to_review/"
78+
freezing_additions_path: "02_freezing/freezing_updates/2023_freezing_additions_to_review_25-04-29_v108_all_true.csv"
79+
freezing_deletions_path: "02_freezing/freezing_updates/2023_freezing_deletions_to_review_25-04-29_v108_all_true.csv"
80+
freezing_amendments_path: "02_freezing/freezing_updates/2023_freezing_amendments_to_review_25-04-29_v108_all_true.csv"
81+
# Imputation and outliers input paths
82+
# backdata_path: "R:/BERD Results System Development 2023/DAP_emulation/2021_surveys/BERD/06_imputation/backdata_output/2021_backdata_oct_24.csv"
83+
backdata_path: "R:/BERD Results System Development 2023/DAP_emulation/2022_surveys/BERD/06_imputation/backdata_output/2022_backdata_published_v347.csv"
84+
pnp_backdata_path: "R:/BERD Results System Development 2023/DAP_emulation/2021_surveys/PNP/06_imputation/backdata_output/PNP_2021_backdata_with_pg.csv"
85+
manual_imp_trim_path: "06_imputation/manual_trimming/2023_manual_trimming_v1.csv"
86+
manual_outliers_path: "07_outliers/manual_outliers/2023_manual_outliers_v1.csv"
87+
# Construction paths
88+
all_data_construction_file_path: "04_construction/manual_construction/2023_test_construction_file_v3.csv"
89+
postcode_construction_file_path: "04_construction/manual_construction/2023_test_postcode_construction_file.csv"
90+
construction_file_path_ni: "04_construction/manual_construction/test_construction_ni_file.csv"
91+
# postcode paths
92+
postcode_masterlist: "R:/BERD Results System Development 2023/DAP_emulation/ONS_Postcode_Reference/postcodes_pcd2_itl.csv"
93+
pcode_val_path: "01_staging/staging_qa/postcode_validation"
94+
# schema paths
95+
schema_paths:
96+
manual_trimming_schema: "config/output_schemas/manual_trimming_qa_schema.toml"
97+
short_form_schema: "config/output_schemas/short_form_schema.toml"
98+
long_form_schema: "config/output_schemas/long_form_schema.toml"
99+
tau_schema: "config/output_schemas/tau_schema.toml"
100+
gb_sas_schema: "config/output_schemas/gb_sas_schema.toml"
101+
ni_sas_schema: "config/output_schemas/ni_sas_schema.toml"
102+
intram_by_pg_gb_schema: "config/output_schemas/intram_by_pg_gb_schema.toml"
103+
intram_by_pg_uk_schema: "config/output_schemas/intram_by_pg_uk_schema.toml"
104+
intram_gb_itl1_schema: "config/output_schemas/intram_gb_itl1_schema.toml"
105+
intram_gb_itl2_schema: "config/output_schemas/intram_gb_itl2_schema.toml"
106+
intram_uk_itl1_schema: "config/output_schemas/intram_uk_itl1_schema.toml"
107+
intram_uk_itl2_schema: "config/output_schemas/intram_uk_itl2_schema.toml"
108+
intram_by_sic_schema: "config/output_schemas/intram_by_sic_schema.toml"
109+
status_filtered_qa_schema: "config/output_schemas/status_filtered_qa_schema.toml"
110+
fte_total_qa_schema: "config/output_schemas/fte_total_qa_schema.toml"
111+
frozen_group_schema: "config/output_schemas/frozen_group_schema.toml"
112+
full_estimation_qa_schema: "config/output_schemas/full_estimation_qa_schema.toml"
113+
full_responses_imputed_schema: "config/output_schemas/full_responses_imputed_schema.toml"
114+
staged_full_responses_schema: "config/output_schemas/staged_full_responses_schema.toml"
115+
invalid_unrecognised_postcodes_schema: "config/output_schemas/invalid_unrecognised_postcodes_schema.toml"
116+
full_responses_mapped_schema: "config/output_schemas/full_responses_mapped_schema.toml"
117+
pnp_national_accounts_schema: "config/output_schemas/pnp_national_accounts_schema.toml"
118+
119+
# Export config for users
120+
mappers:
121+
geo_cols: ["ITL221CD", "ITL221NM", "ITL121CD", "ITL121NM"]
122+
gb_itl: "LAU121CD"
123+
ni_itl: "N92000002"
124+
outliers:
125+
flag_cols: ["701", "702", "703", "704", "705", "706", "707"] # NOT for user config. Columns to flag for outliers.
126+
devtest:
127+
seltype_list: [1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15, 17, 18, 19, 21, 22, 23, 25, 26, 27, 29, 30, 31, 33, 34, 35, 37, 38, 39]
128+
log_filenames:
129+
main: "main_runlog.csv"
130+
configs: "configs_runlog.csv"
131+
logs: "logs_runlog.csv"
132+
run_log_sql:
133+
log_db: "test_runlog"
134+
log_mode: "append"
135+
estimation:
136+
numeric_cols: ["701", "702", "703", "704", "705", "706", "707", "709", "710", "711"]
137+
imputation:
138+
lf_target_vars:
139+
- "211"
140+
- "305"
141+
- "emp_researcher"
142+
- "emp_technician"
143+
- "emp_other"
144+
- "headcount_res_m"
145+
- "headcount_res_f"
146+
- "headcount_tec_m"
147+
- "headcount_tec_f"
148+
- "headcount_oth_m"
149+
- "headcount_oth_f"
150+
sum_cols:
151+
- "emp_total"
152+
- "headcount_tot_m"
153+
- "headcount_tot_f"
154+
- "headcount_total"
155+
breakdowns:
156+
"211":
157+
- "202"
158+
- "203"
159+
- "204"
160+
- "205"
161+
- "206"
162+
- "207"
163+
- "209"
164+
- "210"
165+
- "212"
166+
- "214"
167+
- "216"
168+
- "218"
169+
- "219"
170+
- "220"
171+
- "221"
172+
- "222"
173+
- "223"
174+
- "225"
175+
- "226"
176+
- "227"
177+
- "228"
178+
- "229"
179+
- "237"
180+
- "242"
181+
- "243"
182+
- "244"
183+
- "245"
184+
- "246"
185+
- "247"
186+
- "248"
187+
- "249"
188+
- "250"
189+
"305":
190+
- "302"
191+
- "303"
192+
- "304"
193+
emp_total:
194+
- "emp_researcher"
195+
- "emp_technician"
196+
- "emp_other"
197+
headcount_total:
198+
- "headcount_res_m"
199+
- "headcount_res_f"
200+
- "headcount_tec_m"
201+
- "headcount_tec_f"
202+
- "headcount_oth_m"
203+
- "headcount_oth_f"
204+
consistency_checks:
205+
2xx_totals:
206+
purchases_split: ["222", "223", "203"]
207+
sal_oth_expend: ["202", "203", "204"]
208+
research_expend: ["205", "206", "207", "204"]
209+
capex: ["219", "220", "209", "210"]
210+
intram: ["204", "210", "211"]
211+
funding: ['212', '214', '216', '242', '250', '243', '244', '245', '246', '247', '248', '249', '218']
212+
ownership: ['225', '226', '227', '228', '229', '237', '218']
213+
equality: ['211', '218']
214+
inequality: ["221"]
215+
3xx_totals:
216+
purchases: ['302', '303', '304', '305']
217+
4xx_totals:
218+
emp_civil: ['405', '407', '409', '411']
219+
emp_defence: ['406', '408', '410', '412']
220+
5xx_totals:
221+
headcount_tot_m: ['501', '503', '505', '507']
222+
headcount_tot_f: ['502', '504', '506', '508']
223+
emp_xx_totals:
224+
employment: ["emp_researcher", "emp_technician", "emp_other", "emp_total"]
225+
hc_xx_totals:
226+
headcount_tot_m: ["headcount_res_m", "headcount_tec_m", "headcount_oth_m", "headcount_tot_m"]
227+
headcount_tot_f: ["headcount_res_f", "headcount_tec_f", "headcount_oth_f", "headcount_tot_f"]
228+
headcount_total: ["headcount_tot_m", "headcount_tot_f", "headcount_total"]
229+
6xx_totals:
230+
site_percentage: ["602"]
231+
7xx_a_totals:
232+
sf_expend: ["701", "702", "709"]
233+
sf_purchases: ["703", "704", "710"]
234+
7xx_b_totals:
235+
sf_fte: ["706", "707", "711"]
236+
sf_headcount: ["705"]
237+
238+
s3:
239+
ssl_file: "/etc/pki/tls/certs/ca-bundle.crt"
240+
s3_bucket: "onscdp-dev-data01-5320d6ca"
241+
#s3_bucket: "onscdp-mig-data01-0221a8af"
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
config_validation:
2+
validate: True
3+
path: src/user_config_schema.yaml
4+
survey:
5+
survey_type: "BERD"
6+
survey_year: 2023
7+
global:
8+
# Staging and validation settings
9+
postcode_csv_check: True
10+
use_backdata: True
11+
load_ni_data: True
12+
# Pipeline run type
13+
run_with_snapshot: True # Run until point of freezing
14+
run_with_frozen_data: False # Run with specified frozen csv
15+
# Freezing settings
16+
run_with_snapshot_and_freeze: False # Last run with frozen snapshot - produces frozen csv
17+
load_updated_snapshot_for_comparison: False # Run with new json snapshot, and compare to specified frozen csv
18+
run_updates_and_freeze: False # Add specified changes to frozen csv, and refreeze
19+
# Construction Settings
20+
run_all_data_construction: True
21+
run_postcode_construction: True
22+
run_ni_construction: False
23+
load_manual_outliers: True
24+
load_manual_imputation: True
25+
# Backdata output settings
26+
output_backdata: False
27+
# QA output settings
28+
output_full_responses: False
29+
output_ni_full_responses: False
30+
output_mapping_qa: False
31+
output_mapping_ni_qa: False
32+
output_imputation_qa: False
33+
output_auto_outliers: False
34+
output_outlier_qa : False
35+
output_estimation_qa: False
36+
output_apportionment_qa: False
37+
# Final output settings
38+
output_long_form: False
39+
output_short_form: False
40+
output_gb_sas: False
41+
output_ni_sas: False
42+
output_tau: True
43+
output_intram_by_pg_gb: False
44+
output_intram_by_pg_uk: False
45+
output_intram_gb_itl: False
46+
output_intram_uk_itl: False
47+
output_intram_by_civil_defence: False
48+
output_intram_by_sic: False
49+
output_fte_total_qa: False
50+
output_status_filtered: False
51+
output_frozen_group: False
52+
output_intram_totals: False
53+
output_pnp_na: False
54+
s3_paths:
55+
root: "/bat/res_dev/project_data/"
56+
# staging input paths
57+
snapshot_path: "/bat/res_dev/anonymised/snapshot_202312_a_copy_of_202012_anon.json" # Run until point of freezing - last run is first freeze
58+
updated_snapshot_path: "/bat/res_dev/anonymised/snapshot_202312_a_copy_of_202012_anon.json"
59+
ni_full_responses_path: "03_northern_ireland/2021/TEST_ni.csv"
60+
feather_path: "staging/feather"
61+
# Freezing data paths
62+
frozen_data_staged_output_path: "02_freezing/frozen_data_staged" # path for folder
63+
berd_frozen_data_staged_path: "02_freezing/frozen_data_staged/2023_FROZEN_staged_full_responses_24-08-07_v906.csv" # pragma: allowlist secret
64+
pnp_frozen_data_staged_path: "02_freezing/frozen_data_staged/2023_FROZEN_staged_full_responses_24-08-07_v906.csv" # pragma: allowlist secret
65+
freezing_changes_to_review_path: "02_freezing/changes_to_review"
66+
freezing_additions_path: "02_freezing/freezing_updates"
67+
freezing_amendments_path: "02_freezing/freezing_updates"
68+
# Imputation and outliers input paths
69+
backdata_path: "/bat/res_dev/project_data/2021_surveys/BERD/06_imputation/backdata_output/2021_backdata_published_v347_anon.csv"
70+
manual_imp_trim_path: "06_imputation/manual_trimming/trimming_qa_2023-11-27_v359.csv"
71+
manual_outliers_path: "07_outliers/manual_outliers/manual_outlier_2023-08-29_v67.csv"
72+
# Construction paths
73+
all_data_construction_file_path: "04_construction/manual_construction/test_construction_file - Copy.csv"
74+
postcode_construction_file_path: "04_construction/manual_construction/test_postcode_construction_file.csv"
75+
construction_file_path_ni: "04_construction/manual_construction/test_construction_file.csv"
76+
# postcode paths
77+
postcode_masterlist: "/bat/res_dev/mappers/ONSPD_NOV_2022_UK_first_100.csv"
78+
pcode_val_path: "01_staging/staging_qa/postcode_validation"
79+
# mapper paths
80+
81+
# mapper paths
82+
2022_mappers:
83+
mappers_version: "v1"
84+
postcode_mapper: "postcodes_2022.csv"
85+
itl_mapper_path: "itl_2022.csv"
86+
ultfoc_mapper_path: "BERD_2022_ultfoc.csv"
87+
ref_list_817_mapper_path: "BERD_2022_ref_list.csv"
88+
cellno_path: 'berd_2022_cellno_coverage.csv'
89+
pg_num_alpha_mapper_path: 'pg_num_alpha_2022.csv'
90+
sic_pg_alpha_mapper_path: 'sic_pg_alpha_2022.csv'
91+
sic_pg_num_mapper_path: 'sic_pg_num_2022.csv'
92+
pg_detailed_mapper_path: "pg_detailed_2022.csv"
93+
sic_division_detailed_mapper_path: "sic_div_detailed_2022.csv"
94+
2023_mappers:
95+
mappers_version: "v3"
96+
postcode_mapper: "postcodes_2023.csv"
97+
itl_mapper_path: "itl_2023.csv"
98+
ultfoc_mapper_path: "BERD_2023_ultfoc.csv"
99+
cellno_path: 'BERD_2023_cellno_coverage.csv'
100+
pg_num_alpha_mapper_path: 'pg_num_alpha_2023.csv'
101+
sic_pg_alpha_mapper_path: 'sic_pg_alpha_2023.csv'
102+
sic_pg_num_mapper_path: 'sic_pg_num_2023.csv'
103+
pg_detailed_mapper_path: "pg_detailed_2023.csv"
104+
sic_division_detailed_mapper_path: "sic_div_detailed_2023.csv"
105+
106+
# outliers and imputation settings
107+
outliers:
108+
upper_clip: 0.05 # enter percentage as a decimal (float) - default is 0.05
109+
lower_clip: 0.0 # enter percentage as a decimal (float) - default is 0.0
110+
imputation:
111+
lower_trim_perc: 15
112+
upper_trim_perc: 15
113+
trim_threshold: 10 # trimming will only occur on classes strictly larger than this value
114+
sf_expansion_threshold: 3 # default is 3: the minimum viable imputation class size for short form imputation
115+
mor_threshold: 3 # default is 3: the minimum viable imputation class size for MoR imputation
116+
# export settings
117+
export_choices:
118+
copy_or_move_files: "copy"
119+
export_short_form: None
120+
export_long_form: None
121+
export_tau: None
122+
export_gb_sas: "PNP_2023_output_gb_sas_24-11-19_v914.csv"
123+
export_ni_sas: None
124+
export_intram_by_pg_gb: None
125+
export_intram_by_pg_uk: None
126+
export_intram_gb_itl1: None
127+
export_intram_uk_itl1: None
128+
export_intram_gb_itl2: None
129+
export_intram_uk_itl2: None
130+
export_intram_by_sic: None
131+
export_fte_total_qa: None
132+
export_status_filtered: None
133+
export_frozen_group: None
134+
export_staged_BERD_full_responses: None #"2023_staged_BERD_full_responses_24-11-26_v922.csv"
135+
export_staged_full_responses: None # "PNP_2023_staged_full_responses_24-11-26_v922.csv"
136+
export_staged_NI_full_responses: None
137+
export_full_responses_imputed: "PNP_2023_full_responses_imputed_24-12-18_v971.csv"
138+
export_full_estimation_qa: None # "2022_full_estimation_qa_24-07-15_v555.csv"
139+
export_invalid_unrecognised_postcodes: None # "2022_invalid_unrecognised_postcodes_24-07-04_v503.csv"
140+
export_full_responses_mapped: "PNP_2023_full_responses_mapped_24-12-11_v212.csv"

main.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@ def setup_environment():
1515
return src
1616

1717

18-
user_path = os.path.join("src", "user_config.yaml")
19-
dev_path = os.path.join("src", "dev_config.yaml")
18+
user_path = os.path.join("config", "test_configs/test_user_config.yaml")
19+
dev_path = os.path.join("config", "test_configs/test_dev_config.yaml")
2020

2121
src = setup_environment()
2222
run_time = src.run_pipeline(user_path, dev_path)

0 commit comments

Comments
 (0)