Skip to content

Commit c7ef6fb

Browse files
committed
fix data dictionary bug
1 parent b131c86 commit c7ef6fb

File tree

4 files changed

+135
-26
lines changed

4 files changed

+135
-26
lines changed

main.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def get_config(snapshot_date=None):
3434
'ifoi_en': 'https://open.canada.ca/data/dataset/a35cf382-690c-4221-a971-cf0fd189a46f/resource/7c131a87-7784-4208-8e5c-043451240d95/download/ifoi_roif_en.csv',
3535
'ifoi_fr': 'https://open.canada.ca/data/dataset/a35cf382-690c-4221-a971-cf0fd189a46f/resource/45069fe9-abe3-437f-97dd-3f64958bfa85/download/ifoi_roif_fr.csv',
3636
'rbpo': 'https://open.canada.ca/data/dataset/a35cf382-690c-4221-a971-cf0fd189a46f/resource/64774bc1-c90a-4ae2-a3ac-d9b50673a895/download/rbpo_rppo_en.csv',
37-
# 'op_cost': 'https://donnees-data.tpsgc-pwgsc.gc.ca/ba1/respessentielles-coreresp/respessentielles-coreresp.csv'
37+
# 'op_cost': 'https://donnees-data.tpsgc-pwgsc.gc.ca/ba1/respessentielles-coreresp/respessentielles-coreresp.csv' no longer in use
3838
}
3939

4040
json_urls = {
@@ -64,7 +64,7 @@ def get_config(snapshot_date=None):
6464
'2025-2026':'https://donnees-data.tpsgc-pwgsc.gc.ca/ba1/cp-pc/cp-pc-2526-fra.csv'
6565
}
6666

67-
if snapshot_date:
67+
if snapshot_date: # if a snapshot date has been defined, process as a snapshot
6868
input_snapshot_dir = input_dir / "snapshots" / snapshot_date
6969
output_dir = base_dir / "outputs" / "snapshots" / snapshot_date
7070
indicators_dir = base_dir / "outputs" / "snapshots" / snapshot_date / "indicators"
@@ -127,7 +127,7 @@ def main():
127127
logging.info("Starting data processing")
128128

129129
# Download and process raw data
130-
if not args.local:
130+
if not args.local: # Download these files only when the "local" option was not passed
131131
logging.info("Downloading raw data...")
132132
download_csv_files(config)
133133
download_json_files(config)

notebooks/dd.ipynb

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"id": "fd2d413a-666f-49d4-b86c-7d6898a496a5",
7+
"metadata": {},
8+
"outputs": [],
9+
"source": [
10+
"import pandas as pd\n",
11+
"import numpy as np\n",
12+
"import json\n",
13+
"import re, pytz, os, requests, sys\n",
14+
"from pathlib import Path\n",
15+
"from datetime import datetime\n",
16+
"import sys\n",
17+
"sys.path.append(\"/workspaces/service-data\")\n",
18+
"\n",
19+
"from src.clean import clean_percentage, clean_fiscal_yr, normalize_string, standardize_column_names\n",
20+
"from src.load import load_csv\n",
21+
"from src.export import export_to_csv\n",
22+
"from src.merge import merge_si, merge_ss\n",
23+
"from src.utils import dept_list, program_list\n",
24+
"from main import get_config\n",
25+
"\n",
26+
"import pandas as pd\n",
27+
"import numpy as np\n",
28+
"import pytz\n",
29+
"from pathlib import Path\n",
30+
"\n",
31+
"\n",
32+
"\n",
33+
"base_dir = Path.cwd()\n",
34+
"parent_dir = base_dir.parent\n",
35+
"config = get_config()\n"
36+
]
37+
},
38+
{
39+
"cell_type": "code",
40+
"execution_count": 5,
41+
"id": "e4c339d6",
42+
"metadata": {},
43+
"outputs": [],
44+
"source": [
45+
"\"\"\"Builds a structured data dictionary from a JSON file, processes nested data, \n",
46+
"renames columns, standardizes names, and exports to CSV.\"\"\"\n",
47+
"\n",
48+
"INPUT_DIR = config['input_dir']\n",
49+
"file_path = INPUT_DIR / 'service_data_dict.json'\n",
50+
"\n",
51+
"# Load JSON file into a dictionary\n",
52+
"with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
53+
" data = json.load(file)\n",
54+
"\n",
55+
"# Initial normalization of json file\n",
56+
"data_dict = pd.json_normalize(data)\n",
57+
"\n",
58+
"# Explode and normalize the 'resources' portion\n",
59+
"data_dict = data_dict.explode('resources').reset_index(drop=True)\n",
60+
"data_dict = pd.json_normalize(data_dict['resources'])\n",
61+
"\n",
62+
"# Explode the 'fields' portion\n",
63+
"data_dict = data_dict.explode('fields').reset_index(drop=True)\n",
64+
"\n",
65+
"# Tie the resource fields to the 'fields' portion\n",
66+
"data_dict_fields = pd.json_normalize(data_dict['fields'])\n",
67+
"data_dict = data_dict.merge(data_dict_fields, left_index=True, right_index=True)\n",
68+
"\n",
69+
"# List of field names and details about their type and requirements\n",
70+
"dd_field_names = data_dict.loc[:, ~data_dict.columns.str.startswith('choices.')].drop(columns=['fields'])\n",
71+
"\n",
72+
"# List of translated code labels for fields with restricted input choices\n",
73+
"dd_choices = data_dict.melt(\n",
74+
" id_vars = ['resource_name', 'title.en', 'title.fr','id','label.en', 'label.fr'], \n",
75+
" value_vars=[col for col in data_dict.columns if col.startswith('choices.')]\n",
76+
")\n",
77+
"\n",
78+
"dd_choices.dropna(subset=['value'], inplace=True)\n",
79+
"\n",
80+
"dd_choices['code'] = dd_choices['variable'].str.split('.').str[1]\n",
81+
"dd_choices['en_fr'] = dd_choices['variable'].str.split('.').str[2]\n",
82+
"dd_choices = dd_choices.dropna(subset='en_fr')\n",
83+
"dd_choices = dd_choices.loc[dd_choices['en_fr'].isin(['en', 'fr'])]\n",
84+
"\n",
85+
"dd_choices = dd_choices.pivot(index=['resource_name', 'id', 'code'], columns='en_fr', values='value')\n",
86+
"dd_choices = dd_choices.reset_index()\n",
87+
"\n",
88+
"# Keep dd_choices tidy by removing program_id and splitting into its own file (dd_program)\n",
89+
"dd_program = dd_choices.loc[dd_choices['id'] == 'program_id']\n",
90+
"dd_choices = dd_choices.loc[dd_choices['id'] != 'program_id']\n",
91+
"\n",
92+
"# Standardize column names\n",
93+
"dd_field_names = standardize_column_names(dd_field_names)\n",
94+
"dd_program = standardize_column_names(dd_program)\n",
95+
"dd_choices = standardize_column_names(dd_choices)\n",
96+
"\n",
97+
"data_dictionary_file_dict = {\n",
98+
" 'dd_field_names': dd_field_names,\n",
99+
" 'dd_program': dd_program,\n",
100+
" 'dd_choices': dd_choices\n",
101+
"}\n"
102+
]
103+
}
104+
],
105+
"metadata": {
106+
"kernelspec": {
107+
"display_name": "Python 3",
108+
"language": "python",
109+
"name": "python3"
110+
},
111+
"language_info": {
112+
"codemirror_mode": {
113+
"name": "ipython",
114+
"version": 3
115+
},
116+
"file_extension": ".py",
117+
"mimetype": "text/x-python",
118+
"name": "python",
119+
"nbconvert_exporter": "python",
120+
"pygments_lexer": "ipython3",
121+
"version": "3.12.1"
122+
}
123+
},
124+
"nbformat": 4,
125+
"nbformat_minor": 5
126+
}

notebooks/experiment-template.ipynb

Lines changed: 5 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
"source": [
1010
"import pandas as pd\n",
1111
"import numpy as np\n",
12+
"import json\n",
1213
"import re, pytz, os, requests, sys\n",
1314
"from pathlib import Path\n",
1415
"from datetime import datetime\n",
@@ -27,8 +28,11 @@
2728
"import pytz\n",
2829
"from pathlib import Path\n",
2930
"\n",
31+
"\n",
32+
"\n",
3033
"base_dir = Path.cwd()\n",
31-
"parent_dir = base_dir.parent"
34+
"parent_dir = base_dir.parent\n",
35+
"config = get_config()"
3236
]
3337
},
3438
{
@@ -50,28 +54,6 @@
5054
" delimiter=';'\n",
5155
" )"
5256
]
53-
},
54-
{
55-
"cell_type": "code",
56-
"execution_count": 11,
57-
"id": "434a0e1a",
58-
"metadata": {},
59-
"outputs": [
60-
{
61-
"ename": "SyntaxError",
62-
"evalue": "invalid syntax (310464099.py, line 3)",
63-
"output_type": "error",
64-
"traceback": [
65-
" \u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[11]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[31m \u001b[39m\u001b[31mfor each program_csv_urls_en_list;\u001b[39m\n ^\n\u001b[31mSyntaxError\u001b[39m\u001b[31m:\u001b[39m invalid syntax\n"
66-
]
67-
}
68-
],
69-
"source": [
70-
"program_csv_urls_en_list = get_config()['program_csv_urls_en']\n",
71-
"\n",
72-
"for each program_csv_urls_en_list;\n",
73-
" print('hello')"
74-
]
7557
}
7658
],
7759
"metadata": {

src/utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -371,6 +371,7 @@ def build_data_dictionary(config):
371371
dd_choices['code'] = dd_choices['variable'].str.split('.').str[1]
372372
dd_choices['en_fr'] = dd_choices['variable'].str.split('.').str[2]
373373
dd_choices = dd_choices.dropna(subset='en_fr')
374+
dd_choices = dd_choices.loc[dd_choices['en_fr'].isin(['en', 'fr'])]
374375

375376

376377
dd_choices = dd_choices.pivot(index=['resource_name', 'id', 'code'], columns='en_fr', values='value')

0 commit comments

Comments
 (0)