Fix data dictionary bug: keep only 'en'/'fr' choice labels before pivoting

ge-tbs · ge-tbs · commit c7ef6fba6657 · 2025-09-04T17:20:55.000Z
diff --git a/main.py b/main.py
@@ -34,7 +34,7 @@ def get_config(snapshot_date=None):
         'ifoi_en': 'https://open.canada.ca/data/dataset/a35cf382-690c-4221-a971-cf0fd189a46f/resource/7c131a87-7784-4208-8e5c-043451240d95/download/ifoi_roif_en.csv',
         'ifoi_fr': 'https://open.canada.ca/data/dataset/a35cf382-690c-4221-a971-cf0fd189a46f/resource/45069fe9-abe3-437f-97dd-3f64958bfa85/download/ifoi_roif_fr.csv',
         'rbpo': 'https://open.canada.ca/data/dataset/a35cf382-690c-4221-a971-cf0fd189a46f/resource/64774bc1-c90a-4ae2-a3ac-d9b50673a895/download/rbpo_rppo_en.csv',
-        # 'op_cost': 'https://donnees-data.tpsgc-pwgsc.gc.ca/ba1/respessentielles-coreresp/respessentielles-coreresp.csv'   
+        # 'op_cost': 'https://donnees-data.tpsgc-pwgsc.gc.ca/ba1/respessentielles-coreresp/respessentielles-coreresp.csv'   # no longer in use
     }
 
     json_urls = {
@@ -64,7 +64,7 @@ def get_config(snapshot_date=None):
         '2025-2026':'https://donnees-data.tpsgc-pwgsc.gc.ca/ba1/cp-pc/cp-pc-2526-fra.csv'
     }
 
-    if snapshot_date:
+    if snapshot_date: # if a snapshot date has been defined, process as a snapshot
         input_snapshot_dir = input_dir / "snapshots" / snapshot_date
         output_dir = base_dir / "outputs" / "snapshots" / snapshot_date
         indicators_dir = base_dir / "outputs" / "snapshots" / snapshot_date / "indicators"
@@ -127,7 +127,7 @@ def main():
         logging.info("Starting data processing")
 
         # Download and process raw data
-        if not args.local:
+        if not args.local: # Download raw files unless the "local" option was passed
             logging.info("Downloading raw data...")
             download_csv_files(config)
             download_json_files(config)
diff --git a/notebooks/dd.ipynb b/notebooks/dd.ipynb
@@ -0,0 +1,126 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "fd2d413a-666f-49d4-b86c-7d6898a496a5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import json\n",
+    "import re, pytz, os, requests, sys\n",
+    "from pathlib import Path\n",
+    "from datetime import datetime\n",
+    "import sys\n",
+    "sys.path.append(\"/workspaces/service-data\")\n",
+    "\n",
+    "from src.clean import clean_percentage, clean_fiscal_yr, normalize_string, standardize_column_names\n",
+    "from src.load import load_csv\n",
+    "from src.export import export_to_csv\n",
+    "from src.merge import merge_si, merge_ss\n",
+    "from src.utils import dept_list, program_list\n",
+    "from main import get_config\n",
+    "\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import pytz\n",
+    "from pathlib import Path\n",
+    "\n",
+    "\n",
+    "\n",
+    "base_dir = Path.cwd()\n",
+    "parent_dir = base_dir.parent\n",
+    "config = get_config()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "e4c339d6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\"\"\"Builds a structured data dictionary from a JSON file: flattens the nested\n",
+    "resources and fields, splits out choice labels and program codes, and standardizes column names.\"\"\"\n",
+    "\n",
+    "INPUT_DIR = config['input_dir']\n",
+    "file_path =  INPUT_DIR / 'service_data_dict.json'\n",
+    "\n",
+    "# Load JSON file into a dictionary\n",
+    "with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
+    "    data = json.load(file)\n",
+    "\n",
+    "# Initial normalization of json file\n",
+    "data_dict = pd.json_normalize(data)\n",
+    "\n",
+    "# Explode and normalize the 'resources' portion\n",
+    "data_dict = data_dict.explode('resources').reset_index(drop=True)\n",
+    "data_dict = pd.json_normalize(data_dict['resources'])\n",
+    "\n",
+    "# Explode the 'fields' portion\n",
+    "data_dict = data_dict.explode('fields').reset_index(drop=True)\n",
+    "\n",
+    "# Tie the resource fields to the 'fields' portion\n",
+    "data_dict_fields = pd.json_normalize(data_dict['fields'])\n",
+    "data_dict = data_dict.merge(data_dict_fields, left_index=True, right_index=True)\n",
+    "\n",
+    "# List of field names and details about their type and requirements\n",
+    "dd_field_names = data_dict.loc[:, ~data_dict.columns.str.startswith('choices.')].drop(columns=['fields'])\n",
+    "\n",
+    "# List of translated code labels for fields with restricted input choices\n",
+    "dd_choices = data_dict.melt(\n",
+    "    id_vars = ['resource_name', 'title.en', 'title.fr','id','label.en', 'label.fr'], \n",
+    "    value_vars=[col for col in data_dict.columns if col.startswith('choices.')]\n",
+    ")\n",
+    "\n",
+    "dd_choices.dropna(subset=['value'], inplace=True)\n",
+    "\n",
+    "dd_choices['code'] = dd_choices['variable'].str.split('.').str[1]\n",
+    "dd_choices['en_fr'] = dd_choices['variable'].str.split('.').str[2]\n",
+    "dd_choices = dd_choices.dropna(subset='en_fr')\n",
+    "dd_choices = dd_choices.loc[dd_choices['en_fr'].isin(['en', 'fr'])]\n",
+    "\n",
+    "dd_choices = dd_choices.pivot(index=['resource_name', 'id', 'code'], columns='en_fr', values='value')\n",
+    "dd_choices = dd_choices.reset_index()\n",
+    "\n",
+    "# Keep dd_choices tidy by removing program_id and splitting into its own file (dd_program)\n",
+    "dd_program = dd_choices.loc[dd_choices['id'] == 'program_id']\n",
+    "dd_choices = dd_choices.loc[dd_choices['id'] != 'program_id']\n",
+    "\n",
+    "# Standardize column names\n",
+    "dd_field_names = standardize_column_names(dd_field_names)\n",
+    "dd_program = standardize_column_names(dd_program)\n",
+    "dd_choices = standardize_column_names(dd_choices)\n",
+    "\n",
+    "data_dictionary_file_dict = {\n",
+    "    'dd_field_names': dd_field_names,\n",
+    "    'dd_program': dd_program,\n",
+    "    'dd_choices': dd_choices\n",
+    "}\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/experiment-template.ipynb b/notebooks/experiment-template.ipynb
@@ -9,6 +9,7 @@
    "source": [
     "import pandas as pd\n",
     "import numpy as np\n",
+    "import json\n",
     "import re, pytz, os, requests, sys\n",
     "from pathlib import Path\n",
     "from datetime import datetime\n",
@@ -27,8 +28,11 @@
     "import pytz\n",
     "from pathlib import Path\n",
     "\n",
+    "\n",
+    "\n",
     "base_dir = Path.cwd()\n",
-    "parent_dir = base_dir.parent"
+    "parent_dir = base_dir.parent\n",
+    "config = get_config()"
    ]
   },
   {
@@ -50,28 +54,6 @@
     "                 delimiter=';'\n",
     "                 )"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "id": "434a0e1a",
-   "metadata": {},
-   "outputs": [
-    {
-     "ename": "SyntaxError",
-     "evalue": "invalid syntax (310464099.py, line 3)",
-     "output_type": "error",
-     "traceback": [
-      "  \u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[11]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[31m    \u001b[39m\u001b[31mfor each program_csv_urls_en_list;\u001b[39m\n             ^\n\u001b[31mSyntaxError\u001b[39m\u001b[31m:\u001b[39m invalid syntax\n"
-     ]
-    }
-   ],
-   "source": [
-    "program_csv_urls_en_list = get_config()['program_csv_urls_en']\n",
-    "\n",
-    "for each program_csv_urls_en_list;\n",
-    "    print('hello')"
-   ]
   }
  ],
  "metadata": {
diff --git a/src/utils.py b/src/utils.py
@@ -371,6 +371,7 @@ def build_data_dictionary(config):
     dd_choices['code'] = dd_choices['variable'].str.split('.').str[1]
     dd_choices['en_fr'] = dd_choices['variable'].str.split('.').str[2]
     dd_choices = dd_choices.dropna(subset='en_fr')
+    dd_choices = dd_choices.loc[dd_choices['en_fr'].isin(['en', 'fr'])]
     
     
     dd_choices = dd_choices.pivot(index=['resource_name', 'id', 'code'], columns='en_fr', values='value')