
Commit be84279

Update GX DQ suites so that each expectation is uniquely named; remove transactions loads from testing; add error handling for entire testing loop.
1 parent 1204282 · commit be84279

10 files changed, +191 -289 lines changed
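The headline change is the error handling: the whole per-table body of the testing loop in housing_nec_migration_apply_gx_dq_tests.py (SQL read, batch set-up, suite lookup, checkpoint run and results handling) is now wrapped in a single outer try/except, so one failing table is logged and skipped instead of aborting the run, and the job exits cleanly if nothing succeeded. A minimal, self-contained sketch of that pattern (data_loads, process_table and the simulated failure are illustrative stand-ins, not code from this repo):

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    # Illustrative stand-ins for the real data loads and the real per-table work.
    data_loads = {"addresses": ["table_a", "table_b"]}

    def process_table(table: str) -> str:
        if table == "table_b":
            raise ValueError("simulated failure")
        return f"results for {table}"

    table_results = []
    for data_load, tables in data_loads.items():
        for table in tables:
            try:
                table_results.append(process_table(table))
            except Exception as e:
                # one bad table no longer stops the whole loop
                logger.error(f"CRITICAL ERROR processing table '{table}': {e}")
                continue

    if not table_results:
        logger.error("No tables were processed successfully. Exiting.")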

scripts/helpers/housing_nec_migration_gx_dq_inputs.py

Lines changed: 2 additions & 2 deletions
@@ -6,7 +6,7 @@
     "arrears_actions": {"id_field": "LACA_PAY_REF"},
     "revenue_accounts": {"id_field": "LRAC_PAY_REF"},
     "transactions": {"id_field": "LTRN_ALT_REF"},
-    "addresses": {"id_field": "LAUS_LEGACY_REF"}
+    "addresses": {"id_field": "LAUS_LEGACY_REF"},
 }
 
 data_load_list = [
@@ -16,7 +16,7 @@
     "contacts",
     "arrears_actions",
     "revenue_accounts",
-    "transactions",
+    # "transactions",
     "addresses",
 ]
 

scripts/jobs/housing/housing_nec_migration_addresses_data_load_gx_suite.py

Lines changed: 10 additions & 10 deletions
@@ -7,23 +7,23 @@
 import great_expectations.expectations as gxe
 
 
-class ExpectPropRefColumnValuesToBeUnique(gxe.ExpectColumnValuesToBeUnique):
+class AddressesExpectPropRefColumnValuesToBeUnique(gxe.ExpectColumnValuesToBeUnique):
     column: str = "LAUS_LEGACY_REF"
     description: str = "Expect LAUS_LEGACY_REF values to be unique"
 
-class ExpectPropRefColumnValuesToNotBeNull(gxe.ExpectColumnValuesToNotBeNull):
+class AddressesExpectPropRefColumnValuesToNotBeNull(gxe.ExpectColumnValuesToNotBeNull):
     column: str = "LAUS_LEGACY_REF"
     description: str = "Expect LAUS_LEGACY_REF values to not be Null"
 
-class ExpectUPRNColumnValuesToBeUnique(gxe.ExpectColumnValuesToBeUnique):
+class AddressesExpectUPRNColumnValuesToBeUnique(gxe.ExpectColumnValuesToBeUnique):
     column: str = "LADR_UPRN"
     description: str = "Expect UPRN values to be unique"
 
-class ExpectUPRNColumnValuesToNotBeNull(gxe.ExpectColumnValuesToNotBeNull):
+class AddressesExpectUPRNColumnValuesToNotBeNull(gxe.ExpectColumnValuesToNotBeNull):
     column: str = "LADR_UPRN"
     description: str = "Expect UPRN values to not be Null"
 
-class ExpectAddressColumnsToMatchOrderedList(gxe.ExpectTableColumnsToMatchOrderedList):
+class AddressesExpectAddressColumnsToMatchOrderedList(gxe.ExpectTableColumnsToMatchOrderedList):
     column_list = [
         "LAUS_LEGACY_REF",
         "LAUS_AUT_FAO_CODE",
@@ -65,9 +65,9 @@ class ExpectAddressColumnsToMatchOrderedList(gxe.ExpectTableColumnsToMatchOrdere
 
 suite = gx.ExpectationSuite(name="addresses_data_load_suite")
 
-suite.add_expectation(ExpectPropRefColumnValuesToBeUnique())
-suite.add_expectation(ExpectUPRNColumnValuesToBeUnique())
-suite.add_expectation(ExpectAddressColumnsToMatchOrderedList())
-suite.add_expectation(ExpectPropRefColumnValuesToNotBeNull())
-suite.add_expectation(ExpectUPRNColumnValuesToNotBeNull())
+suite.add_expectation(AddressesExpectPropRefColumnValuesToBeUnique())
+suite.add_expectation(AddressesExpectUPRNColumnValuesToBeUnique())
+suite.add_expectation(AddressesExpectAddressColumnsToMatchOrderedList())
+suite.add_expectation(AddressesExpectPropRefColumnValuesToNotBeNull())
+suite.add_expectation(AddressesExpectUPRNColumnValuesToNotBeNull())
 suite = context.suites.add(suite)
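A note on the renaming pattern used here and in the other suite modules: GX derives an expectation's type name from its class name, so suite modules that each defined a class with the same name (for example ExpectPropRefColumnValuesToBeUnique) produced indistinguishable expectation types once several suites were loaded into the same context. Prefixing every class with its dataset name (AddressesExpect..., ArrearsActionsExpect..., and so on) is presumably what "each expectation is uniquely named" in the commit message refers to.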

scripts/jobs/housing/housing_nec_migration_apply_gx_dq_tests.py

Lines changed: 116 additions & 104 deletions
@@ -21,7 +21,7 @@
 import scripts.jobs.housing.housing_nec_migration_contacts_data_load_gx_suite
 import scripts.jobs.housing.housing_nec_migration_arrears_actions_data_load_gx_suite
 import scripts.jobs.housing.housing_nec_migration_revenue_accounts_data_load_gx_suite
-import scripts.jobs.housing.housing_nec_migration_transactions_data_load_gx_suite
+# import scripts.jobs.housing.housing_nec_migration_transactions_data_load_gx_suite
 import scripts.jobs.housing.housing_nec_migration_addresses_data_load_gx_suite
 
 logging.basicConfig(level=logging.INFO)
@@ -68,119 +68,133 @@ def main():
         for table in table_list.get(data_load):
             logger.info(f"{table} loading...")
 
-            sql_query, id_field = get_sql_query(
-                sql_config=sql_config, data_load=data_load, table=table
-            )
-
-            conn = connect(s3_staging_dir=s3_staging_location, region_name=region_name)
-
             try:
-                df = pd.read_sql_query(sql_query, conn)
-            except Exception as e:
-                logger.info(f"Problem found with {table}: {e}, skipping table.")
-                continue
+                sql_query, id_field = get_sql_query(
+                    sql_config=sql_config, data_load=data_load, table=table
+                )
 
-            # set up batch
-            data_source = context.data_sources.add_pandas(f"{table}_pandas")
-            data_asset = data_source.add_dataframe_asset(name=f"{table}_df_asset")
-            batch_definition = data_asset.add_batch_definition_whole_dataframe(
-                "Athena batch definition"
-            )
-            batch_parameters = {"dataframe": df}
+                conn = connect(s3_staging_dir=s3_staging_location, region_name=region_name)
 
-            # get expectation suite for dataset
-            try:
-                suite = context.suites.get(name=f"{data_load}_data_load_suite")
-            except Exception as e:
-                logger.info(f"Problem found with {data_load}: GX suite {e}, skipping suite.")
-                continue
-            else:
-                expectations = suite.expectations
-
-                validation_definition = gx.ValidationDefinition(
-                    data=batch_definition,
-                    suite=suite,
-                    name=f"validation_definition_{table}",
-                )
-                validation_definition = context.validation_definitions.add(
-                    validation_definition
-                )
-
-                # create and start checking data with checkpoints
-                checkpoint = context.checkpoints.add(
-                    gx.checkpoint.checkpoint.Checkpoint(
-                        name=f"{table}_checkpoint",
-                        validation_definitions=[validation_definition],
-                        result_format={
-                            "result_format": "COMPLETE",
-                            "return_unexpected_index_query": False,
-                            "partial_unexpected_count": 0,
-                        },
+                try:
+                    df = pd.read_sql_query(sql_query, conn)
+                except Exception as e:
+                    logger.error(f"SQL Read Problem found with {table}: {e}, skipping table.")
+                    continue
+
+                # set up batch
+                data_source = context.data_sources.add_pandas(f"{table}_pandas")
+                data_asset = data_source.add_dataframe_asset(name=f"{table}_df_asset")
+                batch_definition = data_asset.add_batch_definition_whole_dataframe(
+                    "Athena batch definition"
                 )
-                )
-
-                checkpoint_result = checkpoint.run(batch_parameters=batch_parameters)
-                results_dict = list(checkpoint_result.run_results.values())[
-                    0
-                ].to_json_dict()
-                table_results_df = pd.json_normalize(results_dict["results"])
-                cols_not_needed = ["result.unexpected_list", "result.observed_value"]
-                cols_to_drop = [
-                    c
-                    for c in table_results_df.columns
-                    if c.startswith("exception_info") or c in cols_not_needed
-                ]
-
-                table_results_df = table_results_df.drop(columns=cols_to_drop)
-                table_results_df_list.append(table_results_df)
-
-                # generate id lists for each unexpected result set
-                query_df = table_results_df.loc[
-                    (~table_results_df["result.unexpected_index_list"].isna())
-                    & (table_results_df["result.unexpected_index_list"].values != "[]")
-                ]
-
-                table_results_df["unexpected_id_list"] = pd.Series(dtype="object")
-                for i, row in query_df.iterrows():
+                batch_parameters = {"dataframe": df}
+
+                # get expectation suite for dataset
                 try:
-                    list(df[id_field].iloc[row["result.unexpected_index_list"]])
+                    suite = context.suites.get(name=f"{data_load}_data_load_suite")
                 except Exception as e:
-                    logger.info(
-                        f"Problem found with {table}: {e}, skipping making unexpected_id_list."
-                    )
+                    logger.error(f"GX Suite Problem found with {data_load}: {e}, skipping suite.")
                     continue
                 else:
-                    table_results_df.loc[i, "unexpected_id_list"] = str(
-                        list(df[id_field].iloc[row["result.unexpected_index_list"]])
+                    expectations = suite.expectations
+
+                    validation_definition = gx.ValidationDefinition(
+                        data=batch_definition,
+                        suite=suite,
+                        name=f"validation_definition_{table}",
+                    )
+
+                    validation_definition = context.validation_definitions.add_or_update(
+                        validation_definition
+                    )
+
+                    # create and start checking data with checkpoints
+                    checkpoint = context.checkpoints.add_or_update(
+                        gx.checkpoint.checkpoint.Checkpoint(
+                            name=f"{table}_checkpoint",
+                            validation_definitions=[validation_definition],
+                            result_format={
+                                "result_format": "COMPLETE",
+                                "return_unexpected_index_query": False,
+                                "partial_unexpected_count": 0,
+                            },
                         )
+                    )
+
+                    checkpoint_result = checkpoint.run(batch_parameters=batch_parameters)
+
+                    # Logic to handle results
+                    results_dict = list(checkpoint_result.run_results.values())[0].to_json_dict()
+                    table_results_df = pd.json_normalize(results_dict["results"])
+
+                    cols_not_needed = ["result.unexpected_list", "result.observed_value"]
+                    cols_to_drop = [
+                        c
+                        for c in table_results_df.columns
+                        if c.startswith("exception_info") or c in cols_not_needed
+                    ]
+
+                    table_results_df = table_results_df.drop(columns=cols_to_drop)
+                    table_results_df_list.append(table_results_df)
 
-                # drop columns not needed in metatdata
-                cols_to_drop_meta = [
-                    "notes",
-                    "result_format",
-                    "catch_exceptions",
-                    "rendered_content",
-                    "windows",
-                ]
-
-                suite_df = pd.DataFrame()
-                for i in expectations:
-                    temp_i = i
-                    temp_df = pd.json_normalize(dict(temp_i))
-                    temp_df["expectation_type"] = temp_i.expectation_type
-                    temp_df["dataset_name"] = table
-                    temp_df = temp_df.drop(columns=cols_to_drop_meta)
-                    suite_df = pd.concat([suite_df, temp_df])
-
-                df_all_suite_list.append(suite_df)
+                    # generate id lists for each unexpected result set
+                    query_df = table_results_df.loc[
+                        (~table_results_df["result.unexpected_index_list"].isna())
+                        & (table_results_df["result.unexpected_index_list"].values != "[]")
+                    ]
+
+                    table_results_df["unexpected_id_list"] = pd.Series(dtype="object")
+                    for i, row in query_df.iterrows():
+                        try:
+                            # check this
+                            list(df[id_field].iloc[row["result.unexpected_index_list"]])
+                        except Exception as e:
+                            logger.warning(
+                                f"Problem mapping IDs for {table}: {e}. Proceeding without ID list."
+                            )
+                            continue
+                        else:
+                            table_results_df.loc[i, "unexpected_id_list"] = str(
+                                list(df[id_field].iloc[row["result.unexpected_index_list"]])
+                            )
+
+                    # drop columns not needed in metadata
+                    cols_to_drop_meta = [
+                        "notes",
+                        "result_format",
+                        "catch_exceptions",
+                        "rendered_content",
+                        "windows",
+                    ]
+
+                    suite_df = pd.DataFrame()
+                    for i in expectations:
+                        temp_i = i
+                        temp_df = pd.json_normalize(dict(temp_i))
+                        temp_df["expectation_type"] = temp_i.expectation_type
+                        temp_df["dataset_name"] = table
+                        temp_df["expectation_id_full"] = temp_i.expectation_type + '_' + table
+                        temp_df = temp_df.drop(columns=cols_to_drop_meta, errors='ignore') # errors='ignore' is safer
+                        suite_df = pd.concat([suite_df, temp_df])
+
+                    df_all_suite_list.append(suite_df)
+
+            except Exception as e:
+                logger.error(f"CRITICAL ERROR processing table '{table}': {str(e)}")
+                logger.error("Skipping this table and moving to the next.")
+                continue
+
+    if not table_results_df_list:
+        logger.error("No tables were processed successfully. Exiting.")
+        return
 
     results_df = pd.concat(table_results_df_list)
     metadata_df = pd.concat(df_all_suite_list)
 
     # add expectation_id
-    metadata_df["expectation_id"] = (
-        metadata_df["expectation_type"] + "_" + metadata_df["dataset_name"]
-    )
+    # metadata_df["expectation_id"] = (
+    # metadata_df["expectation_type"] + "_" + metadata_df["dataset_name"]
+    # )
     metadata_df["import_date"] = datetime.today().strftime("%Y%m%d")
 
     # set dtypes for Athena with default of string
@@ -199,10 +213,10 @@ def main():
         value=results_df.set_index(
             ["expectation_config.type", "dataset_name"]
         ).index.factorize()[0]
-        + 1,
+        + 1,
     )
     results_df["expectation_id"] = (
-        results_df["expectation_config.type"] + "_" + results_df["dataset_name"]
+        results_df["expectation_config.type"] + "_" + results_df["dataset_name"]
     )
     results_df["import_date"] = datetime.today().strftime("%Y%m%d")
 
@@ -216,6 +230,7 @@ def main():
         "result.element_count": "bigint",
         "result.unexpected_count": "bigint",
         "result.missing_count": "bigint",
+        "result.details_mismatched": 'string',
         "result.partial_unexpected_list": "array<string>",
         "result.unexpected_index_list": "array<bigint>",
         "result.unexpected_index_query": "string",
@@ -225,9 +240,6 @@ def main():
         "import_date": "string",
     }
 
-    # TODO for df_vars in [[results_df, dtype_dict_results, target_table], [metadata_df, dtype_dict_metadata, target_metadata_table]]:
-    # will loop the writing of these tables
-
     wr.s3.to_parquet(
         df=results_df,
         path=s3_target_location_results,
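One detail worth flagging from the hunks above: the metadata clean-up now calls temp_df.drop(columns=cols_to_drop_meta, errors='ignore'), which the in-line comment describes as safer. With errors='ignore', pandas skips any listed label that is not actually present instead of raising a KeyError (useful if not every expectation carries every one of those metadata fields). A small standalone illustration with toy data, unrelated to the migration tables:

    import pandas as pd

    df = pd.DataFrame({"notes": ["x"], "column": ["LADR_UPRN"]})

    # Default behaviour: dropping a label that does not exist raises KeyError.
    try:
        df.drop(columns=["notes", "windows"])
    except KeyError as err:
        print("raises:", err)

    # With errors="ignore", missing labels are skipped and existing ones are dropped.
    print(df.drop(columns=["notes", "windows"], errors="ignore"))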

scripts/jobs/housing/housing_nec_migration_arrears_actions_data_load_gx_suite.py

Lines changed: 9 additions & 9 deletions
@@ -7,19 +7,19 @@
 import great_expectations.expectations as gxe
 
 
-class ExpectPayRefColumnValuesToNotBeNull(gxe.ExpectColumnValuesToNotBeNull):
+class ArrearsActionsExpectPayRefColumnValuesToNotBeNull(gxe.ExpectColumnValuesToNotBeNull):
     column: str = "LACA_PAY_REF"
     description: str = (
         "Expect LACA_PAY_REF (pay ref) values to not be Null in contacts load"
     )
 
 
-class ExpectValueColumnValuesToNotBeNull(gxe.ExpectColumnValuesToNotBeNull):
+class ArrearsActionsExpectCreatedDateToNotBeNull(gxe.ExpectColumnValuesToNotBeNull):
     column: str = "LACA_CREATED_DATE"
-    description: str = "Expect LCDE_CONTACT_VALUE (contact value) to not be Null"
+    description: str = "Expect LACA_CREATED_DATE to not be Null"
 
 
-class ExpectArrearCodeToBeInSet(gxe.ExpectColumnValuesToBeInSet):
+class ArrearsActionsExpectArrearCodeToBeInSet(gxe.ExpectColumnValuesToBeInSet):
     column: str = "LACA_ARA_CODE"
     value_set: list = [
         "BAGF",
@@ -104,7 +104,7 @@ class ExpectArrearCodeToBeInSet(gxe.ExpectColumnValuesToBeInSet):
     description: str = "Expect arrear code to be one of the set"
 
 
-class ExpectArrearsActionsColumnsToMatchOrderedList(gxe.ExpectTableColumnsToMatchOrderedList):
+class ArrearsActionsExpectArrearsActionsColumnsToMatchOrderedList(gxe.ExpectTableColumnsToMatchOrderedList):
     column_list = [
         "LACA_BALANCE",
         "LACA_PAY_REF",
@@ -136,8 +136,8 @@ class ExpectArrearsActionsColumnsToMatchOrderedList(gxe.ExpectTableColumnsToMatc
 
 suite = gx.ExpectationSuite(name="arrears_actions_data_load_suite")
 
-suite.add_expectation(ExpectArrearsActionsColumnsToMatchOrderedList())
-suite.add_expectation(ExpectArrearCodeToBeInSet())
-suite.add_expectation(ExpectPayRefColumnValuesToNotBeNull())
-suite.add_expectation(ExpectValueColumnValuesToNotBeNull())
+suite.add_expectation(ArrearsActionsExpectArrearsActionsColumnsToMatchOrderedList())
+suite.add_expectation(ArrearsActionsExpectArrearCodeToBeInSet())
+suite.add_expectation(ArrearsActionsExpectPayRefColumnValuesToNotBeNull())
+suite.add_expectation(ArrearsActionsExpectCreatedDateToNotBeNull())
 suite = context.suites.add(suite)
